From 70b50f94f1644e2aa7cb374819cfd93f3c28d725 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Wed, 2 Nov 2011 13:36:59 -0700
Subject: mm: thp: tail page refcounting fix

Michel while working on the working set estimation code, noticed that
calling get_page_unless_zero() on a random pfn_to_page(random_pfn)
wasn't safe, if the pfn ended up being a tail page of a transparent
hugepage under splitting by __split_huge_page_refcount().

He then found the problem could also theoretically materialize with
page_cache_get_speculative() during the speculative radix tree lookups
that uses get_page_unless_zero() in SMP if the radix tree page is freed
and reallocated and get_user_pages is called on it before
page_cache_get_speculative has a chance to call get_page_unless_zero().

So the best way to fix the problem is to keep page_tail->_count zero at
all times.  This will guarantee that get_page_unless_zero() can never
succeed on any tail page.  page_tail->_mapcount is guaranteed zero and
is unused for all tail pages of a compound page, so we can simply
account the tail page references there and transfer them to
tail_page->_count in __split_huge_page_refcount() (in addition to the
head_page->_mapcount).

While debugging this s/_count/_mapcount/ change I also noticed get_page is
called by direct-io.c on pages returned by get_user_pages.  That wasn't
entirely safe because the two atomic_inc in get_page weren't atomic.  As
opposed to other get_user_page users like secondary-MMU page fault to
establish the shadow pagetables would never call any superflous get_page
after get_user_page returns.  It's safer to make get_page universally safe
for tail pages and to use get_page_foll() within follow_page (inside
get_user_pages()).  get_page_foll() is safe to do the refcounting for tail
pages without taking any locks because it is run within PT lock protected
critical sections (PT lock for pte and page_table_lock for
pmd_trans_huge).

The standard get_page() as invoked by direct-io instead will now take
the compound_lock but still only for tail pages.  The direct-io paths
are usually I/O bound and the compound_lock is per THP so very
finegrined, so there's no risk of scalability issues with it.  A simple
direct-io benchmarks with all lockdep prove locking and spinlock
debugging infrastructure enabled shows identical performance and no
overhead.  So it's worth it.  Ideally direct-io should stop calling
get_page() on pages returned by get_user_pages().  The spinlock in
get_page() is already optimized away for no-THP builds but doing
get_page() on tail pages returned by GUP is generally a rare operation
and usually only run in I/O paths.

This new refcounting on page_tail->_mapcount in addition to avoiding new
RCU critical sections will also allow the working set estimation code to
work without any further complexity associated to the tail page
refcounting with THP.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Reported-by: Michel Lespinasse <walken@google.com>
Reviewed-by: Michel Lespinasse <walken@google.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: <stable@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index fec1320..b9e1c7f 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -22,8 +22,9 @@ static inline void get_huge_page_tail(struct page *page)
 	 * __split_huge_page_refcount() cannot run
 	 * from under us.
 	 */
-	VM_BUG_ON(atomic_read(&page->_count) < 0);
-	atomic_inc(&page->_count);
+	VM_BUG_ON(page_mapcount(page) < 0);
+	VM_BUG_ON(atomic_read(&page->_count) != 0);
+	atomic_inc(&page->_mapcount);
 }
 
 /*
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index dbe34b9..3b5032a 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -114,8 +114,9 @@ static inline void get_huge_page_tail(struct page *page)
 	 * __split_huge_page_refcount() cannot run
 	 * from under us.
 	 */
-	VM_BUG_ON(atomic_read(&page->_count) < 0);
-	atomic_inc(&page->_count);
+	VM_BUG_ON(page_mapcount(page) < 0);
+	VM_BUG_ON(atomic_read(&page->_count) != 0);
+	atomic_inc(&page->_mapcount);
 }
 
 static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3b3e3b8..f81b7b4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -356,36 +356,39 @@ static inline struct page *compound_head(struct page *page)
 	return page;
 }
 
+/*
+ * The atomic page->_mapcount, starts from -1: so that transitions
+ * both from it and to it can be tracked, using atomic_inc_and_test
+ * and atomic_add_negative(-1).
+ */
+static inline void reset_page_mapcount(struct page *page)
+{
+	atomic_set(&(page)->_mapcount, -1);
+}
+
+static inline int page_mapcount(struct page *page)
+{
+	return atomic_read(&(page)->_mapcount) + 1;
+}
+
 static inline int page_count(struct page *page)
 {
 	return atomic_read(&compound_head(page)->_count);
 }
 
+extern bool __get_page_tail(struct page *page);
+
 static inline void get_page(struct page *page)
 {
+	if (unlikely(PageTail(page)))
+		if (likely(__get_page_tail(page)))
+			return;
 	/*
 	 * Getting a normal page or the head of a compound page
-	 * requires to already have an elevated page->_count. Only if
-	 * we're getting a tail page, the elevated page->_count is
-	 * required only in the head page, so for tail pages the
-	 * bugcheck only verifies that the page->_count isn't
-	 * negative.
+	 * requires to already have an elevated page->_count.
 	 */
-	VM_BUG_ON(atomic_read(&page->_count) < !PageTail(page));
+	VM_BUG_ON(atomic_read(&page->_count) <= 0);
 	atomic_inc(&page->_count);
-	/*
-	 * Getting a tail page will elevate both the head and tail
-	 * page->_count(s).
-	 */
-	if (unlikely(PageTail(page))) {
-		/*
-		 * This is safe only because
-		 * __split_huge_page_refcount can't run under
-		 * get_page().
-		 */
-		VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
-		atomic_inc(&page->first_page->_count);
-	}
 }
 
 static inline struct page *virt_to_head_page(const void *x)
@@ -804,21 +807,6 @@ static inline pgoff_t page_index(struct page *page)
 }
 
 /*
- * The atomic page->_mapcount, like _count, starts from -1:
- * so that transitions both from it and to it can be tracked,
- * using atomic_inc_and_test and atomic_add_negative(-1).
- */
-static inline void reset_page_mapcount(struct page *page)
-{
-	atomic_set(&(page)->_mapcount, -1);
-}
-
-static inline int page_mapcount(struct page *page)
-{
-	return atomic_read(&(page)->_mapcount) + 1;
-}
-
-/*
  * Return true if this page is mapped into pagetables.
  */
 static inline int page_mapped(struct page *page)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3e01a19..5b42f1b 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -62,10 +62,23 @@ struct page {
 			struct {
 
 				union {
-					atomic_t _mapcount;	/* Count of ptes mapped in mms,
-							 * to show when page is mapped
-							 * & limit reverse map searches.
-							 */
+					/*
+					 * Count of ptes mapped in
+					 * mms, to show when page is
+					 * mapped & limit reverse map
+					 * searches.
+					 *
+					 * Used also for tail pages
+					 * refcounting instead of
+					 * _count. Tail pages cannot
+					 * be mapped and keeping the
+					 * tail page _count zero at
+					 * all times guarantees
+					 * get_page_unless_zero() will
+					 * never succeed on tail
+					 * pages.
+					 */
+					atomic_t _mapcount;
 
 					struct {
 						unsigned inuse:16;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 860ec21..4298aba 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -990,7 +990,7 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,
 	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
 	VM_BUG_ON(!PageCompound(page));
 	if (flags & FOLL_GET)
-		get_page(page);
+		get_page_foll(page);
 
 out:
 	return page;
@@ -1202,6 +1202,7 @@ static void __split_huge_page_refcount(struct page *page)
 	unsigned long head_index = page->index;
 	struct zone *zone = page_zone(page);
 	int zonestat;
+	int tail_count = 0;
 
 	/* prevent PageLRU to go away from under us, and freeze lru stats */
 	spin_lock_irq(&zone->lru_lock);
@@ -1210,11 +1211,27 @@ static void __split_huge_page_refcount(struct page *page)
 	for (i = 1; i < HPAGE_PMD_NR; i++) {
 		struct page *page_tail = page + i;
 
-		/* tail_page->_count cannot change */
-		atomic_sub(atomic_read(&page_tail->_count), &page->_count);
-		BUG_ON(page_count(page) <= 0);
-		atomic_add(page_mapcount(page) + 1, &page_tail->_count);
-		BUG_ON(atomic_read(&page_tail->_count) <= 0);
+		/* tail_page->_mapcount cannot change */
+		BUG_ON(page_mapcount(page_tail) < 0);
+		tail_count += page_mapcount(page_tail);
+		/* check for overflow */
+		BUG_ON(tail_count < 0);
+		BUG_ON(atomic_read(&page_tail->_count) != 0);
+		/*
+		 * tail_page->_count is zero and not changing from
+		 * under us. But get_page_unless_zero() may be running
+		 * from under us on the tail_page. If we used
+		 * atomic_set() below instead of atomic_add(), we
+		 * would then run atomic_set() concurrently with
+		 * get_page_unless_zero(), and atomic_set() is
+		 * implemented in C not using locked ops. spin_unlock
+		 * on x86 sometime uses locked ops because of PPro
+		 * errata 66, 92, so unless somebody can guarantee
+		 * atomic_set() here would be safe on all archs (and
+		 * not only on x86), it's safer to use atomic_add().
+		 */
+		atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
+			   &page_tail->_count);
 
 		/* after clearing PageTail the gup refcount can be released */
 		smp_mb();
@@ -1232,10 +1249,7 @@ static void __split_huge_page_refcount(struct page *page)
 				      (1L << PG_uptodate)));
 		page_tail->flags |= (1L << PG_dirty);
 
-		/*
-		 * 1) clear PageTail before overwriting first_page
-		 * 2) clear PageTail before clearing PageHead for VM_BUG_ON
-		 */
+		/* clear PageTail before overwriting first_page */
 		smp_wmb();
 
 		/*
@@ -1252,7 +1266,6 @@ static void __split_huge_page_refcount(struct page *page)
 		 * status is achieved setting a reserved bit in the
 		 * pmd, not by clearing the present bit.
 		*/
-		BUG_ON(page_mapcount(page_tail));
 		page_tail->_mapcount = page->_mapcount;
 
 		BUG_ON(page_tail->mapping);
@@ -1269,6 +1282,8 @@ static void __split_huge_page_refcount(struct page *page)
 
 		lru_add_page_tail(zone, page, page_tail);
 	}
+	atomic_sub(tail_count, &page->_count);
+	BUG_ON(atomic_read(&page->_count) <= 0);
 
 	__dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
 	__mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
diff --git a/mm/internal.h b/mm/internal.h
index d071d38..2189af4 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -37,6 +37,52 @@ static inline void __put_page(struct page *page)
 	atomic_dec(&page->_count);
 }
 
+static inline void __get_page_tail_foll(struct page *page,
+					bool get_page_head)
+{
+	/*
+	 * If we're getting a tail page, the elevated page->_count is
+	 * required only in the head page and we will elevate the head
+	 * page->_count and tail page->_mapcount.
+	 *
+	 * We elevate page_tail->_mapcount for tail pages to force
+	 * page_tail->_count to be zero at all times to avoid getting
+	 * false positives from get_page_unless_zero() with
+	 * speculative page access (like in
+	 * page_cache_get_speculative()) on tail pages.
+	 */
+	VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
+	VM_BUG_ON(atomic_read(&page->_count) != 0);
+	VM_BUG_ON(page_mapcount(page) < 0);
+	if (get_page_head)
+		atomic_inc(&page->first_page->_count);
+	atomic_inc(&page->_mapcount);
+}
+
+/*
+ * This is meant to be called as the FOLL_GET operation of
+ * follow_page() and it must be called while holding the proper PT
+ * lock while the pte (or pmd_trans_huge) is still mapping the page.
+ */
+static inline void get_page_foll(struct page *page)
+{
+	if (unlikely(PageTail(page)))
+		/*
+		 * This is safe only because
+		 * __split_huge_page_refcount() can't run under
+		 * get_page_foll() because we hold the proper PT lock.
+		 */
+		__get_page_tail_foll(page, true);
+	else {
+		/*
+		 * Getting a normal page or the head of a compound page
+		 * requires to already have an elevated page->_count.
+		 */
+		VM_BUG_ON(atomic_read(&page->_count) <= 0);
+		atomic_inc(&page->_count);
+	}
+}
+
 extern unsigned long highest_memmap_pfn;
 
 /*
diff --git a/mm/memory.c b/mm/memory.c
index a56e3ba..b2b8731 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1503,7 +1503,7 @@ split_fallthrough:
 	}
 
 	if (flags & FOLL_GET)
-		get_page(page);
+		get_page_foll(page);
 	if (flags & FOLL_TOUCH) {
 		if ((flags & FOLL_WRITE) &&
 		    !pte_dirty(pte) && !PageDirty(page))
diff --git a/mm/swap.c b/mm/swap.c
index 3a442f1..87627f1 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -78,39 +78,22 @@ static void put_compound_page(struct page *page)
 {
 	if (unlikely(PageTail(page))) {
 		/* __split_huge_page_refcount can run under us */
-		struct page *page_head = page->first_page;
-		smp_rmb();
-		/*
-		 * If PageTail is still set after smp_rmb() we can be sure
-		 * that the page->first_page we read wasn't a dangling pointer.
-		 * See __split_huge_page_refcount() smp_wmb().
-		 */
-		if (likely(PageTail(page) && get_page_unless_zero(page_head))) {
+		struct page *page_head = compound_trans_head(page);
+
+		if (likely(page != page_head &&
+			   get_page_unless_zero(page_head))) {
 			unsigned long flags;
 			/*
-			 * Verify that our page_head wasn't converted
-			 * to a a regular page before we got a
-			 * reference on it.
+			 * page_head wasn't a dangling pointer but it
+			 * may not be a head page anymore by the time
+			 * we obtain the lock. That is ok as long as it
+			 * can't be freed from under us.
 			 */
-			if (unlikely(!PageHead(page_head))) {
-				/* PageHead is cleared after PageTail */
-				smp_rmb();
-				VM_BUG_ON(PageTail(page));
-				goto out_put_head;
-			}
-			/*
-			 * Only run compound_lock on a valid PageHead,
-			 * after having it pinned with
-			 * get_page_unless_zero() above.
-			 */
-			smp_mb();
-			/* page_head wasn't a dangling pointer */
 			flags = compound_lock_irqsave(page_head);
 			if (unlikely(!PageTail(page))) {
 				/* __split_huge_page_refcount run before us */
 				compound_unlock_irqrestore(page_head, flags);
 				VM_BUG_ON(PageHead(page_head));
-			out_put_head:
 				if (put_page_testzero(page_head))
 					__put_single_page(page_head);
 			out_put_single:
@@ -121,16 +104,17 @@ static void put_compound_page(struct page *page)
 			VM_BUG_ON(page_head != page->first_page);
 			/*
 			 * We can release the refcount taken by
-			 * get_page_unless_zero now that
-			 * split_huge_page_refcount is blocked on the
-			 * compound_lock.
+			 * get_page_unless_zero() now that
+			 * __split_huge_page_refcount() is blocked on
+			 * the compound_lock.
 			 */
 			if (put_page_testzero(page_head))
 				VM_BUG_ON(1);
 			/* __split_huge_page_refcount will wait now */
-			VM_BUG_ON(atomic_read(&page->_count) <= 0);
-			atomic_dec(&page->_count);
+			VM_BUG_ON(page_mapcount(page) <= 0);
+			atomic_dec(&page->_mapcount);
 			VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
+			VM_BUG_ON(atomic_read(&page->_count) != 0);
 			compound_unlock_irqrestore(page_head, flags);
 			if (put_page_testzero(page_head)) {
 				if (PageHead(page_head))
@@ -160,6 +144,45 @@ void put_page(struct page *page)
 }
 EXPORT_SYMBOL(put_page);
 
+/*
+ * This function is exported but must not be called by anything other
+ * than get_page(). It implements the slow path of get_page().
+ */
+bool __get_page_tail(struct page *page)
+{
+	/*
+	 * This takes care of get_page() if run on a tail page
+	 * returned by one of the get_user_pages/follow_page variants.
+	 * get_user_pages/follow_page itself doesn't need the compound
+	 * lock because it runs __get_page_tail_foll() under the
+	 * proper PT lock that already serializes against
+	 * split_huge_page().
+	 */
+	unsigned long flags;
+	bool got = false;
+	struct page *page_head = compound_trans_head(page);
+
+	if (likely(page != page_head && get_page_unless_zero(page_head))) {
+		/*
+		 * page_head wasn't a dangling pointer but it
+		 * may not be a head page anymore by the time
+		 * we obtain the lock. That is ok as long as it
+		 * can't be freed from under us.
+		 */
+		flags = compound_lock_irqsave(page_head);
+		/* here __split_huge_page_refcount won't run anymore */
+		if (likely(PageTail(page))) {
+			__get_page_tail_foll(page, false);
+			got = true;
+		}
+		compound_unlock_irqrestore(page_head, flags);
+		if (unlikely(!got))
+			put_page(page_head);
+	}
+	return got;
+}
+EXPORT_SYMBOL(__get_page_tail);
+
 /**
  * put_pages_list() - release a list of pages
  * @pages: list of pages threaded on page->lru
-- 
cgit v0.10.2


From 2839bdc1bfc0af76a2f0f11eca011590520a04fa Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Wed, 2 Nov 2011 13:37:03 -0700
Subject: powerpc: remove superfluous PageTail checks on the pte gup_fast

This part of gup_fast doesn't seem capable of handling hugetlbfs ptes,
those should be handled by gup_hugepd only, so these checks are
superfluous.

Plus if this wasn't a noop, it would have oopsed because, the insistence
of using the speculative refcounting would trigger a VM_BUG_ON if a tail
page was encountered in the page_cache_get_speculative().

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index b9e1c7f..d7efdbf 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -16,17 +16,6 @@
 
 #ifdef __HAVE_ARCH_PTE_SPECIAL
 
-static inline void get_huge_page_tail(struct page *page)
-{
-	/*
-	 * __split_huge_page_refcount() cannot run
-	 * from under us.
-	 */
-	VM_BUG_ON(page_mapcount(page) < 0);
-	VM_BUG_ON(atomic_read(&page->_count) != 0);
-	atomic_inc(&page->_mapcount);
-}
-
 /*
  * The performance critical leaf functions are made noinline otherwise gcc
  * inlines everything into a single function which results in too much
@@ -58,8 +47,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
 			put_page(page);
 			return 0;
 		}
-		if (PageTail(page))
-			get_huge_page_tail(page);
 		pages[*nr] = page;
 		(*nr)++;
 
-- 
cgit v0.10.2


From 405e44f2e312dd5dd63e5a9f459bffcbcd4368ef Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Wed, 2 Nov 2011 13:37:08 -0700
Subject: powerpc: get_hugepte() don't put_page() the wrong page

"page" may have changed to point to the next hugepage after the loop
completed, The references have been taken on the head page, so the
put_page must happen there too.

This is a longstanding issue pre-thp inclusion.

It's totally unclear how these page_cache_add_speculative and
pte_val(pte) != pte_val(*ptep) checks are necessary across all the
powerpc gup_fast code, when x86 doesn't need any of that: there's no way
the page can be freed with irq disabled so we're guaranteed the
atomic_inc will happen on a page with page_count > 0 (so not needing the
speculative check).

The pte check is also meaningless on x86: no need to rollback on x86 if
the pte changed, because the pte can still change a CPU tick after the
check succeeded and it won't be rolled back in that case.  The important
thing is we got a reference on a valid page that was mapped there a CPU
tick ago.  So not knowing the soft tlb refill code of ppc64 in great
detail I'm not removing the "speculative" page_count increase and the
pte checks across all the code, but unless there's a strong reason for
it they should be later cleaned up too.

If a pte can change from huge to non-huge (like it could happen with
THP) passing a pte_t *ptep to gup_hugepte() would also require to repeat
the is_hugepd in gup_hugepte(), but that shouldn't happen with hugetlbfs
only so I'm not altering that.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 0b9a5c1..b649c28 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -429,7 +429,7 @@ static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long add
 	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
 		/* Could be optimized better */
 		while (*nr) {
-			put_page(page);
+			put_page(head);
 			(*nr)--;
 		}
 	}
-- 
cgit v0.10.2


From 8596468487e2062cae2aad56e973784e03959245 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Wed, 2 Nov 2011 13:37:11 -0700
Subject: powerpc: gup_hugepte() avoid freeing the head page too many times

We only taken "refs" pins on the head page not "*nr" pins.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index b649c28..78b14ab 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -428,10 +428,9 @@ static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long add
 
 	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
 		/* Could be optimized better */
-		while (*nr) {
+		*nr -= refs;
+		while (refs--)
 			put_page(head);
-			(*nr)--;
-		}
 	}
 
 	return 1;
-- 
cgit v0.10.2


From 3526741f0964c88bc2ce511e1078359052bf225b Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Wed, 2 Nov 2011 13:37:15 -0700
Subject: powerpc: gup_hugepte() support THP based tail recounting

Up to this point the code assumed old refcounting for hugepages (pre-thp).
This updates the code directly to the thp mapcount tail page refcounting.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 78b14ab..a618ef0 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -385,12 +385,23 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 	return NULL;
 }
 
+static inline void get_huge_page_tail(struct page *page)
+{
+	/*
+	 * __split_huge_page_refcount() cannot run
+	 * from under us.
+	 */
+	VM_BUG_ON(page_mapcount(page) < 0);
+	VM_BUG_ON(atomic_read(&page->_count) != 0);
+	atomic_inc(&page->_mapcount);
+}
+
 static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 		       unsigned long end, int write, struct page **pages, int *nr)
 {
 	unsigned long mask;
 	unsigned long pte_end;
-	struct page *head, *page;
+	struct page *head, *page, *tail;
 	pte_t pte;
 	int refs;
 
@@ -413,6 +424,7 @@ static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long add
 	head = pte_page(pte);
 
 	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
+	tail = page;
 	do {
 		VM_BUG_ON(compound_head(page) != head);
 		pages[*nr] = page;
@@ -431,6 +443,16 @@ static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long add
 		*nr -= refs;
 		while (refs--)
 			put_page(head);
+	} else {
+		/*
+		 * Any tail page need their mapcount reference taken
+		 * before we return.
+		 */
+		while (refs--) {
+			if (PageTail(tail))
+				get_huge_page_tail(tail);
+			tail++;
+		}
 	}
 
 	return 1;
-- 
cgit v0.10.2


From cf592bf768c4fa40282b8fce58a80820065de2cb Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Wed, 2 Nov 2011 13:37:19 -0700
Subject: powerpc: gup_huge_pmd() return 0 if pte changes

powerpc didn't return 0 in that case, if it's rolling back the *nr pointer
it should also return zero to avoid adding pages to the array at the wrong
offset.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index a618ef0..1c59d94 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -443,16 +443,17 @@ static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long add
 		*nr -= refs;
 		while (refs--)
 			put_page(head);
-	} else {
-		/*
-		 * Any tail page need their mapcount reference taken
-		 * before we return.
-		 */
-		while (refs--) {
-			if (PageTail(tail))
-				get_huge_page_tail(tail);
-			tail++;
-		}
+		return 0;
+	}
+
+	/*
+	 * Any tail page need their mapcount reference taken before we
+	 * return.
+	 */
+	while (refs--) {
+		if (PageTail(tail))
+			get_huge_page_tail(tail);
+		tail++;
 	}
 
 	return 1;
-- 
cgit v0.10.2


From 220a2eb228d032acde60e9fd044ca802706ff583 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Wed, 2 Nov 2011 13:37:25 -0700
Subject: s390: gup_huge_pmd() support THP tail recounting

Up to this point the code assumed old refcounting for hugepages (pre-thp).
This updates the code directly to the thp mapcount tail page refcounting.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c
index 45b405c..668dda9 100644
--- a/arch/s390/mm/gup.c
+++ b/arch/s390/mm/gup.c
@@ -48,11 +48,22 @@ static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
 	return 1;
 }
 
+static inline void get_huge_page_tail(struct page *page)
+{
+	/*
+	 * __split_huge_page_refcount() cannot run
+	 * from under us.
+	 */
+	VM_BUG_ON(page_mapcount(page) < 0);
+	VM_BUG_ON(atomic_read(&page->_count) != 0);
+	atomic_inc(&page->_mapcount);
+}
+
 static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
 		unsigned long end, int write, struct page **pages, int *nr)
 {
 	unsigned long mask, result;
-	struct page *head, *page;
+	struct page *head, *page, *tail;
 	int refs;
 
 	result = write ? 0 : _SEGMENT_ENTRY_RO;
@@ -64,6 +75,7 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
 	refs = 0;
 	head = pmd_page(pmd);
 	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+	tail = page;
 	do {
 		VM_BUG_ON(compound_head(page) != head);
 		pages[*nr] = page;
@@ -81,6 +93,16 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
 		*nr -= refs;
 		while (refs--)
 			put_page(head);
+	} else {
+		/*
+		 * Any tail page need their mapcount reference taken
+		 * before we return.
+		 */
+		while (refs--) {
+			if (PageTail(tail))
+				get_huge_page_tail(tail);
+			tail++;
+		}
 	}
 
 	return 1;
-- 
cgit v0.10.2


From 0693bc9ce2cc4f6a1b9c3c05790fc149a74c0b87 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Wed, 2 Nov 2011 13:37:28 -0700
Subject: s390: gup_huge_pmd() return 0 if pte changes

s390 didn't return 0 in that case, if it's rolling back the *nr pointer it
should also return zero to avoid adding pages to the array at the wrong
offset.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c
index 668dda9..da33a02 100644
--- a/arch/s390/mm/gup.c
+++ b/arch/s390/mm/gup.c
@@ -93,16 +93,17 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
 		*nr -= refs;
 		while (refs--)
 			put_page(head);
-	} else {
-		/*
-		 * Any tail page need their mapcount reference taken
-		 * before we return.
-		 */
-		while (refs--) {
-			if (PageTail(tail))
-				get_huge_page_tail(tail);
-			tail++;
-		}
+		return 0;
+	}
+
+	/*
+	 * Any tail page need their mapcount reference taken before we
+	 * return.
+	 */
+	while (refs--) {
+		if (PageTail(tail))
+			get_huge_page_tail(tail);
+		tail++;
 	}
 
 	return 1;
-- 
cgit v0.10.2


From e0d85a366c2300efd230ef82a9b22110b0658331 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Wed, 2 Nov 2011 13:37:31 -0700
Subject: sparc: gup_pte_range() support THP based tail recounting

Up to this point the code assumed old refcounting for hugepages (pre-thp).
 This updates the code directly to the thp mapcount tail page refcounting.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: David Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c
index a986b5d..afcebac 100644
--- a/arch/sparc/mm/gup.c
+++ b/arch/sparc/mm/gup.c
@@ -12,6 +12,17 @@
 #include <linux/rwsem.h>
 #include <asm/pgtable.h>
 
+static inline void get_huge_page_tail(struct page *page)
+{
+	/*
+	 * __split_huge_page_refcount() cannot run
+	 * from under us.
+	 */
+	VM_BUG_ON(page_mapcount(page) < 0);
+	VM_BUG_ON(atomic_read(&page->_count) != 0);
+	atomic_inc(&page->_mapcount);
+}
+
 /*
  * The performance critical leaf functions are made noinline otherwise gcc
  * inlines everything into a single function which results in too much
@@ -56,6 +67,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
 			put_page(head);
 			return 0;
 		}
+		if (head != page)
+			get_huge_page_tail(page);
 
 		pages[*nr] = page;
 		(*nr)++;
-- 
cgit v0.10.2


From b35a35b556f5e6b7993ad0baf20173e75c09ce8c Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Wed, 2 Nov 2011 13:37:36 -0700
Subject: thp: share get_huge_page_tail()

This avoids duplicating the function in every arch gup_fast.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 1c59d94..da5eb38 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -385,17 +385,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 	return NULL;
 }
 
-static inline void get_huge_page_tail(struct page *page)
-{
-	/*
-	 * __split_huge_page_refcount() cannot run
-	 * from under us.
-	 */
-	VM_BUG_ON(page_mapcount(page) < 0);
-	VM_BUG_ON(atomic_read(&page->_count) != 0);
-	atomic_inc(&page->_mapcount);
-}
-
 static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 		       unsigned long end, int write, struct page **pages, int *nr)
 {
diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c
index da33a02..65cb06e 100644
--- a/arch/s390/mm/gup.c
+++ b/arch/s390/mm/gup.c
@@ -48,17 +48,6 @@ static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
 	return 1;
 }
 
-static inline void get_huge_page_tail(struct page *page)
-{
-	/*
-	 * __split_huge_page_refcount() cannot run
-	 * from under us.
-	 */
-	VM_BUG_ON(page_mapcount(page) < 0);
-	VM_BUG_ON(atomic_read(&page->_count) != 0);
-	atomic_inc(&page->_mapcount);
-}
-
 static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
 		unsigned long end, int write, struct page **pages, int *nr)
 {
diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c
index afcebac..42c55df 100644
--- a/arch/sparc/mm/gup.c
+++ b/arch/sparc/mm/gup.c
@@ -12,17 +12,6 @@
 #include <linux/rwsem.h>
 #include <asm/pgtable.h>
 
-static inline void get_huge_page_tail(struct page *page)
-{
-	/*
-	 * __split_huge_page_refcount() cannot run
-	 * from under us.
-	 */
-	VM_BUG_ON(page_mapcount(page) < 0);
-	VM_BUG_ON(atomic_read(&page->_count) != 0);
-	atomic_inc(&page->_mapcount);
-}
-
 /*
  * The performance critical leaf functions are made noinline otherwise gcc
  * inlines everything into a single function which results in too much
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 3b5032a..ea30585 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -108,17 +108,6 @@ static inline void get_head_page_multiple(struct page *page, int nr)
 	SetPageReferenced(page);
 }
 
-static inline void get_huge_page_tail(struct page *page)
-{
-	/*
-	 * __split_huge_page_refcount() cannot run
-	 * from under us.
-	 */
-	VM_BUG_ON(page_mapcount(page) < 0);
-	VM_BUG_ON(atomic_read(&page->_count) != 0);
-	atomic_inc(&page->_mapcount);
-}
-
 static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
 		unsigned long end, int write, struct page **pages, int *nr)
 {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f81b7b4..3dc3a8c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -376,6 +376,17 @@ static inline int page_count(struct page *page)
 	return atomic_read(&compound_head(page)->_count);
 }
 
+static inline void get_huge_page_tail(struct page *page)
+{
+	/*
+	 * __split_huge_page_refcount() cannot run
+	 * from under us.
+	 */
+	VM_BUG_ON(page_mapcount(page) < 0);
+	VM_BUG_ON(atomic_read(&page->_count) != 0);
+	atomic_inc(&page->_mapcount);
+}
+
 extern bool __get_page_tail(struct page *page);
 
 static inline void get_page(struct page *page)
-- 
cgit v0.10.2


From a3defbe5c337dbc6da911f8cc49ae3cc3b49b453 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Wed, 2 Nov 2011 13:37:41 -0700
Subject: binfmt_elf: fix PIE execution with randomization disabled

The case of address space randomization being disabled in runtime through
randomize_va_space sysctl is not treated properly in load_elf_binary(),
resulting in SIGKILL coming at exec() time for certain PIE-linked binaries
in case the randomization has been disabled at runtime prior to calling
exec().

Handle the randomize_va_space == 0 case the same way as if we were not
supporting .text randomization at all.

Based on original patch by H.J. Lu and Josh Boyer.

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: H.J. Lu <hongjiu.lu@intel.com>
Cc: <stable@kernel.org>
Tested-by: Josh Boyer <jwboyer@redhat.com>
Acked-by: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index dd0fdfc..21ac5ee 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -795,7 +795,16 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 			 * might try to exec.  This is because the brk will
 			 * follow the loader, and is not movable.  */
 #if defined(CONFIG_X86) || defined(CONFIG_ARM)
-			load_bias = 0;
+			/* Memory randomization might have been switched off
+			 * in runtime via sysctl.
+			 * If that is the case, retain the original non-zero
+			 * load_bias value in order to establish proper
+			 * non-randomized mappings.
+			 */
+			if (current->flags & PF_RANDOMIZE)
+				load_bias = 0;
+			else
+				load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
 #else
 			load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
 #endif
-- 
cgit v0.10.2


From 0620d9193cb976ba635d56a6cfd11cb81616d02b Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Wed, 2 Nov 2011 13:37:45 -0700
Subject: ramfs: remove module leftovers

Since ramfs is hard-selected to "y", the module leftovers make no sense.

Signed-off-by: Richard Weinberger <richard@nod.at>
Reviewed-by: WANG Cong <xiyou.wangcong@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index eacb166..462ceb3 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -23,7 +23,6 @@
  * caches is sufficient.
  */
 
-#include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
@@ -288,14 +287,7 @@ static int __init init_ramfs_fs(void)
 {
 	return register_filesystem(&ramfs_fs_type);
 }
-
-static void __exit exit_ramfs_fs(void)
-{
-	unregister_filesystem(&ramfs_fs_type);
-}
-
 module_init(init_ramfs_fs)
-module_exit(exit_ramfs_fs)
 
 int __init init_rootfs(void)
 {
@@ -311,5 +303,3 @@ int __init init_rootfs(void)
 
 	return err;
 }
-
-MODULE_LICENSE("GPL");
-- 
cgit v0.10.2


From f919b9235f930e649b374a50009c6c268bd9a073 Mon Sep 17 00:00:00 2001
From: Neil Armstrong <narmstrong@neotion.com>
Date: Wed, 2 Nov 2011 13:37:47 -0700
Subject: init/do_mounts_rd.c: fix ramdisk identification for padded cramfs

When a cramfs ramdisk padded with 512 bytes is given to the kernel, the
current identify_ramdisk_image function fails to identify it.

Tested with a padded cramfs image on an ARM based board.

Signed-off-by: Neil Armstrong <narmstrong@neotion.com>
Cc: Namhyung Kim <namhyung@gmail.com>
Cc: Davidlohr Bueso <dave@gnu.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c
index fe9acb0..887629e 100644
--- a/init/do_mounts_rd.c
+++ b/init/do_mounts_rd.c
@@ -120,6 +120,20 @@ identify_ramdisk_image(int fd, int start_block, decompress_fn *decompressor)
 	}
 
 	/*
+	 * Read 512 bytes further to check if cramfs is padded
+	 */
+	sys_lseek(fd, start_block * BLOCK_SIZE + 0x200, 0);
+	sys_read(fd, buf, size);
+
+	if (cramfsb->magic == CRAMFS_MAGIC) {
+		printk(KERN_NOTICE
+		       "RAMDISK: cramfs filesystem found at block %d\n",
+		       start_block);
+		nblocks = (cramfsb->size + BLOCK_SIZE - 1) >> BLOCK_SIZE_BITS;
+		goto done;
+	}
+
+	/*
 	 * Read block 1 to test for minix and ext2 superblock
 	 */
 	sys_lseek(fd, (start_block+1) * BLOCK_SIZE, 0);
-- 
cgit v0.10.2


From 6d03d06db8881f4f9da87d5c77234b98c40a30e9 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <jic23@cam.ac.uk>
Date: Wed, 2 Nov 2011 13:37:49 -0700
Subject: drivers/rtc/class.c: convert idr to ida and use ida_simple_get()

This is the one use of an ida that doesn't retry on receiving -EAGAIN.
I'm assuming do so will cause no harm and may help on a rare occasion.

Signed-off-by: Jonathan Cameron <jic23@cam.ac.uk>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c
index 01a7df5..e8326f2 100644
--- a/drivers/rtc/class.c
+++ b/drivers/rtc/class.c
@@ -21,16 +21,13 @@
 #include "rtc-core.h"
 
 
-static DEFINE_IDR(rtc_idr);
-static DEFINE_MUTEX(idr_lock);
+static DEFINE_IDA(rtc_ida);
 struct class *rtc_class;
 
 static void rtc_device_release(struct device *dev)
 {
 	struct rtc_device *rtc = to_rtc_device(dev);
-	mutex_lock(&idr_lock);
-	idr_remove(&rtc_idr, rtc->id);
-	mutex_unlock(&idr_lock);
+	ida_simple_remove(&rtc_ida, rtc->id);
 	kfree(rtc);
 }
 
@@ -146,25 +143,16 @@ struct rtc_device *rtc_device_register(const char *name, struct device *dev,
 	struct rtc_wkalrm alrm;
 	int id, err;
 
-	if (idr_pre_get(&rtc_idr, GFP_KERNEL) == 0) {
-		err = -ENOMEM;
+	id = ida_simple_get(&rtc_ida, 0, 0, GFP_KERNEL);
+	if (id < 0) {
+		err = id;
 		goto exit;
 	}
 
-
-	mutex_lock(&idr_lock);
-	err = idr_get_new(&rtc_idr, NULL, &id);
-	mutex_unlock(&idr_lock);
-
-	if (err < 0)
-		goto exit;
-
-	id = id & MAX_ID_MASK;
-
 	rtc = kzalloc(sizeof(struct rtc_device), GFP_KERNEL);
 	if (rtc == NULL) {
 		err = -ENOMEM;
-		goto exit_idr;
+		goto exit_ida;
 	}
 
 	rtc->id = id;
@@ -222,10 +210,8 @@ struct rtc_device *rtc_device_register(const char *name, struct device *dev,
 exit_kfree:
 	kfree(rtc);
 
-exit_idr:
-	mutex_lock(&idr_lock);
-	idr_remove(&rtc_idr, id);
-	mutex_unlock(&idr_lock);
+exit_ida:
+	ida_simple_remove(&rtc_ida, id);
 
 exit:
 	dev_err(dev, "rtc core: unable to register %s, err = %d\n",
@@ -276,7 +262,7 @@ static void __exit rtc_exit(void)
 {
 	rtc_dev_exit();
 	class_destroy(rtc_class);
-	idr_destroy(&rtc_idr);
+	ida_destroy(&rtc_ida);
 }
 
 subsys_initcall(rtc_init);
-- 
cgit v0.10.2


From 43fcb81550f7a16be192b19c77a379c9b27b1585 Mon Sep 17 00:00:00 2001
From: David Anders <danders.dev@gmail.com>
Date: Wed, 2 Nov 2011 13:37:53 -0700
Subject: rtc: add initial support for mcp7941x parts

Add initial support for the microchip mcp7941x series of real time clocks.

The mcp7941x series is generally compatible with the ds1307 and ds1337 rtc
devices from dallas semiconductor.  minor differences include a backup
battery enable bit, and the polarity of the oscillator enable bit.

Signed-off-by: David Anders <danders.dev@gmail.com>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Reviewed-by: Wolfram Sang <w.sang@pengutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c
index b2005b4..62b0763 100644
--- a/drivers/rtc/rtc-ds1307.c
+++ b/drivers/rtc/rtc-ds1307.c
@@ -34,6 +34,7 @@ enum ds_type {
 	ds_1388,
 	ds_3231,
 	m41t00,
+	mcp7941x,
 	rx_8025,
 	// rs5c372 too?  different address...
 };
@@ -43,6 +44,7 @@ enum ds_type {
 #define DS1307_REG_SECS		0x00	/* 00-59 */
 #	define DS1307_BIT_CH		0x80
 #	define DS1340_BIT_nEOSC		0x80
+#	define MCP7941X_BIT_ST		0x80
 #define DS1307_REG_MIN		0x01	/* 00-59 */
 #define DS1307_REG_HOUR		0x02	/* 00-23, or 1-12{am,pm} */
 #	define DS1307_BIT_12HR		0x40	/* in REG_HOUR */
@@ -50,6 +52,7 @@ enum ds_type {
 #	define DS1340_BIT_CENTURY_EN	0x80	/* in REG_HOUR */
 #	define DS1340_BIT_CENTURY	0x40	/* in REG_HOUR */
 #define DS1307_REG_WDAY		0x03	/* 01-07 */
+#	define MCP7941X_BIT_VBATEN	0x08
 #define DS1307_REG_MDAY		0x04	/* 01-31 */
 #define DS1307_REG_MONTH	0x05	/* 01-12 */
 #	define DS1337_BIT_CENTURY	0x80	/* in REG_MONTH */
@@ -137,6 +140,8 @@ static const struct chip_desc chips[] = {
 },
 [m41t00] = {
 },
+[mcp7941x] = {
+},
 [rx_8025] = {
 }, };
 
@@ -149,6 +154,7 @@ static const struct i2c_device_id ds1307_id[] = {
 	{ "ds1340", ds_1340 },
 	{ "ds3231", ds_3231 },
 	{ "m41t00", m41t00 },
+	{ "mcp7941x", mcp7941x },
 	{ "pt7c4338", ds_1307 },
 	{ "rx8025", rx_8025 },
 	{ }
@@ -365,6 +371,10 @@ static int ds1307_set_time(struct device *dev, struct rtc_time *t)
 		buf[DS1307_REG_HOUR] |= DS1340_BIT_CENTURY_EN
 				| DS1340_BIT_CENTURY;
 		break;
+	case mcp7941x:
+		buf[DS1307_REG_SECS] |= MCP7941X_BIT_ST;
+		buf[DS1307_REG_WDAY] |= MCP7941X_BIT_VBATEN;
+		break;
 	default:
 		break;
 	}
@@ -809,6 +819,23 @@ read_rtc:
 			dev_warn(&client->dev, "SET TIME!\n");
 		}
 		break;
+	case mcp7941x:
+		/* make sure that the backup battery is enabled */
+		if (!(ds1307->regs[DS1307_REG_WDAY] & MCP7941X_BIT_VBATEN)) {
+			i2c_smbus_write_byte_data(client, DS1307_REG_WDAY,
+					ds1307->regs[DS1307_REG_WDAY]
+					| MCP7941X_BIT_VBATEN);
+		}
+
+		/* clock halted?  turn it on, so clock can tick. */
+		if (!(tmp & MCP7941X_BIT_ST)) {
+			i2c_smbus_write_byte_data(client, DS1307_REG_SECS,
+					MCP7941X_BIT_ST);
+			dev_warn(&client->dev, "SET TIME!\n");
+			goto read_rtc;
+		}
+
+		break;
 	case rx_8025:
 	case ds_1337:
 	case ds_1339:
-- 
cgit v0.10.2


From db5cf8d1ac4ac3fa06d89345154ce20068aeb097 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Wed, 2 Nov 2011 13:37:56 -0700
Subject: drivers/rtc/rtc-mc13xxx.c: move probe and remove callbacks to
 .init.text and .exit.text
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The driver is added using platform_driver_probe(), so the callbacks can be
discarded more aggessively.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/rtc/rtc-mc13xxx.c b/drivers/rtc/rtc-mc13xxx.c
index a1a278b..9d0c3b4 100644
--- a/drivers/rtc/rtc-mc13xxx.c
+++ b/drivers/rtc/rtc-mc13xxx.c
@@ -309,7 +309,7 @@ static irqreturn_t mc13xxx_rtc_reset_handler(int irq, void *dev)
 	return IRQ_HANDLED;
 }
 
-static int __devinit mc13xxx_rtc_probe(struct platform_device *pdev)
+static int __init mc13xxx_rtc_probe(struct platform_device *pdev)
 {
 	int ret;
 	struct mc13xxx_rtc *priv;
@@ -378,7 +378,7 @@ err_reset_irq_request:
 	return ret;
 }
 
-static int __devexit mc13xxx_rtc_remove(struct platform_device *pdev)
+static int __exit mc13xxx_rtc_remove(struct platform_device *pdev)
 {
 	struct mc13xxx_rtc *priv = platform_get_drvdata(pdev);
 
@@ -410,7 +410,7 @@ const struct platform_device_id mc13xxx_rtc_idtable[] = {
 
 static struct platform_driver mc13xxx_rtc_driver = {
 	.id_table = mc13xxx_rtc_idtable,
-	.remove = __devexit_p(mc13xxx_rtc_remove),
+	.remove = __exit_p(mc13xxx_rtc_remove),
 	.driver = {
 		.name = DRIVER_NAME,
 		.owner = THIS_MODULE,
-- 
cgit v0.10.2


From b6eb48d02dc73d19bebc396a9e92dd64a65d3199 Mon Sep 17 00:00:00 2001
From: Sami Kerola <kerolasa@iki.fi>
Date: Wed, 2 Nov 2011 13:37:58 -0700
Subject: minix: describe usage of different magic numbers

One can get this information from minix/inode.c, but adding the
explanations at the definition sites is more appropriate.

Signed-off-by: Sami Kerola <kerolasa@iki.fi>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/magic.h b/include/linux/magic.h
index 1e5df2a..2d4beab 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -30,11 +30,11 @@
 #define ANON_INODE_FS_MAGIC	0x09041934
 #define PSTOREFS_MAGIC		0x6165676C
 
-#define MINIX_SUPER_MAGIC	0x137F		/* original minix fs */
-#define MINIX_SUPER_MAGIC2	0x138F		/* minix fs, 30 char names */
-#define MINIX2_SUPER_MAGIC	0x2468		/* minix V2 fs */
-#define MINIX2_SUPER_MAGIC2	0x2478		/* minix V2 fs, 30 char names */
-#define MINIX3_SUPER_MAGIC	0x4d5a		/* minix V3 fs */
+#define MINIX_SUPER_MAGIC	0x137F		/* minix v1 fs, 14 char names */
+#define MINIX_SUPER_MAGIC2	0x138F		/* minix v1 fs, 30 char names */
+#define MINIX2_SUPER_MAGIC	0x2468		/* minix v2 fs, 14 char names */
+#define MINIX2_SUPER_MAGIC2	0x2478		/* minix v2 fs, 30 char names */
+#define MINIX3_SUPER_MAGIC	0x4d5a		/* minix v3 fs, 60 char names */
 
 #define MSDOS_SUPER_MAGIC	0x4d44		/* MD */
 #define NCP_SUPER_MAGIC		0x564c		/* Guess, what 0x564c is :-) */
-- 
cgit v0.10.2


From 3069083cc8def2ffad8520f0f24c6f95f140aac5 Mon Sep 17 00:00:00 2001
From: Namjae Jeon <linkinjeon@gmail.com>
Date: Wed, 2 Nov 2011 13:38:00 -0700
Subject: isofs: add readpages support

Use mpage_readpages() instead of multiple calls to isofs_readpage() to
reduce the CPU utilization and make performance higher.

Signed-off-by: Namjae Jeon <linkinjeon@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index a5d0367..46844ff 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -20,6 +20,7 @@
 #include <linux/statfs.h>
 #include <linux/cdrom.h>
 #include <linux/parser.h>
+#include <linux/mpage.h>
 
 #include "isofs.h"
 #include "zisofs.h"
@@ -1148,7 +1149,13 @@ struct buffer_head *isofs_bread(struct inode *inode, sector_t block)
 
 static int isofs_readpage(struct file *file, struct page *page)
 {
-	return block_read_full_page(page,isofs_get_block);
+	return mpage_readpage(page, isofs_get_block);
+}
+
+static int isofs_readpages(struct file *file, struct address_space *mapping,
+			struct list_head *pages, unsigned nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, isofs_get_block);
 }
 
 static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
@@ -1158,6 +1165,7 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
 
 static const struct address_space_operations isofs_aops = {
 	.readpage = isofs_readpage,
+	.readpages = isofs_readpages,
 	.bmap = _isofs_bmap
 };
 
-- 
cgit v0.10.2


From 434a964daa14b9db083ce20404a4a2add54d037a Mon Sep 17 00:00:00 2001
From: Phillip Lougher <plougher@redhat.com>
Date: Wed, 2 Nov 2011 13:38:01 -0700
Subject: hfs: fix hfs_find_init() sb->ext_tree NULL ptr oops

Clement Lecigne reports a filesystem which causes a kernel oops in
hfs_find_init() trying to dereference sb->ext_tree which is NULL.

This proves to be because the filesystem has a corrupted MDB extent
record, where the extents file does not fit into the first three extents
in the file record (the first blocks).

In hfs_get_block() when looking up the blocks for the extent file
(HFS_EXT_CNID), it fails the first blocks special case, and falls
through to the extent code (which ultimately calls hfs_find_init())
which is in the process of being initialised.

Hfs avoids this scenario by always having the extents b-tree fitting
into the first blocks (the extents B-tree can't have overflow extents).

The fix is to check at mount time that the B-tree fits into first
blocks, i.e.  fail if HFS_I(inode)->alloc_blocks >=
HFS_I(inode)->first_blocks

Note, the existing commit 47f365eb57573 ("hfs: fix oops on mount with
corrupted btree extent records") becomes subsumed into this as a special
case, but only for the extents B-tree (HFS_EXT_CNID), it is perfectly
acceptable for the catalog B-Tree file to grow beyond three extents,
with the remaining extent descriptors in the extents overfow.

This fixes CVE-2011-2203

Reported-by: Clement LECIGNE <clement.lecigne@netasq.com>
Signed-off-by: Phillip Lougher <plougher@redhat.com>
Cc: Jeff Mahoney <jeffm@suse.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 3ebc437..1cbdeea 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -46,11 +46,26 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
 	case HFS_EXT_CNID:
 		hfs_inode_read_fork(tree->inode, mdb->drXTExtRec, mdb->drXTFlSize,
 				    mdb->drXTFlSize, be32_to_cpu(mdb->drXTClpSiz));
+		if (HFS_I(tree->inode)->alloc_blocks >
+					HFS_I(tree->inode)->first_blocks) {
+			printk(KERN_ERR "hfs: invalid btree extent records\n");
+			unlock_new_inode(tree->inode);
+			goto free_inode;
+		}
+
 		tree->inode->i_mapping->a_ops = &hfs_btree_aops;
 		break;
 	case HFS_CAT_CNID:
 		hfs_inode_read_fork(tree->inode, mdb->drCTExtRec, mdb->drCTFlSize,
 				    mdb->drCTFlSize, be32_to_cpu(mdb->drCTClpSiz));
+
+		if (!HFS_I(tree->inode)->first_blocks) {
+			printk(KERN_ERR "hfs: invalid btree extent records "
+								"(0 size).\n");
+			unlock_new_inode(tree->inode);
+			goto free_inode;
+		}
+
 		tree->inode->i_mapping->a_ops = &hfs_btree_aops;
 		break;
 	default:
@@ -59,11 +74,6 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
 	}
 	unlock_new_inode(tree->inode);
 
-	if (!HFS_I(tree->inode)->first_blocks) {
-		printk(KERN_ERR "hfs: invalid btree extent records (0 size).\n");
-		goto free_inode;
-	}
-
 	mapping = tree->inode->i_mapping;
 	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
-- 
cgit v0.10.2


From 33ef6b6984403a688189317ef46bb3caab3b70e0 Mon Sep 17 00:00:00 2001
From: Ben Blum <bblum@andrew.cmu.edu>
Date: Wed, 2 Nov 2011 13:38:05 -0700
Subject: cgroups: more safe tasklist locking in cgroup_attach_proc

Fix unstable tasklist locking in cgroup_attach_proc.

According to this thread - https://lkml.org/lkml/2011/7/27/243 - RCU is
not sufficient to guarantee the tasklist is stable w.r.t.  de_thread and
exit.  Taking tasklist_lock for reading, instead of rcu_read_lock, ensures
proper exclusion.

Signed-off-by: Ben Blum <bblum@andrew.cmu.edu>
Acked-by: Paul Menage <paul@paulmenage.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 453100a..64b0e73 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2027,7 +2027,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 		goto out_free_group_list;
 
 	/* prevent changes to the threadgroup list while we take a snapshot. */
-	rcu_read_lock();
+	read_lock(&tasklist_lock);
 	if (!thread_group_leader(leader)) {
 		/*
 		 * a race with de_thread from another thread's exec() may strip
@@ -2036,7 +2036,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 		 * throw this task away and try again (from cgroup_procs_write);
 		 * this is "double-double-toil-and-trouble-check locking".
 		 */
-		rcu_read_unlock();
+		read_unlock(&tasklist_lock);
 		retval = -EAGAIN;
 		goto out_free_group_list;
 	}
@@ -2057,7 +2057,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	} while_each_thread(leader, tsk);
 	/* remember the number of threads in the array for later. */
 	group_size = i;
-	rcu_read_unlock();
+	read_unlock(&tasklist_lock);
 
 	/*
 	 * step 1: check that we can legitimately attach to the cgroup.
-- 
cgit v0.10.2


From 77ceab8ea590d7dc6c8f055ce43dfebd74428107 Mon Sep 17 00:00:00 2001
From: Ben Blum <bblum@andrew.cmu.edu>
Date: Wed, 2 Nov 2011 13:38:07 -0700
Subject: cgroups: don't attach task to subsystem if migration failed

If a task has exited to the point it has called cgroup_exit() already,
then we can't migrate it to another cgroup anymore.

This can happen when we are attaching a task to a new cgroup between the
call to ->can_attach_task() on subsystems and the migration that is
eventually tried in cgroup_task_migrate().

In this case cgroup_task_migrate() returns -ESRCH and we don't want to
attach the task to the subsystems because the attachment to the new cgroup
itself failed.

Fix this by only calling ->attach_task() on the subsystems if the cgroup
migration succeeded.

Reported-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Ben Blum <bblum@andrew.cmu.edu>
Acked-by: Paul Menage <paul@paulmenage.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 64b0e73..8386b21 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2135,14 +2135,17 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 		oldcgrp = task_cgroup_from_root(tsk, root);
 		if (cgrp == oldcgrp)
 			continue;
-		/* attach each task to each subsystem */
-		for_each_subsys(root, ss) {
-			if (ss->attach_task)
-				ss->attach_task(cgrp, tsk);
-		}
 		/* if the thread is PF_EXITING, it can just get skipped. */
 		retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
-		BUG_ON(retval != 0 && retval != -ESRCH);
+		if (retval == 0) {
+			/* attach each task to each subsystem */
+			for_each_subsys(root, ss) {
+				if (ss->attach_task)
+					ss->attach_task(cgrp, tsk);
+			}
+		} else {
+			BUG_ON(retval != -ESRCH);
+		}
 	}
 	/* nothing is sensitive to fork() after this point. */
 
-- 
cgit v0.10.2


From ff7ee93f47151e23601856e7eb5510babf956571 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 2 Nov 2011 13:38:11 -0700
Subject: cgroup/kmemleak: Annotate alloc_page() for cgroup allocations

When the cgroup base was allocated with kmalloc, it was necessary to
annotate the variable with kmemleak_not_leak().  But because it has
recently been changed to be allocated with alloc_page() (which skips
kmemleak checks) causes a warning on boot up.

I was triggering this output:

 allocated 8388608 bytes of page_cgroup
 please try 'cgroup_disable=memory' option if you don't want memory cgroups
 kmemleak: Trying to color unknown object at 0xf5840000 as Grey
 Pid: 0, comm: swapper Not tainted 3.0.0-test #12
 Call Trace:
  [<c17e34e6>] ? printk+0x1d/0x1f^M
  [<c10e2941>] paint_ptr+0x4f/0x78
  [<c178ab57>] kmemleak_not_leak+0x58/0x7d
  [<c108ae9f>] ? __rcu_read_unlock+0x9/0x7d
  [<c1cdb462>] kmemleak_init+0x19d/0x1e9
  [<c1cbf771>] start_kernel+0x346/0x3ec
  [<c1cbf1b4>] ? loglevel+0x18/0x18
  [<c1cbf0aa>] i386_start_kernel+0xaa/0xb0

After a bit of debugging I tracked the object 0xf840000 (and others) down
to the cgroup code.  The change from allocating base with kmalloc to
alloc_page() has the base not calling kmemleak_alloc() which adds the
pointer to the object_tree_root, but kmemleak_not_leak() adds it to the
crt_early_log[] table.  On kmemleak_init(), the entry is found in the
early_log[] but not the object_tree_root, and this error message is
displayed.

If alloc_page() fails then it defaults back to vmalloc() which still uses
the kmemleak_alloc() which makes us still need the kmemleak_not_leak()
call.  The solution is to call the kmemleak_alloc() directly if the
alloc_page() succeeds.

Reviewed-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Jonathan Nieder <jrnieder@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 6bdc67d..3749ae1 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -133,10 +133,13 @@ struct page *lookup_cgroup_page(struct page_cgroup *pc)
 static void *__meminit alloc_page_cgroup(size_t size, int nid)
 {
 	void *addr = NULL;
+	gfp_t flags = GFP_KERNEL | __GFP_NOWARN;
 
-	addr = alloc_pages_exact_nid(nid, size, GFP_KERNEL | __GFP_NOWARN);
-	if (addr)
+	addr = alloc_pages_exact_nid(nid, size, flags);
+	if (addr) {
+		kmemleak_alloc(addr, size, 1, flags);
 		return addr;
+	}
 
 	if (node_state(nid, N_HIGH_MEMORY))
 		addr = vmalloc_node(size, nid);
-- 
cgit v0.10.2


From c0ff4b8540a5c158b8e5bafb7d767298b67b0b92 Mon Sep 17 00:00:00 2001
From: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Date: Wed, 2 Nov 2011 13:38:15 -0700
Subject: memcg: rename mem variable to memcg

The memcg code sometimes uses "struct mem_cgroup *mem" and sometimes uses
"struct mem_cgroup *memcg".  Rename all mem variables to memcg in source
file.

Signed-off-by: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ac797fa..05206aa 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -78,8 +78,8 @@ extern void mem_cgroup_uncharge_end(void);
 extern void mem_cgroup_uncharge_page(struct page *page);
 extern void mem_cgroup_uncharge_cache_page(struct page *page);
 
-extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask);
-int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem);
+extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask);
+int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg);
 
 extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
 extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
@@ -88,19 +88,19 @@ extern struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm);
 static inline
 int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup)
 {
-	struct mem_cgroup *mem;
+	struct mem_cgroup *memcg;
 	rcu_read_lock();
-	mem = mem_cgroup_from_task(rcu_dereference((mm)->owner));
+	memcg = mem_cgroup_from_task(rcu_dereference((mm)->owner));
 	rcu_read_unlock();
-	return cgroup == mem;
+	return cgroup == memcg;
 }
 
-extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem);
+extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
 
 extern int
 mem_cgroup_prepare_migration(struct page *page,
 	struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask);
-extern void mem_cgroup_end_migration(struct mem_cgroup *mem,
+extern void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 	struct page *oldpage, struct page *newpage, bool migration_ok);
 
 /*
@@ -148,7 +148,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 						gfp_t gfp_mask,
 						unsigned long *total_scanned);
-u64 mem_cgroup_get_limit(struct mem_cgroup *mem);
+u64 mem_cgroup_get_limit(struct mem_cgroup *memcg);
 
 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -244,18 +244,20 @@ static inline struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm
 	return NULL;
 }
 
-static inline int mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *mem)
+static inline int mm_match_cgroup(struct mm_struct *mm,
+		struct mem_cgroup *memcg)
 {
 	return 1;
 }
 
 static inline int task_in_mem_cgroup(struct task_struct *task,
-				     const struct mem_cgroup *mem)
+				     const struct mem_cgroup *memcg)
 {
 	return 1;
 }
 
-static inline struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
+static inline struct cgroup_subsys_state
+		*mem_cgroup_css(struct mem_cgroup *memcg)
 {
 	return NULL;
 }
@@ -267,22 +269,22 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
 	return 0;
 }
 
-static inline void mem_cgroup_end_migration(struct mem_cgroup *mem,
+static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 		struct page *oldpage, struct page *newpage, bool migration_ok)
 {
 }
 
-static inline int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
+static inline int mem_cgroup_get_reclaim_priority(struct mem_cgroup *memcg)
 {
 	return 0;
 }
 
-static inline void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem,
+static inline void mem_cgroup_note_reclaim_priority(struct mem_cgroup *memcg,
 						int priority)
 {
 }
 
-static inline void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem,
+static inline void mem_cgroup_record_reclaim_priority(struct mem_cgroup *memcg,
 						int priority)
 {
 }
@@ -348,7 +350,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 }
 
 static inline
-u64 mem_cgroup_get_limit(struct mem_cgroup *mem)
+u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
 {
 	return 0;
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2d57555..9e38abd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -201,8 +201,8 @@ struct mem_cgroup_eventfd_list {
 	struct eventfd_ctx *eventfd;
 };
 
-static void mem_cgroup_threshold(struct mem_cgroup *mem);
-static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
+static void mem_cgroup_threshold(struct mem_cgroup *memcg);
+static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
 
 /*
  * The memory controller data structure. The memory controller controls both
@@ -362,29 +362,29 @@ enum charge_type {
 #define MEM_CGROUP_RECLAIM_SOFT_BIT	0x2
 #define MEM_CGROUP_RECLAIM_SOFT		(1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
 
-static void mem_cgroup_get(struct mem_cgroup *mem);
-static void mem_cgroup_put(struct mem_cgroup *mem);
-static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
-static void drain_all_stock_async(struct mem_cgroup *mem);
+static void mem_cgroup_get(struct mem_cgroup *memcg);
+static void mem_cgroup_put(struct mem_cgroup *memcg);
+static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
+static void drain_all_stock_async(struct mem_cgroup *memcg);
 
 static struct mem_cgroup_per_zone *
-mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
+mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
 {
-	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
+	return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
 }
 
-struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
+struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
 {
-	return &mem->css;
+	return &memcg->css;
 }
 
 static struct mem_cgroup_per_zone *
-page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page)
+page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 {
 	int nid = page_to_nid(page);
 	int zid = page_zonenum(page);
 
-	return mem_cgroup_zoneinfo(mem, nid, zid);
+	return mem_cgroup_zoneinfo(memcg, nid, zid);
 }
 
 static struct mem_cgroup_tree_per_zone *
@@ -403,7 +403,7 @@ soft_limit_tree_from_page(struct page *page)
 }
 
 static void
-__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
+__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
 				struct mem_cgroup_per_zone *mz,
 				struct mem_cgroup_tree_per_zone *mctz,
 				unsigned long long new_usage_in_excess)
@@ -437,7 +437,7 @@ __mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
 }
 
 static void
-__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
+__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
 				struct mem_cgroup_per_zone *mz,
 				struct mem_cgroup_tree_per_zone *mctz)
 {
@@ -448,17 +448,17 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
 }
 
 static void
-mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
+mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
 				struct mem_cgroup_per_zone *mz,
 				struct mem_cgroup_tree_per_zone *mctz)
 {
 	spin_lock(&mctz->lock);
-	__mem_cgroup_remove_exceeded(mem, mz, mctz);
+	__mem_cgroup_remove_exceeded(memcg, mz, mctz);
 	spin_unlock(&mctz->lock);
 }
 
 
-static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
+static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
 {
 	unsigned long long excess;
 	struct mem_cgroup_per_zone *mz;
@@ -471,9 +471,9 @@ static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
 	 * Necessary to update all ancestors when hierarchy is used.
 	 * because their event counter is not touched.
 	 */
-	for (; mem; mem = parent_mem_cgroup(mem)) {
-		mz = mem_cgroup_zoneinfo(mem, nid, zid);
-		excess = res_counter_soft_limit_excess(&mem->res);
+	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+		mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+		excess = res_counter_soft_limit_excess(&memcg->res);
 		/*
 		 * We have to update the tree if mz is on RB-tree or
 		 * mem is over its softlimit.
@@ -482,18 +482,18 @@ static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
 			spin_lock(&mctz->lock);
 			/* if on-tree, remove it */
 			if (mz->on_tree)
-				__mem_cgroup_remove_exceeded(mem, mz, mctz);
+				__mem_cgroup_remove_exceeded(memcg, mz, mctz);
 			/*
 			 * Insert again. mz->usage_in_excess will be updated.
 			 * If excess is 0, no tree ops.
 			 */
-			__mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
+			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
 			spin_unlock(&mctz->lock);
 		}
 	}
 }
 
-static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
 {
 	int node, zone;
 	struct mem_cgroup_per_zone *mz;
@@ -501,9 +501,9 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
 
 	for_each_node_state(node, N_POSSIBLE) {
 		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
-			mz = mem_cgroup_zoneinfo(mem, node, zone);
+			mz = mem_cgroup_zoneinfo(memcg, node, zone);
 			mctz = soft_limit_tree_node_zone(node, zone);
-			mem_cgroup_remove_exceeded(mem, mz, mctz);
+			mem_cgroup_remove_exceeded(memcg, mz, mctz);
 		}
 	}
 }
@@ -564,7 +564,7 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
  * common workload, threashold and synchonization as vmstat[] should be
  * implemented.
  */
-static long mem_cgroup_read_stat(struct mem_cgroup *mem,
+static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
 				 enum mem_cgroup_stat_index idx)
 {
 	long val = 0;
@@ -572,81 +572,83 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem,
 
 	get_online_cpus();
 	for_each_online_cpu(cpu)
-		val += per_cpu(mem->stat->count[idx], cpu);
+		val += per_cpu(memcg->stat->count[idx], cpu);
 #ifdef CONFIG_HOTPLUG_CPU
-	spin_lock(&mem->pcp_counter_lock);
-	val += mem->nocpu_base.count[idx];
-	spin_unlock(&mem->pcp_counter_lock);
+	spin_lock(&memcg->pcp_counter_lock);
+	val += memcg->nocpu_base.count[idx];
+	spin_unlock(&memcg->pcp_counter_lock);
 #endif
 	put_online_cpus();
 	return val;
 }
 
-static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
+static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
 					 bool charge)
 {
 	int val = (charge) ? 1 : -1;
-	this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
+	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
 }
 
-void mem_cgroup_pgfault(struct mem_cgroup *mem, int val)
+void mem_cgroup_pgfault(struct mem_cgroup *memcg, int val)
 {
-	this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
+	this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
 }
 
-void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val)
+void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, int val)
 {
-	this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
+	this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
 }
 
-static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem,
+static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
 					    enum mem_cgroup_events_index idx)
 {
 	unsigned long val = 0;
 	int cpu;
 
 	for_each_online_cpu(cpu)
-		val += per_cpu(mem->stat->events[idx], cpu);
+		val += per_cpu(memcg->stat->events[idx], cpu);
 #ifdef CONFIG_HOTPLUG_CPU
-	spin_lock(&mem->pcp_counter_lock);
-	val += mem->nocpu_base.events[idx];
-	spin_unlock(&mem->pcp_counter_lock);
+	spin_lock(&memcg->pcp_counter_lock);
+	val += memcg->nocpu_base.events[idx];
+	spin_unlock(&memcg->pcp_counter_lock);
 #endif
 	return val;
 }
 
-static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
+static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 					 bool file, int nr_pages)
 {
 	preempt_disable();
 
 	if (file)
-		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);
+		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
+				nr_pages);
 	else
-		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);
+		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
+				nr_pages);
 
 	/* pagein of a big page is an event. So, ignore page size */
 	if (nr_pages > 0)
-		__this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
+		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
 	else {
-		__this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
+		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
 		nr_pages = -nr_pages; /* for event */
 	}
 
-	__this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
+	__this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
 
 	preempt_enable();
 }
 
 unsigned long
-mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid,
+mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
 			unsigned int lru_mask)
 {
 	struct mem_cgroup_per_zone *mz;
 	enum lru_list l;
 	unsigned long ret = 0;
 
-	mz = mem_cgroup_zoneinfo(mem, nid, zid);
+	mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 
 	for_each_lru(l) {
 		if (BIT(l) & lru_mask)
@@ -656,44 +658,45 @@ mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid,
 }
 
 static unsigned long
-mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem,
+mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 			int nid, unsigned int lru_mask)
 {
 	u64 total = 0;
 	int zid;
 
 	for (zid = 0; zid < MAX_NR_ZONES; zid++)
-		total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask);
+		total += mem_cgroup_zone_nr_lru_pages(memcg,
+						nid, zid, lru_mask);
 
 	return total;
 }
 
-static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem,
+static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
 			unsigned int lru_mask)
 {
 	int nid;
 	u64 total = 0;
 
 	for_each_node_state(nid, N_HIGH_MEMORY)
-		total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask);
+		total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
 	return total;
 }
 
-static bool __memcg_event_check(struct mem_cgroup *mem, int target)
+static bool __memcg_event_check(struct mem_cgroup *memcg, int target)
 {
 	unsigned long val, next;
 
-	val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
-	next = this_cpu_read(mem->stat->targets[target]);
+	val = this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
+	next = this_cpu_read(memcg->stat->targets[target]);
 	/* from time_after() in jiffies.h */
 	return ((long)next - (long)val < 0);
 }
 
-static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
+static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target)
 {
 	unsigned long val, next;
 
-	val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
+	val = this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
 
 	switch (target) {
 	case MEM_CGROUP_TARGET_THRESH:
@@ -709,30 +712,30 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
 		return;
 	}
 
-	this_cpu_write(mem->stat->targets[target], next);
+	this_cpu_write(memcg->stat->targets[target], next);
 }
 
 /*
  * Check events in order.
  *
  */
-static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
+static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 {
 	/* threshold event is triggered in finer grain than soft limit */
-	if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) {
-		mem_cgroup_threshold(mem);
-		__mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
-		if (unlikely(__memcg_event_check(mem,
+	if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_THRESH))) {
+		mem_cgroup_threshold(memcg);
+		__mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_THRESH);
+		if (unlikely(__memcg_event_check(memcg,
 			     MEM_CGROUP_TARGET_SOFTLIMIT))) {
-			mem_cgroup_update_tree(mem, page);
-			__mem_cgroup_target_update(mem,
+			mem_cgroup_update_tree(memcg, page);
+			__mem_cgroup_target_update(memcg,
 						   MEM_CGROUP_TARGET_SOFTLIMIT);
 		}
 #if MAX_NUMNODES > 1
-		if (unlikely(__memcg_event_check(mem,
+		if (unlikely(__memcg_event_check(memcg,
 			MEM_CGROUP_TARGET_NUMAINFO))) {
-			atomic_inc(&mem->numainfo_events);
-			__mem_cgroup_target_update(mem,
+			atomic_inc(&memcg->numainfo_events);
+			__mem_cgroup_target_update(memcg,
 				MEM_CGROUP_TARGET_NUMAINFO);
 		}
 #endif
@@ -762,7 +765,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 
 struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
-	struct mem_cgroup *mem = NULL;
+	struct mem_cgroup *memcg = NULL;
 
 	if (!mm)
 		return NULL;
@@ -773,25 +776,25 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 	 */
 	rcu_read_lock();
 	do {
-		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
-		if (unlikely(!mem))
+		memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+		if (unlikely(!memcg))
 			break;
-	} while (!css_tryget(&mem->css));
+	} while (!css_tryget(&memcg->css));
 	rcu_read_unlock();
-	return mem;
+	return memcg;
 }
 
 /* The caller has to guarantee "mem" exists before calling this */
-static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
+static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *memcg)
 {
 	struct cgroup_subsys_state *css;
 	int found;
 
-	if (!mem) /* ROOT cgroup has the smallest ID */
+	if (!memcg) /* ROOT cgroup has the smallest ID */
 		return root_mem_cgroup; /*css_put/get against root is ignored*/
-	if (!mem->use_hierarchy) {
-		if (css_tryget(&mem->css))
-			return mem;
+	if (!memcg->use_hierarchy) {
+		if (css_tryget(&memcg->css))
+			return memcg;
 		return NULL;
 	}
 	rcu_read_lock();
@@ -799,13 +802,13 @@ static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
 	 * searching a memory cgroup which has the smallest ID under given
 	 * ROOT cgroup. (ID >= 1)
 	 */
-	css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
+	css = css_get_next(&mem_cgroup_subsys, 1, &memcg->css, &found);
 	if (css && css_tryget(css))
-		mem = container_of(css, struct mem_cgroup, css);
+		memcg = container_of(css, struct mem_cgroup, css);
 	else
-		mem = NULL;
+		memcg = NULL;
 	rcu_read_unlock();
-	return mem;
+	return memcg;
 }
 
 static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
@@ -859,29 +862,29 @@ static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
 	for_each_mem_cgroup_tree_cond(iter, NULL, true)
 
 
-static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
+static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 {
-	return (mem == root_mem_cgroup);
+	return (memcg == root_mem_cgroup);
 }
 
 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
 {
-	struct mem_cgroup *mem;
+	struct mem_cgroup *memcg;
 
 	if (!mm)
 		return;
 
 	rcu_read_lock();
-	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
-	if (unlikely(!mem))
+	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+	if (unlikely(!memcg))
 		goto out;
 
 	switch (idx) {
 	case PGMAJFAULT:
-		mem_cgroup_pgmajfault(mem, 1);
+		mem_cgroup_pgmajfault(memcg, 1);
 		break;
 	case PGFAULT:
-		mem_cgroup_pgfault(mem, 1);
+		mem_cgroup_pgfault(memcg, 1);
 		break;
 	default:
 		BUG();
@@ -1063,21 +1066,21 @@ void mem_cgroup_move_lists(struct page *page,
 }
 
 /*
- * Checks whether given mem is same or in the root_mem's
+ * Checks whether given mem is same or in the root_mem_cgroup's
  * hierarchy subtree
  */
-static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem,
-		struct mem_cgroup *mem)
+static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
+		struct mem_cgroup *memcg)
 {
-	if (root_mem != mem) {
-		return (root_mem->use_hierarchy &&
-			css_is_ancestor(&mem->css, &root_mem->css));
+	if (root_memcg != memcg) {
+		return (root_memcg->use_hierarchy &&
+			css_is_ancestor(&memcg->css, &root_memcg->css));
 	}
 
 	return true;
 }
 
-int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
+int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
 {
 	int ret;
 	struct mem_cgroup *curr = NULL;
@@ -1091,12 +1094,12 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 	if (!curr)
 		return 0;
 	/*
-	 * We should check use_hierarchy of "mem" not "curr". Because checking
+	 * We should check use_hierarchy of "memcg" not "curr". Because checking
 	 * use_hierarchy of "curr" here make this function true if hierarchy is
-	 * enabled in "curr" and "curr" is a child of "mem" in *cgroup*
-	 * hierarchy(even if use_hierarchy is disabled in "mem").
+	 * enabled in "curr" and "curr" is a child of "memcg" in *cgroup*
+	 * hierarchy(even if use_hierarchy is disabled in "memcg").
 	 */
-	ret = mem_cgroup_same_or_subtree(mem, curr);
+	ret = mem_cgroup_same_or_subtree(memcg, curr);
 	css_put(&curr->css);
 	return ret;
 }
@@ -1254,13 +1257,13 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
  * Returns the maximum amount of memory @mem can be charged with, in
  * pages.
  */
-static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
+static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
 {
 	unsigned long long margin;
 
-	margin = res_counter_margin(&mem->res);
+	margin = res_counter_margin(&memcg->res);
 	if (do_swap_account)
-		margin = min(margin, res_counter_margin(&mem->memsw));
+		margin = min(margin, res_counter_margin(&memcg->memsw));
 	return margin >> PAGE_SHIFT;
 }
 
@@ -1275,33 +1278,33 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
 	return memcg->swappiness;
 }
 
-static void mem_cgroup_start_move(struct mem_cgroup *mem)
+static void mem_cgroup_start_move(struct mem_cgroup *memcg)
 {
 	int cpu;
 
 	get_online_cpus();
-	spin_lock(&mem->pcp_counter_lock);
+	spin_lock(&memcg->pcp_counter_lock);
 	for_each_online_cpu(cpu)
-		per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
-	mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
-	spin_unlock(&mem->pcp_counter_lock);
+		per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
+	memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
+	spin_unlock(&memcg->pcp_counter_lock);
 	put_online_cpus();
 
 	synchronize_rcu();
 }
 
-static void mem_cgroup_end_move(struct mem_cgroup *mem)
+static void mem_cgroup_end_move(struct mem_cgroup *memcg)
 {
 	int cpu;
 
-	if (!mem)
+	if (!memcg)
 		return;
 	get_online_cpus();
-	spin_lock(&mem->pcp_counter_lock);
+	spin_lock(&memcg->pcp_counter_lock);
 	for_each_online_cpu(cpu)
-		per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
-	mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
-	spin_unlock(&mem->pcp_counter_lock);
+		per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
+	memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
+	spin_unlock(&memcg->pcp_counter_lock);
 	put_online_cpus();
 }
 /*
@@ -1316,13 +1319,13 @@ static void mem_cgroup_end_move(struct mem_cgroup *mem)
  *			  waiting at hith-memory prressure caused by "move".
  */
 
-static bool mem_cgroup_stealed(struct mem_cgroup *mem)
+static bool mem_cgroup_stealed(struct mem_cgroup *memcg)
 {
 	VM_BUG_ON(!rcu_read_lock_held());
-	return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
+	return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
 }
 
-static bool mem_cgroup_under_move(struct mem_cgroup *mem)
+static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *from;
 	struct mem_cgroup *to;
@@ -1337,17 +1340,17 @@ static bool mem_cgroup_under_move(struct mem_cgroup *mem)
 	if (!from)
 		goto unlock;
 
-	ret = mem_cgroup_same_or_subtree(mem, from)
-		|| mem_cgroup_same_or_subtree(mem, to);
+	ret = mem_cgroup_same_or_subtree(memcg, from)
+		|| mem_cgroup_same_or_subtree(memcg, to);
 unlock:
 	spin_unlock(&mc.lock);
 	return ret;
 }
 
-static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
+static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
 {
 	if (mc.moving_task && current != mc.moving_task) {
-		if (mem_cgroup_under_move(mem)) {
+		if (mem_cgroup_under_move(memcg)) {
 			DEFINE_WAIT(wait);
 			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
 			/* moving charge context might have finished. */
@@ -1431,12 +1434,12 @@ done:
  * This function returns the number of memcg under hierarchy tree. Returns
  * 1(self count) if no children.
  */
-static int mem_cgroup_count_children(struct mem_cgroup *mem)
+static int mem_cgroup_count_children(struct mem_cgroup *memcg)
 {
 	int num = 0;
 	struct mem_cgroup *iter;
 
-	for_each_mem_cgroup_tree(iter, mem)
+	for_each_mem_cgroup_tree(iter, memcg)
 		num++;
 	return num;
 }
@@ -1466,21 +1469,21 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
  * that to reclaim free pages from.
  */
 static struct mem_cgroup *
-mem_cgroup_select_victim(struct mem_cgroup *root_mem)
+mem_cgroup_select_victim(struct mem_cgroup *root_memcg)
 {
 	struct mem_cgroup *ret = NULL;
 	struct cgroup_subsys_state *css;
 	int nextid, found;
 
-	if (!root_mem->use_hierarchy) {
-		css_get(&root_mem->css);
-		ret = root_mem;
+	if (!root_memcg->use_hierarchy) {
+		css_get(&root_memcg->css);
+		ret = root_memcg;
 	}
 
 	while (!ret) {
 		rcu_read_lock();
-		nextid = root_mem->last_scanned_child + 1;
-		css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
+		nextid = root_memcg->last_scanned_child + 1;
+		css = css_get_next(&mem_cgroup_subsys, nextid, &root_memcg->css,
 				   &found);
 		if (css && css_tryget(css))
 			ret = container_of(css, struct mem_cgroup, css);
@@ -1489,9 +1492,9 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 		/* Updates scanning parameter */
 		if (!css) {
 			/* this means start scan from ID:1 */
-			root_mem->last_scanned_child = 0;
+			root_memcg->last_scanned_child = 0;
 		} else
-			root_mem->last_scanned_child = found;
+			root_memcg->last_scanned_child = found;
 	}
 
 	return ret;
@@ -1507,14 +1510,14 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
  * reclaimable pages on a node. Returns true if there are any reclaimable
  * pages in the node.
  */
-static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
+static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
 		int nid, bool noswap)
 {
-	if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE))
+	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
 		return true;
 	if (noswap || !total_swap_pages)
 		return false;
-	if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON))
+	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
 		return true;
 	return false;
 
@@ -1527,29 +1530,29 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
  * nodes based on the zonelist. So update the list loosely once per 10 secs.
  *
  */
-static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
+static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
 {
 	int nid;
 	/*
 	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
 	 * pagein/pageout changes since the last update.
 	 */
-	if (!atomic_read(&mem->numainfo_events))
+	if (!atomic_read(&memcg->numainfo_events))
 		return;
-	if (atomic_inc_return(&mem->numainfo_updating) > 1)
+	if (atomic_inc_return(&memcg->numainfo_updating) > 1)
 		return;
 
 	/* make a nodemask where this memcg uses memory from */
-	mem->scan_nodes = node_states[N_HIGH_MEMORY];
+	memcg->scan_nodes = node_states[N_HIGH_MEMORY];
 
 	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
 
-		if (!test_mem_cgroup_node_reclaimable(mem, nid, false))
-			node_clear(nid, mem->scan_nodes);
+		if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
+			node_clear(nid, memcg->scan_nodes);
 	}
 
-	atomic_set(&mem->numainfo_events, 0);
-	atomic_set(&mem->numainfo_updating, 0);
+	atomic_set(&memcg->numainfo_events, 0);
+	atomic_set(&memcg->numainfo_updating, 0);
 }
 
 /*
@@ -1564,16 +1567,16 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
  *
  * Now, we use round-robin. Better algorithm is welcomed.
  */
-int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 {
 	int node;
 
-	mem_cgroup_may_update_nodemask(mem);
-	node = mem->last_scanned_node;
+	mem_cgroup_may_update_nodemask(memcg);
+	node = memcg->last_scanned_node;
 
-	node = next_node(node, mem->scan_nodes);
+	node = next_node(node, memcg->scan_nodes);
 	if (node == MAX_NUMNODES)
-		node = first_node(mem->scan_nodes);
+		node = first_node(memcg->scan_nodes);
 	/*
 	 * We call this when we hit limit, not when pages are added to LRU.
 	 * No LRU may hold pages because all pages are UNEVICTABLE or
@@ -1583,7 +1586,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
 	if (unlikely(node == MAX_NUMNODES))
 		node = numa_node_id();
 
-	mem->last_scanned_node = node;
+	memcg->last_scanned_node = node;
 	return node;
 }
 
@@ -1593,7 +1596,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
  * unused nodes. But scan_nodes is lazily updated and may not cotain
  * enough new information. We need to do double check.
  */
-bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
 {
 	int nid;
 
@@ -1601,12 +1604,12 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
 	 * quick check...making use of scan_node.
 	 * We can skip unused nodes.
 	 */
-	if (!nodes_empty(mem->scan_nodes)) {
-		for (nid = first_node(mem->scan_nodes);
+	if (!nodes_empty(memcg->scan_nodes)) {
+		for (nid = first_node(memcg->scan_nodes);
 		     nid < MAX_NUMNODES;
-		     nid = next_node(nid, mem->scan_nodes)) {
+		     nid = next_node(nid, memcg->scan_nodes)) {
 
-			if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+			if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
 				return true;
 		}
 	}
@@ -1614,23 +1617,23 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
 	 * Check rest of nodes.
 	 */
 	for_each_node_state(nid, N_HIGH_MEMORY) {
-		if (node_isset(nid, mem->scan_nodes))
+		if (node_isset(nid, memcg->scan_nodes))
 			continue;
-		if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+		if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
 			return true;
 	}
 	return false;
 }
 
 #else
-int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 {
 	return 0;
 }
 
-bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
 {
-	return test_mem_cgroup_node_reclaimable(mem, 0, noswap);
+	return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
 }
 #endif
 
@@ -1639,14 +1642,14 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
  * we reclaimed from, so that we don't end up penalizing one child extensively
  * based on its position in the children list.
  *
- * root_mem is the original ancestor that we've been reclaim from.
+ * root_memcg is the original ancestor that we've been reclaim from.
  *
- * We give up and return to the caller when we visit root_mem twice.
+ * We give up and return to the caller when we visit root_memcg twice.
  * (other groups can be removed while we're walking....)
  *
  * If shrink==true, for avoiding to free too much, this returns immedieately.
  */
-static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
+static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
 						struct zone *zone,
 						gfp_t gfp_mask,
 						unsigned long reclaim_options,
@@ -1661,15 +1664,15 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 	unsigned long excess;
 	unsigned long nr_scanned;
 
-	excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
+	excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
 
 	/* If memsw_is_minimum==1, swap-out is of-no-use. */
-	if (!check_soft && !shrink && root_mem->memsw_is_minimum)
+	if (!check_soft && !shrink && root_memcg->memsw_is_minimum)
 		noswap = true;
 
 	while (1) {
-		victim = mem_cgroup_select_victim(root_mem);
-		if (victim == root_mem) {
+		victim = mem_cgroup_select_victim(root_memcg);
+		if (victim == root_memcg) {
 			loop++;
 			/*
 			 * We are not draining per cpu cached charges during
@@ -1678,7 +1681,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 			 * charges will not give any.
 			 */
 			if (!check_soft && loop >= 1)
-				drain_all_stock_async(root_mem);
+				drain_all_stock_async(root_memcg);
 			if (loop >= 2) {
 				/*
 				 * If we have not been able to reclaim
@@ -1725,9 +1728,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 			return ret;
 		total += ret;
 		if (check_soft) {
-			if (!res_counter_soft_limit_excess(&root_mem->res))
+			if (!res_counter_soft_limit_excess(&root_memcg->res))
 				return total;
-		} else if (mem_cgroup_margin(root_mem))
+		} else if (mem_cgroup_margin(root_memcg))
 			return total;
 	}
 	return total;
@@ -1738,12 +1741,12 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
  * If someone is running, return false.
  * Has to be called with memcg_oom_lock
  */
-static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
+static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter, *failed = NULL;
 	bool cond = true;
 
-	for_each_mem_cgroup_tree_cond(iter, mem, cond) {
+	for_each_mem_cgroup_tree_cond(iter, memcg, cond) {
 		if (iter->oom_lock) {
 			/*
 			 * this subtree of our hierarchy is already locked
@@ -1763,7 +1766,7 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
 	 * what we set up to the failing subtree
 	 */
 	cond = true;
-	for_each_mem_cgroup_tree_cond(iter, mem, cond) {
+	for_each_mem_cgroup_tree_cond(iter, memcg, cond) {
 		if (iter == failed) {
 			cond = false;
 			continue;
@@ -1776,24 +1779,24 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
 /*
  * Has to be called with memcg_oom_lock
  */
-static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
+static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter;
 
-	for_each_mem_cgroup_tree(iter, mem)
+	for_each_mem_cgroup_tree(iter, memcg)
 		iter->oom_lock = false;
 	return 0;
 }
 
-static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem)
+static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter;
 
-	for_each_mem_cgroup_tree(iter, mem)
+	for_each_mem_cgroup_tree(iter, memcg)
 		atomic_inc(&iter->under_oom);
 }
 
-static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem)
+static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter;
 
@@ -1802,7 +1805,7 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem)
 	 * mem_cgroup_oom_lock() may not be called. We have to use
 	 * atomic_add_unless() here.
 	 */
-	for_each_mem_cgroup_tree(iter, mem)
+	for_each_mem_cgroup_tree(iter, memcg)
 		atomic_add_unless(&iter->under_oom, -1, 0);
 }
 
@@ -1817,80 +1820,80 @@ struct oom_wait_info {
 static int memcg_oom_wake_function(wait_queue_t *wait,
 	unsigned mode, int sync, void *arg)
 {
-	struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg,
-			  *oom_wait_mem;
+	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg,
+			  *oom_wait_memcg;
 	struct oom_wait_info *oom_wait_info;
 
 	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
-	oom_wait_mem = oom_wait_info->mem;
+	oom_wait_memcg = oom_wait_info->mem;
 
 	/*
 	 * Both of oom_wait_info->mem and wake_mem are stable under us.
 	 * Then we can use css_is_ancestor without taking care of RCU.
 	 */
-	if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem)
-			&& !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem))
+	if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
+		&& !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
 		return 0;
 	return autoremove_wake_function(wait, mode, sync, arg);
 }
 
-static void memcg_wakeup_oom(struct mem_cgroup *mem)
+static void memcg_wakeup_oom(struct mem_cgroup *memcg)
 {
-	/* for filtering, pass "mem" as argument. */
-	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
+	/* for filtering, pass "memcg" as argument. */
+	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
 }
 
-static void memcg_oom_recover(struct mem_cgroup *mem)
+static void memcg_oom_recover(struct mem_cgroup *memcg)
 {
-	if (mem && atomic_read(&mem->under_oom))
-		memcg_wakeup_oom(mem);
+	if (memcg && atomic_read(&memcg->under_oom))
+		memcg_wakeup_oom(memcg);
 }
 
 /*
  * try to call OOM killer. returns false if we should exit memory-reclaim loop.
  */
-bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
+bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
 {
 	struct oom_wait_info owait;
 	bool locked, need_to_kill;
 
-	owait.mem = mem;
+	owait.mem = memcg;
 	owait.wait.flags = 0;
 	owait.wait.func = memcg_oom_wake_function;
 	owait.wait.private = current;
 	INIT_LIST_HEAD(&owait.wait.task_list);
 	need_to_kill = true;
-	mem_cgroup_mark_under_oom(mem);
+	mem_cgroup_mark_under_oom(memcg);
 
-	/* At first, try to OOM lock hierarchy under mem.*/
+	/* At first, try to OOM lock hierarchy under memcg.*/
 	spin_lock(&memcg_oom_lock);
-	locked = mem_cgroup_oom_lock(mem);
+	locked = mem_cgroup_oom_lock(memcg);
 	/*
 	 * Even if signal_pending(), we can't quit charge() loop without
 	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
 	 * under OOM is always welcomed, use TASK_KILLABLE here.
 	 */
 	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-	if (!locked || mem->oom_kill_disable)
+	if (!locked || memcg->oom_kill_disable)
 		need_to_kill = false;
 	if (locked)
-		mem_cgroup_oom_notify(mem);
+		mem_cgroup_oom_notify(memcg);
 	spin_unlock(&memcg_oom_lock);
 
 	if (need_to_kill) {
 		finish_wait(&memcg_oom_waitq, &owait.wait);
-		mem_cgroup_out_of_memory(mem, mask);
+		mem_cgroup_out_of_memory(memcg, mask);
 	} else {
 		schedule();
 		finish_wait(&memcg_oom_waitq, &owait.wait);
 	}
 	spin_lock(&memcg_oom_lock);
 	if (locked)
-		mem_cgroup_oom_unlock(mem);
-	memcg_wakeup_oom(mem);
+		mem_cgroup_oom_unlock(memcg);
+	memcg_wakeup_oom(memcg);
 	spin_unlock(&memcg_oom_lock);
 
-	mem_cgroup_unmark_under_oom(mem);
+	mem_cgroup_unmark_under_oom(memcg);
 
 	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
 		return false;
@@ -1926,7 +1929,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
 void mem_cgroup_update_page_stat(struct page *page,
 				 enum mem_cgroup_page_stat_item idx, int val)
 {
-	struct mem_cgroup *mem;
+	struct mem_cgroup *memcg;
 	struct page_cgroup *pc = lookup_page_cgroup(page);
 	bool need_unlock = false;
 	unsigned long uninitialized_var(flags);
@@ -1935,16 +1938,16 @@ void mem_cgroup_update_page_stat(struct page *page,
 		return;
 
 	rcu_read_lock();
-	mem = pc->mem_cgroup;
-	if (unlikely(!mem || !PageCgroupUsed(pc)))
+	memcg = pc->mem_cgroup;
+	if (unlikely(!memcg || !PageCgroupUsed(pc)))
 		goto out;
 	/* pc->mem_cgroup is unstable ? */
-	if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {
+	if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) {
 		/* take a lock against to access pc->mem_cgroup */
 		move_lock_page_cgroup(pc, &flags);
 		need_unlock = true;
-		mem = pc->mem_cgroup;
-		if (!mem || !PageCgroupUsed(pc))
+		memcg = pc->mem_cgroup;
+		if (!memcg || !PageCgroupUsed(pc))
 			goto out;
 	}
 
@@ -1960,7 +1963,7 @@ void mem_cgroup_update_page_stat(struct page *page,
 		BUG();
 	}
 
-	this_cpu_add(mem->stat->count[idx], val);
+	this_cpu_add(memcg->stat->count[idx], val);
 
 out:
 	if (unlikely(need_unlock))
@@ -1991,13 +1994,13 @@ static DEFINE_MUTEX(percpu_charge_mutex);
  * cgroup which is not current target, returns false. This stock will be
  * refilled.
  */
-static bool consume_stock(struct mem_cgroup *mem)
+static bool consume_stock(struct mem_cgroup *memcg)
 {
 	struct memcg_stock_pcp *stock;
 	bool ret = true;
 
 	stock = &get_cpu_var(memcg_stock);
-	if (mem == stock->cached && stock->nr_pages)
+	if (memcg == stock->cached && stock->nr_pages)
 		stock->nr_pages--;
 	else /* need to call res_counter_charge */
 		ret = false;
@@ -2038,24 +2041,24 @@ static void drain_local_stock(struct work_struct *dummy)
  * Cache charges(val) which is from res_counter, to local per_cpu area.
  * This will be consumed by consume_stock() function, later.
  */
-static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
+static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 {
 	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
 
-	if (stock->cached != mem) { /* reset if necessary */
+	if (stock->cached != memcg) { /* reset if necessary */
 		drain_stock(stock);
-		stock->cached = mem;
+		stock->cached = memcg;
 	}
 	stock->nr_pages += nr_pages;
 	put_cpu_var(memcg_stock);
 }
 
 /*
- * Drains all per-CPU charge caches for given root_mem resp. subtree
+ * Drains all per-CPU charge caches for given root_memcg resp. subtree
  * of the hierarchy under it. sync flag says whether we should block
  * until the work is done.
  */
-static void drain_all_stock(struct mem_cgroup *root_mem, bool sync)
+static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
 {
 	int cpu, curcpu;
 
@@ -2064,12 +2067,12 @@ static void drain_all_stock(struct mem_cgroup *root_mem, bool sync)
 	curcpu = get_cpu();
 	for_each_online_cpu(cpu) {
 		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
-		struct mem_cgroup *mem;
+		struct mem_cgroup *memcg;
 
-		mem = stock->cached;
-		if (!mem || !stock->nr_pages)
+		memcg = stock->cached;
+		if (!memcg || !stock->nr_pages)
 			continue;
-		if (!mem_cgroup_same_or_subtree(root_mem, mem))
+		if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
 			continue;
 		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
 			if (cpu == curcpu)
@@ -2098,23 +2101,23 @@ out:
  * expects some charges will be back to res_counter later but cannot wait for
  * it.
  */
-static void drain_all_stock_async(struct mem_cgroup *root_mem)
+static void drain_all_stock_async(struct mem_cgroup *root_memcg)
 {
 	/*
 	 * If someone calls draining, avoid adding more kworker runs.
 	 */
 	if (!mutex_trylock(&percpu_charge_mutex))
 		return;
-	drain_all_stock(root_mem, false);
+	drain_all_stock(root_memcg, false);
 	mutex_unlock(&percpu_charge_mutex);
 }
 
 /* This is a synchronous drain interface. */
-static void drain_all_stock_sync(struct mem_cgroup *root_mem)
+static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
 {
 	/* called when force_empty is called */
 	mutex_lock(&percpu_charge_mutex);
-	drain_all_stock(root_mem, true);
+	drain_all_stock(root_memcg, true);
 	mutex_unlock(&percpu_charge_mutex);
 }
 
@@ -2122,35 +2125,35 @@ static void drain_all_stock_sync(struct mem_cgroup *root_mem)
  * This function drains percpu counter value from DEAD cpu and
  * move it to local cpu. Note that this function can be preempted.
  */
-static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
+static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
 {
 	int i;
 
-	spin_lock(&mem->pcp_counter_lock);
+	spin_lock(&memcg->pcp_counter_lock);
 	for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
-		long x = per_cpu(mem->stat->count[i], cpu);
+		long x = per_cpu(memcg->stat->count[i], cpu);
 
-		per_cpu(mem->stat->count[i], cpu) = 0;
-		mem->nocpu_base.count[i] += x;
+		per_cpu(memcg->stat->count[i], cpu) = 0;
+		memcg->nocpu_base.count[i] += x;
 	}
 	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
-		unsigned long x = per_cpu(mem->stat->events[i], cpu);
+		unsigned long x = per_cpu(memcg->stat->events[i], cpu);
 
-		per_cpu(mem->stat->events[i], cpu) = 0;
-		mem->nocpu_base.events[i] += x;
+		per_cpu(memcg->stat->events[i], cpu) = 0;
+		memcg->nocpu_base.events[i] += x;
 	}
 	/* need to clear ON_MOVE value, works as a kind of lock. */
-	per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
-	spin_unlock(&mem->pcp_counter_lock);
+	per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
+	spin_unlock(&memcg->pcp_counter_lock);
 }
 
-static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu)
+static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu)
 {
 	int idx = MEM_CGROUP_ON_MOVE;
 
-	spin_lock(&mem->pcp_counter_lock);
-	per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx];
-	spin_unlock(&mem->pcp_counter_lock);
+	spin_lock(&memcg->pcp_counter_lock);
+	per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx];
+	spin_unlock(&memcg->pcp_counter_lock);
 }
 
 static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
@@ -2188,7 +2191,7 @@ enum {
 	CHARGE_OOM_DIE,		/* the current is killed because of OOM */
 };
 
-static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
+static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 				unsigned int nr_pages, bool oom_check)
 {
 	unsigned long csize = nr_pages * PAGE_SIZE;
@@ -2197,16 +2200,16 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
 	unsigned long flags = 0;
 	int ret;
 
-	ret = res_counter_charge(&mem->res, csize, &fail_res);
+	ret = res_counter_charge(&memcg->res, csize, &fail_res);
 
 	if (likely(!ret)) {
 		if (!do_swap_account)
 			return CHARGE_OK;
-		ret = res_counter_charge(&mem->memsw, csize, &fail_res);
+		ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
 		if (likely(!ret))
 			return CHARGE_OK;
 
-		res_counter_uncharge(&mem->res, csize);
+		res_counter_uncharge(&memcg->res, csize);
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
 		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
 	} else
@@ -2264,12 +2267,12 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
 static int __mem_cgroup_try_charge(struct mm_struct *mm,
 				   gfp_t gfp_mask,
 				   unsigned int nr_pages,
-				   struct mem_cgroup **memcg,
+				   struct mem_cgroup **ptr,
 				   bool oom)
 {
 	unsigned int batch = max(CHARGE_BATCH, nr_pages);
 	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
-	struct mem_cgroup *mem = NULL;
+	struct mem_cgroup *memcg = NULL;
 	int ret;
 
 	/*
@@ -2287,17 +2290,17 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 	 * thread group leader migrates. It's possible that mm is not
 	 * set, if so charge the init_mm (happens for pagecache usage).
 	 */
-	if (!*memcg && !mm)
+	if (!*ptr && !mm)
 		goto bypass;
 again:
-	if (*memcg) { /* css should be a valid one */
-		mem = *memcg;
-		VM_BUG_ON(css_is_removed(&mem->css));
-		if (mem_cgroup_is_root(mem))
+	if (*ptr) { /* css should be a valid one */
+		memcg = *ptr;
+		VM_BUG_ON(css_is_removed(&memcg->css));
+		if (mem_cgroup_is_root(memcg))
 			goto done;
-		if (nr_pages == 1 && consume_stock(mem))
+		if (nr_pages == 1 && consume_stock(memcg))
 			goto done;
-		css_get(&mem->css);
+		css_get(&memcg->css);
 	} else {
 		struct task_struct *p;
 
@@ -2305,7 +2308,7 @@ again:
 		p = rcu_dereference(mm->owner);
 		/*
 		 * Because we don't have task_lock(), "p" can exit.
-		 * In that case, "mem" can point to root or p can be NULL with
+		 * In that case, "memcg" can point to root or p can be NULL with
 		 * race with swapoff. Then, we have small risk of mis-accouning.
 		 * But such kind of mis-account by race always happens because
 		 * we don't have cgroup_mutex(). It's overkill and we allo that
@@ -2313,12 +2316,12 @@ again:
 		 * (*) swapoff at el will charge against mm-struct not against
 		 * task-struct. So, mm->owner can be NULL.
 		 */
-		mem = mem_cgroup_from_task(p);
-		if (!mem || mem_cgroup_is_root(mem)) {
+		memcg = mem_cgroup_from_task(p);
+		if (!memcg || mem_cgroup_is_root(memcg)) {
 			rcu_read_unlock();
 			goto done;
 		}
-		if (nr_pages == 1 && consume_stock(mem)) {
+		if (nr_pages == 1 && consume_stock(memcg)) {
 			/*
 			 * It seems dagerous to access memcg without css_get().
 			 * But considering how consume_stok works, it's not
@@ -2331,7 +2334,7 @@ again:
 			goto done;
 		}
 		/* after here, we may be blocked. we need to get refcnt */
-		if (!css_tryget(&mem->css)) {
+		if (!css_tryget(&memcg->css)) {
 			rcu_read_unlock();
 			goto again;
 		}
@@ -2343,7 +2346,7 @@ again:
 
 		/* If killed, bypass charge */
 		if (fatal_signal_pending(current)) {
-			css_put(&mem->css);
+			css_put(&memcg->css);
 			goto bypass;
 		}
 
@@ -2353,43 +2356,43 @@ again:
 			nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
 		}
 
-		ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check);
+		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check);
 		switch (ret) {
 		case CHARGE_OK:
 			break;
 		case CHARGE_RETRY: /* not in OOM situation but retry */
 			batch = nr_pages;
-			css_put(&mem->css);
-			mem = NULL;
+			css_put(&memcg->css);
+			memcg = NULL;
 			goto again;
 		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
-			css_put(&mem->css);
+			css_put(&memcg->css);
 			goto nomem;
 		case CHARGE_NOMEM: /* OOM routine works */
 			if (!oom) {
-				css_put(&mem->css);
+				css_put(&memcg->css);
 				goto nomem;
 			}
 			/* If oom, we never return -ENOMEM */
 			nr_oom_retries--;
 			break;
 		case CHARGE_OOM_DIE: /* Killed by OOM Killer */
-			css_put(&mem->css);
+			css_put(&memcg->css);
 			goto bypass;
 		}
 	} while (ret != CHARGE_OK);
 
 	if (batch > nr_pages)
-		refill_stock(mem, batch - nr_pages);
-	css_put(&mem->css);
+		refill_stock(memcg, batch - nr_pages);
+	css_put(&memcg->css);
 done:
-	*memcg = mem;
+	*ptr = memcg;
 	return 0;
 nomem:
-	*memcg = NULL;
+	*ptr = NULL;
 	return -ENOMEM;
 bypass:
-	*memcg = NULL;
+	*ptr = NULL;
 	return 0;
 }
 
@@ -2398,15 +2401,15 @@ bypass:
  * This function is for that and do uncharge, put css's refcnt.
  * gotten by try_charge().
  */
-static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
+static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
 				       unsigned int nr_pages)
 {
-	if (!mem_cgroup_is_root(mem)) {
+	if (!mem_cgroup_is_root(memcg)) {
 		unsigned long bytes = nr_pages * PAGE_SIZE;
 
-		res_counter_uncharge(&mem->res, bytes);
+		res_counter_uncharge(&memcg->res, bytes);
 		if (do_swap_account)
-			res_counter_uncharge(&mem->memsw, bytes);
+			res_counter_uncharge(&memcg->memsw, bytes);
 	}
 }
 
@@ -2431,7 +2434,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
 
 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 {
-	struct mem_cgroup *mem = NULL;
+	struct mem_cgroup *memcg = NULL;
 	struct page_cgroup *pc;
 	unsigned short id;
 	swp_entry_t ent;
@@ -2441,23 +2444,23 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 	pc = lookup_page_cgroup(page);
 	lock_page_cgroup(pc);
 	if (PageCgroupUsed(pc)) {
-		mem = pc->mem_cgroup;
-		if (mem && !css_tryget(&mem->css))
-			mem = NULL;
+		memcg = pc->mem_cgroup;
+		if (memcg && !css_tryget(&memcg->css))
+			memcg = NULL;
 	} else if (PageSwapCache(page)) {
 		ent.val = page_private(page);
 		id = lookup_swap_cgroup(ent);
 		rcu_read_lock();
-		mem = mem_cgroup_lookup(id);
-		if (mem && !css_tryget(&mem->css))
-			mem = NULL;
+		memcg = mem_cgroup_lookup(id);
+		if (memcg && !css_tryget(&memcg->css))
+			memcg = NULL;
 		rcu_read_unlock();
 	}
 	unlock_page_cgroup(pc);
-	return mem;
+	return memcg;
 }
 
-static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
+static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 				       struct page *page,
 				       unsigned int nr_pages,
 				       struct page_cgroup *pc,
@@ -2466,14 +2469,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	lock_page_cgroup(pc);
 	if (unlikely(PageCgroupUsed(pc))) {
 		unlock_page_cgroup(pc);
-		__mem_cgroup_cancel_charge(mem, nr_pages);
+		__mem_cgroup_cancel_charge(memcg, nr_pages);
 		return;
 	}
 	/*
 	 * we don't need page_cgroup_lock about tail pages, becase they are not
 	 * accessed by any other context at this point.
 	 */
-	pc->mem_cgroup = mem;
+	pc->mem_cgroup = memcg;
 	/*
 	 * We access a page_cgroup asynchronously without lock_page_cgroup().
 	 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
@@ -2496,14 +2499,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 		break;
 	}
 
-	mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages);
+	mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages);
 	unlock_page_cgroup(pc);
 	/*
 	 * "charge_statistics" updated event counter. Then, check it.
 	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
 	 * if they exceeds softlimit.
 	 */
-	memcg_check_events(mem, page);
+	memcg_check_events(memcg, page);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -2690,7 +2693,7 @@ out:
 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask, enum charge_type ctype)
 {
-	struct mem_cgroup *mem = NULL;
+	struct mem_cgroup *memcg = NULL;
 	unsigned int nr_pages = 1;
 	struct page_cgroup *pc;
 	bool oom = true;
@@ -2709,11 +2712,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 	pc = lookup_page_cgroup(page);
 	BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */
 
-	ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom);
-	if (ret || !mem)
+	ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
+	if (ret || !memcg)
 		return ret;
 
-	__mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype);
+	__mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype);
 	return 0;
 }
 
@@ -2742,7 +2745,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
 					enum charge_type ctype);
 
 static void
-__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
+__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg,
 					enum charge_type ctype)
 {
 	struct page_cgroup *pc = lookup_page_cgroup(page);
@@ -2752,7 +2755,7 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
 	 * LRU. Take care of it.
 	 */
 	mem_cgroup_lru_del_before_commit(page);
-	__mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
+	__mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
 	mem_cgroup_lru_add_after_commit(page);
 	return;
 }
@@ -2760,7 +2763,7 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask)
 {
-	struct mem_cgroup *mem = NULL;
+	struct mem_cgroup *memcg = NULL;
 	int ret;
 
 	if (mem_cgroup_disabled())
@@ -2772,8 +2775,8 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 		mm = &init_mm;
 
 	if (page_is_file_cache(page)) {
-		ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true);
-		if (ret || !mem)
+		ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true);
+		if (ret || !memcg)
 			return ret;
 
 		/*
@@ -2781,15 +2784,15 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 		 * put that would remove them from the LRU list, make
 		 * sure that they get relinked properly.
 		 */
-		__mem_cgroup_commit_charge_lrucare(page, mem,
+		__mem_cgroup_commit_charge_lrucare(page, memcg,
 					MEM_CGROUP_CHARGE_TYPE_CACHE);
 		return ret;
 	}
 	/* shmem */
 	if (PageSwapCache(page)) {
-		ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
+		ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
 		if (!ret)
-			__mem_cgroup_commit_charge_swapin(page, mem,
+			__mem_cgroup_commit_charge_swapin(page, memcg,
 					MEM_CGROUP_CHARGE_TYPE_SHMEM);
 	} else
 		ret = mem_cgroup_charge_common(page, mm, gfp_mask,
@@ -2808,7 +2811,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
 				 struct page *page,
 				 gfp_t mask, struct mem_cgroup **ptr)
 {
-	struct mem_cgroup *mem;
+	struct mem_cgroup *memcg;
 	int ret;
 
 	*ptr = NULL;
@@ -2826,12 +2829,12 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
 	 */
 	if (!PageSwapCache(page))
 		goto charge_cur_mm;
-	mem = try_get_mem_cgroup_from_page(page);
-	if (!mem)
+	memcg = try_get_mem_cgroup_from_page(page);
+	if (!memcg)
 		goto charge_cur_mm;
-	*ptr = mem;
+	*ptr = memcg;
 	ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true);
-	css_put(&mem->css);
+	css_put(&memcg->css);
 	return ret;
 charge_cur_mm:
 	if (unlikely(!mm))
@@ -2891,16 +2894,16 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
 					MEM_CGROUP_CHARGE_TYPE_MAPPED);
 }
 
-void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
+void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
 {
 	if (mem_cgroup_disabled())
 		return;
-	if (!mem)
+	if (!memcg)
 		return;
-	__mem_cgroup_cancel_charge(mem, 1);
+	__mem_cgroup_cancel_charge(memcg, 1);
 }
 
-static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
+static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
 				   unsigned int nr_pages,
 				   const enum charge_type ctype)
 {
@@ -2918,7 +2921,7 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
 	 * uncharges. Then, it's ok to ignore memcg's refcnt.
 	 */
 	if (!batch->memcg)
-		batch->memcg = mem;
+		batch->memcg = memcg;
 	/*
 	 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
 	 * In those cases, all pages freed continuously can be expected to be in
@@ -2938,7 +2941,7 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
 	 * merge a series of uncharges to an uncharge of res_counter.
 	 * If not, we uncharge res_counter ony by one.
 	 */
-	if (batch->memcg != mem)
+	if (batch->memcg != memcg)
 		goto direct_uncharge;
 	/* remember freed charge and uncharge it later */
 	batch->nr_pages++;
@@ -2946,11 +2949,11 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
 		batch->memsw_nr_pages++;
 	return;
 direct_uncharge:
-	res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE);
+	res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
 	if (uncharge_memsw)
-		res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE);
-	if (unlikely(batch->memcg != mem))
-		memcg_oom_recover(mem);
+		res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
+	if (unlikely(batch->memcg != memcg))
+		memcg_oom_recover(memcg);
 	return;
 }
 
@@ -2960,7 +2963,7 @@ direct_uncharge:
 static struct mem_cgroup *
 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 {
-	struct mem_cgroup *mem = NULL;
+	struct mem_cgroup *memcg = NULL;
 	unsigned int nr_pages = 1;
 	struct page_cgroup *pc;
 
@@ -2983,7 +2986,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 
 	lock_page_cgroup(pc);
 
-	mem = pc->mem_cgroup;
+	memcg = pc->mem_cgroup;
 
 	if (!PageCgroupUsed(pc))
 		goto unlock_out;
@@ -3006,7 +3009,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
-	mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages);
+	mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages);
 
 	ClearPageCgroupUsed(pc);
 	/*
@@ -3018,18 +3021,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 
 	unlock_page_cgroup(pc);
 	/*
-	 * even after unlock, we have mem->res.usage here and this memcg
+	 * even after unlock, we have memcg->res.usage here and this memcg
 	 * will never be freed.
 	 */
-	memcg_check_events(mem, page);
+	memcg_check_events(memcg, page);
 	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
-		mem_cgroup_swap_statistics(mem, true);
-		mem_cgroup_get(mem);
+		mem_cgroup_swap_statistics(memcg, true);
+		mem_cgroup_get(memcg);
 	}
-	if (!mem_cgroup_is_root(mem))
-		mem_cgroup_do_uncharge(mem, nr_pages, ctype);
+	if (!mem_cgroup_is_root(memcg))
+		mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
 
-	return mem;
+	return memcg;
 
 unlock_out:
 	unlock_page_cgroup(pc);
@@ -3219,7 +3222,7 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
 int mem_cgroup_prepare_migration(struct page *page,
 	struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask)
 {
-	struct mem_cgroup *mem = NULL;
+	struct mem_cgroup *memcg = NULL;
 	struct page_cgroup *pc;
 	enum charge_type ctype;
 	int ret = 0;
@@ -3233,8 +3236,8 @@ int mem_cgroup_prepare_migration(struct page *page,
 	pc = lookup_page_cgroup(page);
 	lock_page_cgroup(pc);
 	if (PageCgroupUsed(pc)) {
-		mem = pc->mem_cgroup;
-		css_get(&mem->css);
+		memcg = pc->mem_cgroup;
+		css_get(&memcg->css);
 		/*
 		 * At migrating an anonymous page, its mapcount goes down
 		 * to 0 and uncharge() will be called. But, even if it's fully
@@ -3272,12 +3275,12 @@ int mem_cgroup_prepare_migration(struct page *page,
 	 * If the page is not charged at this point,
 	 * we return here.
 	 */
-	if (!mem)
+	if (!memcg)
 		return 0;
 
-	*ptr = mem;
+	*ptr = memcg;
 	ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false);
-	css_put(&mem->css);/* drop extra refcnt */
+	css_put(&memcg->css);/* drop extra refcnt */
 	if (ret || *ptr == NULL) {
 		if (PageAnon(page)) {
 			lock_page_cgroup(pc);
@@ -3303,21 +3306,21 @@ int mem_cgroup_prepare_migration(struct page *page,
 		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
 	else
 		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
-	__mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
+	__mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
 	return ret;
 }
 
 /* remove redundant charge if migration failed*/
-void mem_cgroup_end_migration(struct mem_cgroup *mem,
+void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 	struct page *oldpage, struct page *newpage, bool migration_ok)
 {
 	struct page *used, *unused;
 	struct page_cgroup *pc;
 
-	if (!mem)
+	if (!memcg)
 		return;
 	/* blocks rmdir() */
-	cgroup_exclude_rmdir(&mem->css);
+	cgroup_exclude_rmdir(&memcg->css);
 	if (!migration_ok) {
 		used = oldpage;
 		unused = newpage;
@@ -3353,7 +3356,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
 	 * So, rmdir()->pre_destroy() can be called while we do this charge.
 	 * In that case, we need to call pre_destroy() again. check it here.
 	 */
-	cgroup_release_and_wakeup_rmdir(&mem->css);
+	cgroup_release_and_wakeup_rmdir(&memcg->css);
 }
 
 #ifdef CONFIG_DEBUG_VM
@@ -3432,7 +3435,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 		/*
 		 * Rather than hide all in some function, I do this in
 		 * open coded manner. You see what this really does.
-		 * We have to guarantee mem->res.limit < mem->memsw.limit.
+		 * We have to guarantee memcg->res.limit < memcg->memsw.limit.
 		 */
 		mutex_lock(&set_limit_mutex);
 		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
@@ -3494,7 +3497,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 		/*
 		 * Rather than hide all in some function, I do this in
 		 * open coded manner. You see what this really does.
-		 * We have to guarantee mem->res.limit < mem->memsw.limit.
+		 * We have to guarantee memcg->res.limit < memcg->memsw.limit.
 		 */
 		mutex_lock(&set_limit_mutex);
 		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
@@ -3632,7 +3635,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
  * This routine traverse page_cgroup in given list and drop them all.
  * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
  */
-static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
+static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
 				int node, int zid, enum lru_list lru)
 {
 	struct zone *zone;
@@ -3643,7 +3646,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
 	int ret = 0;
 
 	zone = &NODE_DATA(node)->node_zones[zid];
-	mz = mem_cgroup_zoneinfo(mem, node, zid);
+	mz = mem_cgroup_zoneinfo(memcg, node, zid);
 	list = &mz->lists[lru];
 
 	loop = MEM_CGROUP_ZSTAT(mz, lru);
@@ -3670,7 +3673,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
 
 		page = lookup_cgroup_page(pc);
 
-		ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL);
+		ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL);
 		if (ret == -ENOMEM)
 			break;
 
@@ -3691,14 +3694,14 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
  * make mem_cgroup's charge to be 0 if there is no task.
  * This enables deleting this mem_cgroup.
  */
-static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
+static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
 {
 	int ret;
 	int node, zid, shrink;
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-	struct cgroup *cgrp = mem->css.cgroup;
+	struct cgroup *cgrp = memcg->css.cgroup;
 
-	css_get(&mem->css);
+	css_get(&memcg->css);
 
 	shrink = 0;
 	/* should free all ? */
@@ -3714,14 +3717,14 @@ move_account:
 			goto out;
 		/* This is for making all *used* pages to be on LRU. */
 		lru_add_drain_all();
-		drain_all_stock_sync(mem);
+		drain_all_stock_sync(memcg);
 		ret = 0;
-		mem_cgroup_start_move(mem);
+		mem_cgroup_start_move(memcg);
 		for_each_node_state(node, N_HIGH_MEMORY) {
 			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
 				enum lru_list l;
 				for_each_lru(l) {
-					ret = mem_cgroup_force_empty_list(mem,
+					ret = mem_cgroup_force_empty_list(memcg,
 							node, zid, l);
 					if (ret)
 						break;
@@ -3730,16 +3733,16 @@ move_account:
 			if (ret)
 				break;
 		}
-		mem_cgroup_end_move(mem);
-		memcg_oom_recover(mem);
+		mem_cgroup_end_move(memcg);
+		memcg_oom_recover(memcg);
 		/* it seems parent cgroup doesn't have enough mem */
 		if (ret == -ENOMEM)
 			goto try_to_free;
 		cond_resched();
 	/* "ret" should also be checked to ensure all lists are empty. */
-	} while (mem->res.usage > 0 || ret);
+	} while (memcg->res.usage > 0 || ret);
 out:
-	css_put(&mem->css);
+	css_put(&memcg->css);
 	return ret;
 
 try_to_free:
@@ -3752,14 +3755,14 @@ try_to_free:
 	lru_add_drain_all();
 	/* try to free all pages in this cgroup */
 	shrink = 1;
-	while (nr_retries && mem->res.usage > 0) {
+	while (nr_retries && memcg->res.usage > 0) {
 		int progress;
 
 		if (signal_pending(current)) {
 			ret = -EINTR;
 			goto out;
 		}
-		progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
+		progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
 						false);
 		if (!progress) {
 			nr_retries--;
@@ -3788,12 +3791,12 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
 					u64 val)
 {
 	int retval = 0;
-	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 	struct cgroup *parent = cont->parent;
-	struct mem_cgroup *parent_mem = NULL;
+	struct mem_cgroup *parent_memcg = NULL;
 
 	if (parent)
-		parent_mem = mem_cgroup_from_cont(parent);
+		parent_memcg = mem_cgroup_from_cont(parent);
 
 	cgroup_lock();
 	/*
@@ -3804,10 +3807,10 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
 	 * For the root cgroup, parent_mem is NULL, we allow value to be
 	 * set if there are no children.
 	 */
-	if ((!parent_mem || !parent_mem->use_hierarchy) &&
+	if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
 				(val == 1 || val == 0)) {
 		if (list_empty(&cont->children))
-			mem->use_hierarchy = val;
+			memcg->use_hierarchy = val;
 		else
 			retval = -EBUSY;
 	} else
@@ -3818,14 +3821,14 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
 }
 
 
-static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem,
+static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
 					       enum mem_cgroup_stat_index idx)
 {
 	struct mem_cgroup *iter;
 	long val = 0;
 
 	/* Per-cpu values can be negative, use a signed accumulator */
-	for_each_mem_cgroup_tree(iter, mem)
+	for_each_mem_cgroup_tree(iter, memcg)
 		val += mem_cgroup_read_stat(iter, idx);
 
 	if (val < 0) /* race ? */
@@ -3833,29 +3836,29 @@ static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem,
 	return val;
 }
 
-static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
+static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 {
 	u64 val;
 
-	if (!mem_cgroup_is_root(mem)) {
+	if (!mem_cgroup_is_root(memcg)) {
 		if (!swap)
-			return res_counter_read_u64(&mem->res, RES_USAGE);
+			return res_counter_read_u64(&memcg->res, RES_USAGE);
 		else
-			return res_counter_read_u64(&mem->memsw, RES_USAGE);
+			return res_counter_read_u64(&memcg->memsw, RES_USAGE);
 	}
 
-	val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE);
-	val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS);
+	val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
+	val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
 
 	if (swap)
-		val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
+		val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
 
 	return val << PAGE_SHIFT;
 }
 
 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
 {
-	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 	u64 val;
 	int type, name;
 
@@ -3864,15 +3867,15 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
 	switch (type) {
 	case _MEM:
 		if (name == RES_USAGE)
-			val = mem_cgroup_usage(mem, false);
+			val = mem_cgroup_usage(memcg, false);
 		else
-			val = res_counter_read_u64(&mem->res, name);
+			val = res_counter_read_u64(&memcg->res, name);
 		break;
 	case _MEMSWAP:
 		if (name == RES_USAGE)
-			val = mem_cgroup_usage(mem, true);
+			val = mem_cgroup_usage(memcg, true);
 		else
-			val = res_counter_read_u64(&mem->memsw, name);
+			val = res_counter_read_u64(&memcg->memsw, name);
 		break;
 	default:
 		BUG();
@@ -3960,24 +3963,24 @@ out:
 
 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 {
-	struct mem_cgroup *mem;
+	struct mem_cgroup *memcg;
 	int type, name;
 
-	mem = mem_cgroup_from_cont(cont);
+	memcg = mem_cgroup_from_cont(cont);
 	type = MEMFILE_TYPE(event);
 	name = MEMFILE_ATTR(event);
 	switch (name) {
 	case RES_MAX_USAGE:
 		if (type == _MEM)
-			res_counter_reset_max(&mem->res);
+			res_counter_reset_max(&memcg->res);
 		else
-			res_counter_reset_max(&mem->memsw);
+			res_counter_reset_max(&memcg->memsw);
 		break;
 	case RES_FAILCNT:
 		if (type == _MEM)
-			res_counter_reset_failcnt(&mem->res);
+			res_counter_reset_failcnt(&memcg->res);
 		else
-			res_counter_reset_failcnt(&mem->memsw);
+			res_counter_reset_failcnt(&memcg->memsw);
 		break;
 	}
 
@@ -3994,7 +3997,7 @@ static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
 static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
 					struct cftype *cft, u64 val)
 {
-	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 
 	if (val >= (1 << NR_MOVE_TYPE))
 		return -EINVAL;
@@ -4004,7 +4007,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
 	 * inconsistent.
 	 */
 	cgroup_lock();
-	mem->move_charge_at_immigrate = val;
+	memcg->move_charge_at_immigrate = val;
 	cgroup_unlock();
 
 	return 0;
@@ -4061,49 +4064,49 @@ struct {
 
 
 static void
-mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
+mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
 {
 	s64 val;
 
 	/* per cpu stat */
-	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
+	val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE);
 	s->stat[MCS_CACHE] += val * PAGE_SIZE;
-	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
+	val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS);
 	s->stat[MCS_RSS] += val * PAGE_SIZE;
-	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
+	val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
 	s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
-	val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN);
+	val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN);
 	s->stat[MCS_PGPGIN] += val;
-	val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT);
+	val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT);
 	s->stat[MCS_PGPGOUT] += val;
 	if (do_swap_account) {
-		val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
+		val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
 		s->stat[MCS_SWAP] += val * PAGE_SIZE;
 	}
-	val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT);
+	val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT);
 	s->stat[MCS_PGFAULT] += val;
-	val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT);
+	val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT);
 	s->stat[MCS_PGMAJFAULT] += val;
 
 	/* per zone stat */
-	val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON));
+	val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
 	s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
-	val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON));
+	val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
 	s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
-	val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE));
+	val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
 	s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
-	val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE));
+	val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
 	s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
-	val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE));
+	val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
 	s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
 }
 
 static void
-mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
+mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
 {
 	struct mem_cgroup *iter;
 
-	for_each_mem_cgroup_tree(iter, mem)
+	for_each_mem_cgroup_tree(iter, memcg)
 		mem_cgroup_get_local_stat(iter, s);
 }
 
@@ -4327,20 +4330,20 @@ static int compare_thresholds(const void *a, const void *b)
 	return _a->threshold - _b->threshold;
 }
 
-static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem)
+static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup_eventfd_list *ev;
 
-	list_for_each_entry(ev, &mem->oom_notify, list)
+	list_for_each_entry(ev, &memcg->oom_notify, list)
 		eventfd_signal(ev->eventfd, 1);
 	return 0;
 }
 
-static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
+static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter;
 
-	for_each_mem_cgroup_tree(iter, mem)
+	for_each_mem_cgroup_tree(iter, memcg)
 		mem_cgroup_oom_notify_cb(iter);
 }
 
@@ -4530,7 +4533,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
 static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
 	struct cftype *cft, struct eventfd_ctx *eventfd)
 {
-	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 	struct mem_cgroup_eventfd_list *ev, *tmp;
 	int type = MEMFILE_TYPE(cft->private);
 
@@ -4538,7 +4541,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
 
 	spin_lock(&memcg_oom_lock);
 
-	list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
+	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
 		if (ev->eventfd == eventfd) {
 			list_del(&ev->list);
 			kfree(ev);
@@ -4551,11 +4554,11 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
 static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
 	struct cftype *cft,  struct cgroup_map_cb *cb)
 {
-	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 
-	cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
+	cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
 
-	if (atomic_read(&mem->under_oom))
+	if (atomic_read(&memcg->under_oom))
 		cb->fill(cb, "under_oom", 1);
 	else
 		cb->fill(cb, "under_oom", 0);
@@ -4565,7 +4568,7 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
 static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
 	struct cftype *cft, u64 val)
 {
-	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 	struct mem_cgroup *parent;
 
 	/* cannot set to root cgroup and only 0 and 1 are allowed */
@@ -4577,13 +4580,13 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
 	cgroup_lock();
 	/* oom-kill-disable is a flag for subhierarchy. */
 	if ((parent->use_hierarchy) ||
-	    (mem->use_hierarchy && !list_empty(&cgrp->children))) {
+	    (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
 		cgroup_unlock();
 		return -EINVAL;
 	}
-	mem->oom_kill_disable = val;
+	memcg->oom_kill_disable = val;
 	if (!val)
-		memcg_oom_recover(mem);
+		memcg_oom_recover(memcg);
 	cgroup_unlock();
 	return 0;
 }
@@ -4719,7 +4722,7 @@ static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
 }
 #endif
 
-static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
+static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 {
 	struct mem_cgroup_per_node *pn;
 	struct mem_cgroup_per_zone *mz;
@@ -4739,21 +4742,21 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 	if (!pn)
 		return 1;
 
-	mem->info.nodeinfo[node] = pn;
+	memcg->info.nodeinfo[node] = pn;
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
 		for_each_lru(l)
 			INIT_LIST_HEAD(&mz->lists[l]);
 		mz->usage_in_excess = 0;
 		mz->on_tree = false;
-		mz->mem = mem;
+		mz->mem = memcg;
 	}
 	return 0;
 }
 
-static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
+static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 {
-	kfree(mem->info.nodeinfo[node]);
+	kfree(memcg->info.nodeinfo[node]);
 }
 
 static struct mem_cgroup *mem_cgroup_alloc(void)
@@ -4795,51 +4798,51 @@ out_free:
  * Removal of cgroup itself succeeds regardless of refs from swap.
  */
 
-static void __mem_cgroup_free(struct mem_cgroup *mem)
+static void __mem_cgroup_free(struct mem_cgroup *memcg)
 {
 	int node;
 
-	mem_cgroup_remove_from_trees(mem);
-	free_css_id(&mem_cgroup_subsys, &mem->css);
+	mem_cgroup_remove_from_trees(memcg);
+	free_css_id(&mem_cgroup_subsys, &memcg->css);
 
 	for_each_node_state(node, N_POSSIBLE)
-		free_mem_cgroup_per_zone_info(mem, node);
+		free_mem_cgroup_per_zone_info(memcg, node);
 
-	free_percpu(mem->stat);
+	free_percpu(memcg->stat);
 	if (sizeof(struct mem_cgroup) < PAGE_SIZE)
-		kfree(mem);
+		kfree(memcg);
 	else
-		vfree(mem);
+		vfree(memcg);
 }
 
-static void mem_cgroup_get(struct mem_cgroup *mem)
+static void mem_cgroup_get(struct mem_cgroup *memcg)
 {
-	atomic_inc(&mem->refcnt);
+	atomic_inc(&memcg->refcnt);
 }
 
-static void __mem_cgroup_put(struct mem_cgroup *mem, int count)
+static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
 {
-	if (atomic_sub_and_test(count, &mem->refcnt)) {
-		struct mem_cgroup *parent = parent_mem_cgroup(mem);
-		__mem_cgroup_free(mem);
+	if (atomic_sub_and_test(count, &memcg->refcnt)) {
+		struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+		__mem_cgroup_free(memcg);
 		if (parent)
 			mem_cgroup_put(parent);
 	}
 }
 
-static void mem_cgroup_put(struct mem_cgroup *mem)
+static void mem_cgroup_put(struct mem_cgroup *memcg)
 {
-	__mem_cgroup_put(mem, 1);
+	__mem_cgroup_put(memcg, 1);
 }
 
 /*
  * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
  */
-static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
+static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
 {
-	if (!mem->res.parent)
+	if (!memcg->res.parent)
 		return NULL;
-	return mem_cgroup_from_res_counter(mem->res.parent, res);
+	return mem_cgroup_from_res_counter(memcg->res.parent, res);
 }
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -4882,16 +4885,16 @@ static int mem_cgroup_soft_limit_tree_init(void)
 static struct cgroup_subsys_state * __ref
 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 {
-	struct mem_cgroup *mem, *parent;
+	struct mem_cgroup *memcg, *parent;
 	long error = -ENOMEM;
 	int node;
 
-	mem = mem_cgroup_alloc();
-	if (!mem)
+	memcg = mem_cgroup_alloc();
+	if (!memcg)
 		return ERR_PTR(error);
 
 	for_each_node_state(node, N_POSSIBLE)
-		if (alloc_mem_cgroup_per_zone_info(mem, node))
+		if (alloc_mem_cgroup_per_zone_info(memcg, node))
 			goto free_out;
 
 	/* root ? */
@@ -4899,7 +4902,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 		int cpu;
 		enable_swap_cgroup();
 		parent = NULL;
-		root_mem_cgroup = mem;
+		root_mem_cgroup = memcg;
 		if (mem_cgroup_soft_limit_tree_init())
 			goto free_out;
 		for_each_possible_cpu(cpu) {
@@ -4910,13 +4913,13 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 		hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
 	} else {
 		parent = mem_cgroup_from_cont(cont->parent);
-		mem->use_hierarchy = parent->use_hierarchy;
-		mem->oom_kill_disable = parent->oom_kill_disable;
+		memcg->use_hierarchy = parent->use_hierarchy;
+		memcg->oom_kill_disable = parent->oom_kill_disable;
 	}
 
 	if (parent && parent->use_hierarchy) {
-		res_counter_init(&mem->res, &parent->res);
-		res_counter_init(&mem->memsw, &parent->memsw);
+		res_counter_init(&memcg->res, &parent->res);
+		res_counter_init(&memcg->memsw, &parent->memsw);
 		/*
 		 * We increment refcnt of the parent to ensure that we can
 		 * safely access it on res_counter_charge/uncharge.
@@ -4925,21 +4928,21 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 		 */
 		mem_cgroup_get(parent);
 	} else {
-		res_counter_init(&mem->res, NULL);
-		res_counter_init(&mem->memsw, NULL);
+		res_counter_init(&memcg->res, NULL);
+		res_counter_init(&memcg->memsw, NULL);
 	}
-	mem->last_scanned_child = 0;
-	mem->last_scanned_node = MAX_NUMNODES;
-	INIT_LIST_HEAD(&mem->oom_notify);
+	memcg->last_scanned_child = 0;
+	memcg->last_scanned_node = MAX_NUMNODES;
+	INIT_LIST_HEAD(&memcg->oom_notify);
 
 	if (parent)
-		mem->swappiness = mem_cgroup_swappiness(parent);
-	atomic_set(&mem->refcnt, 1);
-	mem->move_charge_at_immigrate = 0;
-	mutex_init(&mem->thresholds_lock);
-	return &mem->css;
+		memcg->swappiness = mem_cgroup_swappiness(parent);
+	atomic_set(&memcg->refcnt, 1);
+	memcg->move_charge_at_immigrate = 0;
+	mutex_init(&memcg->thresholds_lock);
+	return &memcg->css;
 free_out:
-	__mem_cgroup_free(mem);
+	__mem_cgroup_free(memcg);
 	root_mem_cgroup = NULL;
 	return ERR_PTR(error);
 }
@@ -4947,17 +4950,17 @@ free_out:
 static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
 					struct cgroup *cont)
 {
-	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 
-	return mem_cgroup_force_empty(mem, false);
+	return mem_cgroup_force_empty(memcg, false);
 }
 
 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
 				struct cgroup *cont)
 {
-	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 
-	mem_cgroup_put(mem);
+	mem_cgroup_put(memcg);
 }
 
 static int mem_cgroup_populate(struct cgroup_subsys *ss,
@@ -4980,9 +4983,9 @@ static int mem_cgroup_do_precharge(unsigned long count)
 {
 	int ret = 0;
 	int batch_count = PRECHARGE_COUNT_AT_ONCE;
-	struct mem_cgroup *mem = mc.to;
+	struct mem_cgroup *memcg = mc.to;
 
-	if (mem_cgroup_is_root(mem)) {
+	if (mem_cgroup_is_root(memcg)) {
 		mc.precharge += count;
 		/* we don't need css_get for root */
 		return ret;
@@ -4991,16 +4994,16 @@ static int mem_cgroup_do_precharge(unsigned long count)
 	if (count > 1) {
 		struct res_counter *dummy;
 		/*
-		 * "mem" cannot be under rmdir() because we've already checked
+		 * "memcg" cannot be under rmdir() because we've already checked
 		 * by cgroup_lock_live_cgroup() that it is not removed and we
 		 * are still under the same cgroup_mutex. So we can postpone
 		 * css_get().
 		 */
-		if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
+		if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
 			goto one_by_one;
-		if (do_swap_account && res_counter_charge(&mem->memsw,
+		if (do_swap_account && res_counter_charge(&memcg->memsw,
 						PAGE_SIZE * count, &dummy)) {
-			res_counter_uncharge(&mem->res, PAGE_SIZE * count);
+			res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
 			goto one_by_one;
 		}
 		mc.precharge += count;
@@ -5017,8 +5020,9 @@ one_by_one:
 			batch_count = PRECHARGE_COUNT_AT_ONCE;
 			cond_resched();
 		}
-		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false);
-		if (ret || !mem)
+		ret = __mem_cgroup_try_charge(NULL,
+					GFP_KERNEL, 1, &memcg, false);
+		if (ret || !memcg)
 			/* mem_cgroup_clear_mc() will do uncharge later */
 			return -ENOMEM;
 		mc.precharge++;
@@ -5292,13 +5296,13 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
 				struct task_struct *p)
 {
 	int ret = 0;
-	struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
 
-	if (mem->move_charge_at_immigrate) {
+	if (memcg->move_charge_at_immigrate) {
 		struct mm_struct *mm;
 		struct mem_cgroup *from = mem_cgroup_from_task(p);
 
-		VM_BUG_ON(from == mem);
+		VM_BUG_ON(from == memcg);
 
 		mm = get_task_mm(p);
 		if (!mm)
@@ -5313,7 +5317,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
 			mem_cgroup_start_move(from);
 			spin_lock(&mc.lock);
 			mc.from = from;
-			mc.to = mem;
+			mc.to = memcg;
 			spin_unlock(&mc.lock);
 			/* We set mc.moving_task later */
 
-- 
cgit v0.10.2


From 715a5ee82ab3c07430f748630044354132add5ad Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Wed, 2 Nov 2011 13:38:18 -0700
Subject: memcg: fix oom schedule_timeout()

Before calling schedule_timeout(), task state should be changed.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9e38abd..c02d870 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1898,7 +1898,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
 	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
 		return false;
 	/* Give chance to dying process */
-	schedule_timeout(1);
+	schedule_timeout_uninterruptible(1);
 	return true;
 }
 
-- 
cgit v0.10.2


From 0a619e58703b86d53d07e938eade9a91a4a863c6 Mon Sep 17 00:00:00 2001
From: Igor Mammedov <imammedo@redhat.com>
Date: Wed, 2 Nov 2011 13:38:21 -0700
Subject: memcg: do not expose uninitialized mem_cgroup_per_node to world

If somebody is touching data too early, it might be easier to diagnose a
problem when dereferencing NULL at mem->info.nodeinfo[node] than trying to
understand why mem_cgroup_per_zone is [un|partly]initialized.

Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c02d870..f6c4beb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4742,7 +4742,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 	if (!pn)
 		return 1;
 
-	memcg->info.nodeinfo[node] = pn;
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
 		for_each_lru(l)
@@ -4751,6 +4750,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 		mz->on_tree = false;
 		mz->mem = memcg;
 	}
+	memcg->info.nodeinfo[node] = pn;
 	return 0;
 }
 
-- 
cgit v0.10.2


From 9b272977e3b99a8699361d214b51f98c8a9e0e7b Mon Sep 17 00:00:00 2001
From: Johannes Weiner <jweiner@redhat.com>
Date: Wed, 2 Nov 2011 13:38:23 -0700
Subject: memcg: skip scanning active lists based on individual size

Reclaim decides to skip scanning an active list when the corresponding
inactive list is above a certain size in comparison to leave the assumed
working set alone while there are still enough reclaim candidates around.

The memcg implementation of comparing those lists instead reports whether
the whole memcg is low on the requested type of inactive pages,
considering all nodes and zones.

This can lead to an oversized active list not being scanned because of the
state of the other lists in the memcg, as well as an active list being
scanned while its corresponding inactive list has enough pages.

Not only is this wrong, it's also a scalability hazard, because the global
memory state over all nodes and zones has to be gathered for each memcg
and zone scanned.

Make these calculations purely based on the size of the two LRU lists
that are actually affected by the outcome of the decision.

Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <bsingharora@gmail.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Reviewed-by: Ying Han <yinghan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 06eb6d9..cc0ebc5 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -418,7 +418,6 @@ total_unevictable	- sum of all children's "unevictable"
 
 # The following additional stats are dependent on CONFIG_DEBUG_VM.
 
-inactive_ratio		- VM internal parameter. (see mm/page_alloc.c)
 recent_rotated_anon	- VM internal parameter. (see mm/vmscan.c)
 recent_rotated_file	- VM internal parameter. (see mm/vmscan.c)
 recent_scanned_anon	- VM internal parameter. (see mm/vmscan.c)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 05206aa..b87068a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -106,8 +106,10 @@ extern void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 /*
  * For memory reclaim.
  */
-int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
-int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
+int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg,
+				    struct zone *zone);
+int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg,
+				    struct zone *zone);
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
 unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
 					int nid, int zid, unsigned int lrumask);
@@ -295,13 +297,13 @@ static inline bool mem_cgroup_disabled(void)
 }
 
 static inline int
-mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
+mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
 {
 	return 1;
 }
 
 static inline int
-mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
+mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
 {
 	return 1;
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f6c4beb..ce7b35d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1104,15 +1104,19 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
 	return ret;
 }
 
-static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
+int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
 {
-	unsigned long active;
+	unsigned long inactive_ratio;
+	int nid = zone_to_nid(zone);
+	int zid = zone_idx(zone);
 	unsigned long inactive;
+	unsigned long active;
 	unsigned long gb;
-	unsigned long inactive_ratio;
 
-	inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
-	active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
+	inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
+						BIT(LRU_INACTIVE_ANON));
+	active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
+					      BIT(LRU_ACTIVE_ANON));
 
 	gb = (inactive + active) >> (30 - PAGE_SHIFT);
 	if (gb)
@@ -1120,39 +1124,20 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_
 	else
 		inactive_ratio = 1;
 
-	if (present_pages) {
-		present_pages[0] = inactive;
-		present_pages[1] = active;
-	}
-
-	return inactive_ratio;
+	return inactive * inactive_ratio < active;
 }
 
-int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
-{
-	unsigned long active;
-	unsigned long inactive;
-	unsigned long present_pages[2];
-	unsigned long inactive_ratio;
-
-	inactive_ratio = calc_inactive_ratio(memcg, present_pages);
-
-	inactive = present_pages[0];
-	active = present_pages[1];
-
-	if (inactive * inactive_ratio < active)
-		return 1;
-
-	return 0;
-}
-
-int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
+int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
 {
 	unsigned long active;
 	unsigned long inactive;
+	int zid = zone_idx(zone);
+	int nid = zone_to_nid(zone);
 
-	inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
-	active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
+	inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
+						BIT(LRU_INACTIVE_FILE));
+	active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
+					      BIT(LRU_ACTIVE_FILE));
 
 	return (active > inactive);
 }
@@ -4192,8 +4177,6 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
 	}
 
 #ifdef CONFIG_DEBUG_VM
-	cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
-
 	{
 		int nid, zid;
 		struct mem_cgroup_per_zone *mz;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a90c603..132d1dd 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1767,7 +1767,7 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
 	if (scanning_global_lru(sc))
 		low = inactive_anon_is_low_global(zone);
 	else
-		low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
+		low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone);
 	return low;
 }
 #else
@@ -1810,7 +1810,7 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
 	if (scanning_global_lru(sc))
 		low = inactive_file_is_low_global(zone);
 	else
-		low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup);
+		low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone);
 	return low;
 }
 
-- 
cgit v0.10.2


From a61ed3cec51cfd4877855c24890ab8d3e2b143e3 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <jweiner@redhat.com>
Date: Wed, 2 Nov 2011 13:38:29 -0700
Subject: memcg: close race between charge and putback

There is a potential race between a thread charging a page and another
thread putting it back to the LRU list:

  charge:                         putback:
  SetPageCgroupUsed               SetPageLRU
  PageLRU && add to memcg LRU     PageCgroupUsed && add to memcg LRU

The order of setting one flag and checking the other is crucial, otherwise
the charge may observe !PageLRU while the putback observes !PageCgroupUsed
and the page is not linked to the memcg LRU at all.

Global memory pressure may fix this by trying to isolate and putback the
page for reclaim, where that putback would link it to the memcg LRU again.
 Without that, the memory cgroup is undeletable due to a charge whose
physical page can not be found and moved out.

Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Cc: Ying Han <yinghan@google.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ce7b35d..01e0c72 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -993,6 +993,16 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
 		return;
 	pc = lookup_page_cgroup(page);
 	VM_BUG_ON(PageCgroupAcctLRU(pc));
+	/*
+	 * putback:				charge:
+	 * SetPageLRU				SetPageCgroupUsed
+	 * smp_mb				smp_mb
+	 * PageCgroupUsed && add to memcg LRU	PageLRU && add to memcg LRU
+	 *
+	 * Ensure that one of the two sides adds the page to the memcg
+	 * LRU during a race.
+	 */
+	smp_mb();
 	if (!PageCgroupUsed(pc))
 		return;
 	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
@@ -1044,7 +1054,16 @@ static void mem_cgroup_lru_add_after_commit(struct page *page)
 	unsigned long flags;
 	struct zone *zone = page_zone(page);
 	struct page_cgroup *pc = lookup_page_cgroup(page);
-
+	/*
+	 * putback:				charge:
+	 * SetPageLRU				SetPageCgroupUsed
+	 * smp_mb				smp_mb
+	 * PageCgroupUsed && add to memcg LRU	PageLRU && add to memcg LRU
+	 *
+	 * Ensure that one of the two sides adds the page to the memcg
+	 * LRU during a race.
+	 */
+	smp_mb();
 	/* taking care of that the page is added to LRU while we commit it */
 	if (likely(!PageLRU(page)))
 		return;
-- 
cgit v0.10.2


From 4799401fef9d5951b2da384c5eb08034c48e08a0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 2 Nov 2011 13:38:33 -0700
Subject: memcg: Fix race condition in memcg_check_events() with this_cpu usage

Various code in memcontrol.c () calls this_cpu_read() on the calculations
to be done from two different percpu variables, or does an open-coded
read-modify-write on a single percpu variable.

Disable preemption throughout these operations so that the writes go to
the correct palces.

[hannes@cmpxchg.org: added this_cpu to __this_cpu conversion]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Greg Thelen <gthelen@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 01e0c72..7af1d5e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -686,8 +686,8 @@ static bool __memcg_event_check(struct mem_cgroup *memcg, int target)
 {
 	unsigned long val, next;
 
-	val = this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
-	next = this_cpu_read(memcg->stat->targets[target]);
+	val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
+	next = __this_cpu_read(memcg->stat->targets[target]);
 	/* from time_after() in jiffies.h */
 	return ((long)next - (long)val < 0);
 }
@@ -696,7 +696,7 @@ static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target)
 {
 	unsigned long val, next;
 
-	val = this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
+	val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
 
 	switch (target) {
 	case MEM_CGROUP_TARGET_THRESH:
@@ -712,7 +712,7 @@ static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target)
 		return;
 	}
 
-	this_cpu_write(memcg->stat->targets[target], next);
+	__this_cpu_write(memcg->stat->targets[target], next);
 }
 
 /*
@@ -721,6 +721,7 @@ static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target)
  */
 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 {
+	preempt_disable();
 	/* threshold event is triggered in finer grain than soft limit */
 	if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_THRESH))) {
 		mem_cgroup_threshold(memcg);
@@ -740,6 +741,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 		}
 #endif
 	}
+	preempt_enable();
 }
 
 static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
-- 
cgit v0.10.2


From 61600f578fbd2e8ad0c90bddb9c729e7628d3813 Mon Sep 17 00:00:00 2001
From: H Hartley Sweeten <hartleys@visionengravers.com>
Date: Wed, 2 Nov 2011 13:38:36 -0700
Subject: mm/page_cgroup.c: quiet sparse noise

warning: symbol 'swap_cgroup_ctrl' was not declared. Should it be static?

Signed-off-by: H Hartley Sweeten <hsweeten@visionengravers.com>
Cc: Paul Menage <paul@paulmenage.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Balbir Singh <bsingharora@gmail.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 3749ae1..2d123f9 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -360,7 +360,7 @@ struct swap_cgroup_ctrl {
 	spinlock_t	lock;
 };
 
-struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
+static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
 
 struct swap_cgroup {
 	unsigned short		id;
-- 
cgit v0.10.2


From 89e8a244b97e48f1f30e898b6f32acca477f2a13 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Wed, 2 Nov 2011 13:38:39 -0700
Subject: cpusets: avoid looping when storing to mems_allowed if one node
 remains set

{get,put}_mems_allowed() exist so that general kernel code may locklessly
access a task's set of allowable nodes without having the chance that a
concurrent write will cause the nodemask to be empty on configurations
where MAX_NUMNODES > BITS_PER_LONG.

This could incur a significant delay, however, especially in low memory
conditions because the page allocator is blocking and reclaim requires
get_mems_allowed() itself.  It is not atypical to see writes to
cpuset.mems take over 2 seconds to complete, for example.  In low memory
conditions, this is problematic because it's one of the most imporant
times to change cpuset.mems in the first place!

The only way a task's set of allowable nodes may change is through cpusets
by writing to cpuset.mems and when attaching a task to a generic code is
not reading the nodemask with get_mems_allowed() at the same time, and
then clearing all the old nodes.  This prevents the possibility that a
reader will see an empty nodemask at the same time the writer is storing a
new nodemask.

If at least one node remains unchanged, though, it's possible to simply
set all new nodes and then clear all the old nodes.  Changing a task's
nodemask is protected by cgroup_mutex so it's guaranteed that two threads
are not changing the same task's nodemask at the same time, so the
nodemask is guaranteed to be stored before another thread changes it and
determines whether a node remains set or not.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Miao Xie <miaox@cn.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Paul Menage <paul@paulmenage.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 10131fd..ed0ff44 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -949,6 +949,8 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 static void cpuset_change_task_nodemask(struct task_struct *tsk,
 					nodemask_t *newmems)
 {
+	bool masks_disjoint = !nodes_intersects(*newmems, tsk->mems_allowed);
+
 repeat:
 	/*
 	 * Allow tasks that have access to memory reserves because they have
@@ -963,7 +965,6 @@ repeat:
 	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
 	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
 
-
 	/*
 	 * ensure checking ->mems_allowed_change_disable after setting all new
 	 * allowed nodes.
@@ -980,9 +981,11 @@ repeat:
 
 	/*
 	 * Allocation of memory is very fast, we needn't sleep when waiting
-	 * for the read-side.
+	 * for the read-side.  No wait is necessary, however, if at least one
+	 * node remains unchanged.
 	 */
-	while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
+	while (masks_disjoint &&
+			ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
 		task_unlock(tsk);
 		if (!task_curr(tsk))
 			yield();
-- 
cgit v0.10.2


From 887df07891de0435c25cffb92268fea2c621f99c Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Wed, 2 Nov 2011 13:38:42 -0700
Subject: procfs: report EISDIR when reading sysctl dirs in proc

On reading sysctl dirs we should return -EISDIR instead of -EINVAL.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 1a77dbe..dacd840 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -370,6 +370,7 @@ static const struct file_operations proc_sys_file_operations = {
 };
 
 static const struct file_operations proc_sys_dir_file_operations = {
+	.read		= generic_read_dir,
 	.readdir	= proc_sys_readdir,
 	.llseek		= generic_file_llseek,
 };
-- 
cgit v0.10.2


From aa6afca5bcaba8101f3ea09d5c3e4100b2b9f0e5 Mon Sep 17 00:00:00 2001
From: Vasiliy Kulikov <segoon@openwall.com>
Date: Wed, 2 Nov 2011 13:38:44 -0700
Subject: proc: fix races against execve() of /proc/PID/fd**

fd* files are restricted to the task's owner, and other users may not get
direct access to them.  But one may open any of these files and run any
setuid program, keeping opened file descriptors.  As there are permission
checks on open(), but not on readdir() and read(), operations on the kept
file descriptors will not be checked.  It makes it possible to violate
procfs permission model.

Reading fdinfo/* may disclosure current fds' position and flags, reading
directory contents of fdinfo/ and fd/ may disclosure the number of opened
files by the target task.  This information is not sensible per se, but it
can reveal some private information (like length of a password stored in a
file) under certain conditions.

Used existing (un)lock_trace functions to check for ptrace_may_access(),
but instead of using EPERM return code from it use EACCES to be consistent
with existing proc_pid_follow_link()/proc_pid_readlink() return code.  If
they differ, attacker can guess what fds exist by analyzing stat() return
code.  Patched handlers: stat() for fd/*, stat() and read() for fdindo/*,
readdir() and lookup() for fd/ and fdinfo/.

Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: <stable@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8f0087e..d4f4913 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1652,12 +1652,46 @@ out:
 	return error;
 }
 
+static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *task = get_proc_task(inode);
+	int rc;
+
+	if (task == NULL)
+		return -ESRCH;
+
+	rc = -EACCES;
+	if (lock_trace(task))
+		goto out_task;
+
+	generic_fillattr(inode, stat);
+	unlock_trace(task);
+	rc = 0;
+out_task:
+	put_task_struct(task);
+	return rc;
+}
+
 static const struct inode_operations proc_pid_link_inode_operations = {
 	.readlink	= proc_pid_readlink,
 	.follow_link	= proc_pid_follow_link,
 	.setattr	= proc_setattr,
 };
 
+static const struct inode_operations proc_fdinfo_link_inode_operations = {
+	.setattr	= proc_setattr,
+	.getattr	= proc_pid_fd_link_getattr,
+};
+
+static const struct inode_operations proc_fd_link_inode_operations = {
+	.readlink	= proc_pid_readlink,
+	.follow_link	= proc_pid_follow_link,
+	.setattr	= proc_setattr,
+	.getattr	= proc_pid_fd_link_getattr,
+};
+
 
 /* building an inode */
 
@@ -1889,49 +1923,61 @@ out:
 
 static int proc_fd_info(struct inode *inode, struct path *path, char *info)
 {
-	struct task_struct *task = get_proc_task(inode);
-	struct files_struct *files = NULL;
+	struct task_struct *task;
+	struct files_struct *files;
 	struct file *file;
 	int fd = proc_fd(inode);
+	int rc;
 
-	if (task) {
-		files = get_files_struct(task);
-		put_task_struct(task);
-	}
-	if (files) {
-		/*
-		 * We are not taking a ref to the file structure, so we must
-		 * hold ->file_lock.
-		 */
-		spin_lock(&files->file_lock);
-		file = fcheck_files(files, fd);
-		if (file) {
-			unsigned int f_flags;
-			struct fdtable *fdt;
-
-			fdt = files_fdtable(files);
-			f_flags = file->f_flags & ~O_CLOEXEC;
-			if (FD_ISSET(fd, fdt->close_on_exec))
-				f_flags |= O_CLOEXEC;
-
-			if (path) {
-				*path = file->f_path;
-				path_get(&file->f_path);
-			}
-			if (info)
-				snprintf(info, PROC_FDINFO_MAX,
-					 "pos:\t%lli\n"
-					 "flags:\t0%o\n",
-					 (long long) file->f_pos,
-					 f_flags);
-			spin_unlock(&files->file_lock);
-			put_files_struct(files);
-			return 0;
+	task = get_proc_task(inode);
+	if (!task)
+		return -ENOENT;
+
+	rc = -EACCES;
+	if (lock_trace(task))
+		goto out_task;
+
+	rc = -ENOENT;
+	files = get_files_struct(task);
+	if (files == NULL)
+		goto out_unlock;
+
+	/*
+	 * We are not taking a ref to the file structure, so we must
+	 * hold ->file_lock.
+	 */
+	spin_lock(&files->file_lock);
+	file = fcheck_files(files, fd);
+	if (file) {
+		unsigned int f_flags;
+		struct fdtable *fdt;
+
+		fdt = files_fdtable(files);
+		f_flags = file->f_flags & ~O_CLOEXEC;
+		if (FD_ISSET(fd, fdt->close_on_exec))
+			f_flags |= O_CLOEXEC;
+
+		if (path) {
+			*path = file->f_path;
+			path_get(&file->f_path);
 		}
-		spin_unlock(&files->file_lock);
-		put_files_struct(files);
-	}
-	return -ENOENT;
+		if (info)
+			snprintf(info, PROC_FDINFO_MAX,
+				 "pos:\t%lli\n"
+				 "flags:\t0%o\n",
+				 (long long) file->f_pos,
+				 f_flags);
+		rc = 0;
+	} else
+		rc = -ENOENT;
+	spin_unlock(&files->file_lock);
+	put_files_struct(files);
+
+out_unlock:
+	unlock_trace(task);
+out_task:
+	put_task_struct(task);
+	return rc;
 }
 
 static int proc_fd_link(struct inode *inode, struct path *path)
@@ -2026,7 +2072,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir,
 	spin_unlock(&files->file_lock);
 	put_files_struct(files);
 
-	inode->i_op = &proc_pid_link_inode_operations;
+	inode->i_op = &proc_fd_link_inode_operations;
 	inode->i_size = 64;
 	ei->op.proc_get_link = proc_fd_link;
 	d_set_d_op(dentry, &tid_fd_dentry_operations);
@@ -2058,7 +2104,12 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
 	if (fd == ~0U)
 		goto out;
 
+	result = ERR_PTR(-EACCES);
+	if (lock_trace(task))
+		goto out;
+
 	result = instantiate(dir, dentry, task, &fd);
+	unlock_trace(task);
 out:
 	put_task_struct(task);
 out_no_task:
@@ -2078,23 +2129,28 @@ static int proc_readfd_common(struct file * filp, void * dirent,
 	retval = -ENOENT;
 	if (!p)
 		goto out_no_task;
+
+	retval = -EACCES;
+	if (lock_trace(p))
+		goto out;
+
 	retval = 0;
 
 	fd = filp->f_pos;
 	switch (fd) {
 		case 0:
 			if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
-				goto out;
+				goto out_unlock;
 			filp->f_pos++;
 		case 1:
 			ino = parent_ino(dentry);
 			if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
-				goto out;
+				goto out_unlock;
 			filp->f_pos++;
 		default:
 			files = get_files_struct(p);
 			if (!files)
-				goto out;
+				goto out_unlock;
 			rcu_read_lock();
 			for (fd = filp->f_pos-2;
 			     fd < files_fdtable(files)->max_fds;
@@ -2118,6 +2174,9 @@ static int proc_readfd_common(struct file * filp, void * dirent,
 			rcu_read_unlock();
 			put_files_struct(files);
 	}
+
+out_unlock:
+	unlock_trace(p);
 out:
 	put_task_struct(p);
 out_no_task:
@@ -2195,6 +2254,7 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
 	ei->fd = fd;
 	inode->i_mode = S_IFREG | S_IRUSR;
 	inode->i_fop = &proc_fdinfo_file_operations;
+	inode->i_op = &proc_fdinfo_link_inode_operations;
 	d_set_d_op(dentry, &tid_fd_dentry_operations);
 	d_add(dentry, inode);
 	/* Close the race of the process dying before we return the dentry */
-- 
cgit v0.10.2


From 46cbc1d3981ee753518fbf9198a14f71a9f6841e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 2 Nov 2011 13:38:46 -0700
Subject: ida: make ida_simple_get/put() IRQ safe

It's often convenient to be able to release resource from IRQ context.
Make ida_simple_*() use irqsave/restore spin ops so that they are IRQ
safe.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/lib/idr.c b/lib/idr.c
index bbf211a..ed055b2 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -944,6 +944,7 @@ int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end,
 {
 	int ret, id;
 	unsigned int max;
+	unsigned long flags;
 
 	BUG_ON((int)start < 0);
 	BUG_ON((int)end < 0);
@@ -959,7 +960,7 @@ again:
 	if (!ida_pre_get(ida, gfp_mask))
 		return -ENOMEM;
 
-	spin_lock(&simple_ida_lock);
+	spin_lock_irqsave(&simple_ida_lock, flags);
 	ret = ida_get_new_above(ida, start, &id);
 	if (!ret) {
 		if (id > max) {
@@ -969,7 +970,7 @@ again:
 			ret = id;
 		}
 	}
-	spin_unlock(&simple_ida_lock);
+	spin_unlock_irqrestore(&simple_ida_lock, flags);
 
 	if (unlikely(ret == -EAGAIN))
 		goto again;
@@ -985,10 +986,12 @@ EXPORT_SYMBOL(ida_simple_get);
  */
 void ida_simple_remove(struct ida *ida, unsigned int id)
 {
+	unsigned long flags;
+
 	BUG_ON((int)id < 0);
-	spin_lock(&simple_ida_lock);
+	spin_lock_irqsave(&simple_ida_lock, flags);
 	ida_remove(ida, id);
-	spin_unlock(&simple_ida_lock);
+	spin_unlock_irqrestore(&simple_ida_lock, flags);
 }
 EXPORT_SYMBOL(ida_simple_remove);
 
-- 
cgit v0.10.2


From 3c24783bb2deafaa106b7e69a97540071afc590c Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Wed, 2 Nov 2011 13:38:50 -0700
Subject: ipc/sem.c: fix return code race with semop vs. semop
 +semctl(IPC_RMID)

sys_semtimedop() may return -EIDRM although the semaphore operation
completed successfully:

thread 1:	thread 2:
		semtimedop(), sleeps
semop():
* acquires sem_lock()
		semtimedop() woken up due to timeout
		sem_lock() loops
* notices that thread 2 could be completed.
* performs the operations that thread 2 is sleeping on.
* marks the semaphore operation as IN_WAKEUP
* drops sem_lock(), does wakeup, sets return code to 0
		* thread delayed due to interrupt, whatever
* returns to user space
		* thread still delayed
semctl(IPC_RMID)
* acquires sem_lock()
* ipc_rmid(), ipcp->deleted=1
* drops sem_lock()
		* thread finally continues - but seem_lock()
		  now fails due to ipcp->deleted == 1
		* returns -EIDRM instead of 0

The fix is trivial: Always use the return code in queue.status.

In real world, the race probably doesn't matter:
If the semaphore array is destroyed, the app is probably not interested
if the last operation succeeded or was already cancelled.

Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Mike Galbraith <efault@gmx.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/ipc/sem.c b/ipc/sem.c
index c8e00f8..fb13be1 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1460,7 +1460,6 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 	 * Array removed? If yes, leave without sem_unlock().
 	 */
 	if (IS_ERR(sma)) {
-		error = -EIDRM;
 		goto out_free;
 	}
 
-- 
cgit v0.10.2


From 0b0577f6080c0645b079dcc03fdbaf40d928beb8 Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Wed, 2 Nov 2011 13:38:52 -0700
Subject: ipc/sem.c: handle spurious wakeups

semtimedop() does not handle spurious wakeups, it returns -EINTR to user
space.  Most other schedule() users would just loop and not return to user
space.  The patch adds such a loop to semtimedop()

Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Reported-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/ipc/sem.c b/ipc/sem.c
index fb13be1..227948f 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1426,6 +1426,8 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 
 	queue.status = -EINTR;
 	queue.sleeper = current;
+
+sleep_again:
 	current->state = TASK_INTERRUPTIBLE;
 	sem_unlock(sma);
 
@@ -1478,6 +1480,13 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 	 */
 	if (timeout && jiffies_left == 0)
 		error = -EAGAIN;
+
+	/*
+	 * If the wakeup was spurious, just retry
+	 */
+	if (error == -EINTR && !signal_pending(current))
+		goto sleep_again;
+
 	unlink_queue(sma, &queue);
 
 out_unlock_free:
-- 
cgit v0.10.2


From e57940d719e9fc5223d133b631f8cb5232d6064e Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Wed, 2 Nov 2011 13:38:54 -0700
Subject: ipc/sem.c: remove private structures from public header file

include/linux/sem.h contains several structures that are only used within
ipc/sem.c.

The patch moves them into ipc/sem.c - there is no need to expose the
structures to the whole kernel.

No functional changes, only whitespace cleanups and 80-char per line
fixes.

Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/sem.h b/include/linux/sem.h
index 1feb2de..4648426 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -83,13 +83,6 @@ struct  seminfo {
 
 struct task_struct;
 
-/* One semaphore structure for each semaphore in the system. */
-struct sem {
-	int	semval;		/* current value */
-	int	sempid;		/* pid of last operation */
-	struct list_head sem_pending; /* pending single-sop operations */
-};
-
 /* One sem_array data structure for each set of semaphores in the system. */
 struct sem_array {
 	struct kern_ipc_perm	____cacheline_aligned_in_smp
@@ -103,41 +96,6 @@ struct sem_array {
 	int			complex_count;	/* pending complex operations */
 };
 
-/* One queue for each sleeping process in the system. */
-struct sem_queue {
-	struct list_head	simple_list; /* queue of pending operations */
-	struct list_head	list;	 /* queue of pending operations */
-	struct task_struct	*sleeper; /* this process */
-	struct sem_undo		*undo;	 /* undo structure */
-	int    			pid;	 /* process id of requesting process */
-	int    			status;	 /* completion status of operation */
-	struct sembuf		*sops;	 /* array of pending operations */
-	int			nsops;	 /* number of operations */
-	int			alter;   /* does the operation alter the array? */
-};
-
-/* Each task has a list of undo requests. They are executed automatically
- * when the process exits.
- */
-struct sem_undo {
-	struct list_head	list_proc;	/* per-process list: all undos from one process. */
-						/* rcu protected */
-	struct rcu_head		rcu;		/* rcu struct for sem_undo() */
-	struct sem_undo_list	*ulp;		/* sem_undo_list for the process */
-	struct list_head	list_id;	/* per semaphore array list: all undos for one array */
-	int			semid;		/* semaphore set identifier */
-	short *			semadj;		/* array of adjustments, one per semaphore */
-};
-
-/* sem_undo_list controls shared access to the list of sem_undo structures
- * that may be shared among all a CLONE_SYSVSEM task group.
- */ 
-struct sem_undo_list {
-	atomic_t		refcnt;
-	spinlock_t		lock;
-	struct list_head	list_proc;
-};
-
 struct sysv_sem {
 	struct sem_undo_list *undo_list;
 };
diff --git a/ipc/sem.c b/ipc/sem.c
index 227948f..5215a81 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -90,6 +90,52 @@
 #include <asm/uaccess.h>
 #include "util.h"
 
+/* One semaphore structure for each semaphore in the system. */
+struct sem {
+	int	semval;		/* current value */
+	int	sempid;		/* pid of last operation */
+	struct list_head sem_pending; /* pending single-sop operations */
+};
+
+/* One queue for each sleeping process in the system. */
+struct sem_queue {
+	struct list_head	simple_list; /* queue of pending operations */
+	struct list_head	list;	 /* queue of pending operations */
+	struct task_struct	*sleeper; /* this process */
+	struct sem_undo		*undo;	 /* undo structure */
+	int			pid;	 /* process id of requesting process */
+	int			status;	 /* completion status of operation */
+	struct sembuf		*sops;	 /* array of pending operations */
+	int			nsops;	 /* number of operations */
+	int			alter;	 /* does *sops alter the array? */
+};
+
+/* Each task has a list of undo requests. They are executed automatically
+ * when the process exits.
+ */
+struct sem_undo {
+	struct list_head	list_proc;	/* per-process list: *
+						 * all undos from one process
+						 * rcu protected */
+	struct rcu_head		rcu;		/* rcu struct for sem_undo */
+	struct sem_undo_list	*ulp;		/* back ptr to sem_undo_list */
+	struct list_head	list_id;	/* per semaphore array list:
+						 * all undos for one array */
+	int			semid;		/* semaphore set identifier */
+	short			*semadj;	/* array of adjustments */
+						/* one per semaphore */
+};
+
+/* sem_undo_list controls shared access to the list of sem_undo structures
+ * that may be shared among all a CLONE_SYSVSEM task group.
+ */
+struct sem_undo_list {
+	atomic_t		refcnt;
+	spinlock_t		lock;
+	struct list_head	list_proc;
+};
+
+
 #define sem_ids(ns)	((ns)->ids[IPC_SEM_IDS])
 
 #define sem_unlock(sma)		ipc_unlock(&(sma)->sem_perm)
-- 
cgit v0.10.2


From f567a18590742b811287b7512fb0908deac4eef7 Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Wed, 2 Nov 2011 13:38:56 -0700
Subject: include/linux/sem.h: make sysv_sem empty if SYSVIPC is disabled

For the sysvsem undo, each task struct contains a sysv_sem structure with
a pointer to the undo information.

This pointer is only necessary if sysvipc is enabled - thus the pointer
can be made conditional on CONFIG_SYSVIPC.

Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/sem.h b/include/linux/sem.h
index 4648426..10d6b22 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -96,16 +96,21 @@ struct sem_array {
 	int			complex_count;	/* pending complex operations */
 };
 
+#ifdef CONFIG_SYSVIPC
+
 struct sysv_sem {
 	struct sem_undo_list *undo_list;
 };
 
-#ifdef CONFIG_SYSVIPC
-
 extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk);
 extern void exit_sem(struct task_struct *tsk);
 
 #else
+
+struct sysv_sem {
+	/* empty */
+};
+
 static inline int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
 {
 	return 0;
-- 
cgit v0.10.2


From 79975f1327850ef198ada994c2fc44b7d1ea8935 Mon Sep 17 00:00:00 2001
From: Will Drewry <wad@chromium.org>
Date: Wed, 2 Nov 2011 13:38:59 -0700
Subject: init: add root=PARTUUID=UUID/PARTNROFF=%d support

Expand root=PARTUUID=UUID syntax to support selecting a root partition by
integer offset from a known, unique partition.  This approach provides
similar properties to specifying a device and partition number, but using
the UUID as the unique path prior to evaluating the offset.

For example,
  root=PARTUUID=99DE9194-FC15-4223-9192-FC243948F88B/PARTNROFF=1
selects the partition with UUID 99DE.. then select the next
partition.

This change is motivated by a particular usecase in Chromium OS where the
bootloader can easily determine what partition it is on (by UUID) but
doesn't perform general partition table walking.

That said, support for this model provides a direct mechanism for the user
to modify the root partition to boot without specifically needing to
extract each UUID or update the bootloader explicitly when the root
partition UUID is changed (if it is recreated to be larger, for instance).
 Pinning to a /boot-style partition UUID allows the arbitrary root
partition reconfiguration/modifications with slightly less ambiguity than
just [dev][partition] and less stringency than the specific root partition
UUID.

[sfr@canb.auug.org.au: fix init sections warning]
Signed-off-by: Will Drewry <wad@chromium.org>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Randy Dunlap <rdunlap@xenotime.net>
Cc: Namhyung Kim <namhyung@gmail.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/init/do_mounts.c b/init/do_mounts.c
index c0851a8..0f6e1d9 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -28,7 +28,7 @@ int __initdata rd_doload;	/* 1 = load RAM disk, 0 = don't load */
 int root_mountflags = MS_RDONLY | MS_SILENT;
 static char * __initdata root_device_name;
 static char __initdata saved_root_name[64];
-static int __initdata root_wait;
+static int root_wait;
 
 dev_t ROOT_DEV;
 
@@ -85,12 +85,15 @@ no_match:
 
 /**
  * devt_from_partuuid - looks up the dev_t of a partition by its UUID
- * @uuid:	36 byte char array containing a hex ascii UUID
+ * @uuid:	min 36 byte char array containing a hex ascii UUID
  *
  * The function will return the first partition which contains a matching
  * UUID value in its partition_meta_info struct.  This does not search
  * by filesystem UUIDs.
  *
+ * If @uuid is followed by a "/PARTNROFF=%d", then the number will be
+ * extracted and used as an offset from the partition identified by the UUID.
+ *
  * Returns the matching dev_t on success or 0 on failure.
  */
 static dev_t devt_from_partuuid(char *uuid_str)
@@ -98,6 +101,28 @@ static dev_t devt_from_partuuid(char *uuid_str)
 	dev_t res = 0;
 	struct device *dev = NULL;
 	u8 uuid[16];
+	struct gendisk *disk;
+	struct hd_struct *part;
+	int offset = 0;
+
+	if (strlen(uuid_str) < 36)
+		goto done;
+
+	/* Check for optional partition number offset attributes. */
+	if (uuid_str[36]) {
+		char c = 0;
+		/* Explicitly fail on poor PARTUUID syntax. */
+		if (sscanf(&uuid_str[36],
+			   "/PARTNROFF=%d%c", &offset, &c) != 1) {
+			printk(KERN_ERR "VFS: PARTUUID= is invalid.\n"
+			 "Expected PARTUUID=<valid-uuid-id>[/PARTNROFF=%%d]\n");
+			if (root_wait)
+				printk(KERN_ERR
+				     "Disabling rootwait; root= is invalid.\n");
+			root_wait = 0;
+			goto done;
+		}
+	}
 
 	/* Pack the requested UUID in the expected format. */
 	part_pack_uuid(uuid_str, uuid);
@@ -107,8 +132,21 @@ static dev_t devt_from_partuuid(char *uuid_str)
 		goto done;
 
 	res = dev->devt;
-	put_device(dev);
 
+	/* Attempt to find the partition by offset. */
+	if (!offset)
+		goto no_offset;
+
+	res = 0;
+	disk = part_to_disk(dev_to_part(dev));
+	part = disk_get_part(disk, dev_to_part(dev)->partno + offset);
+	if (part) {
+		res = part_devt(part);
+		put_device(part_to_dev(part));
+	}
+
+no_offset:
+	put_device(dev);
 done:
 	return res;
 }
@@ -126,6 +164,8 @@ done:
  *	   used when disk name of partitioned disk ends on a digit.
  *	6) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the
  *	   unique id of a partition if the partition table provides it.
+ *	7) PARTUUID=<UUID>/PARTNROFF=<int> to select a partition in relation to
+ *	   a partition with a known unique id.
  *
  *	If name doesn't have fall into the categories above, we return (0,0).
  *	block_class is used to check if something is a disk name. If the disk
@@ -143,8 +183,6 @@ dev_t name_to_dev_t(char *name)
 #ifdef CONFIG_BLOCK
 	if (strncmp(name, "PARTUUID=", 9) == 0) {
 		name += 9;
-		if (strlen(name) != 36)
-			goto fail;
 		res = devt_from_partuuid(name);
 		if (!res)
 			goto fail;
-- 
cgit v0.10.2


From a571259f4874023306db36e83054d093833b1902 Mon Sep 17 00:00:00 2001
From: Liu Gang <b34182@freescale.com>
Date: Wed, 2 Nov 2011 13:39:05 -0700
Subject: drivers/rapidio/rio-scan.c: use discovered bit to test if enumeration
 is complete

The discovered bit in PGCCSR register indicates if the device has been
discovered by system host.  In Rapidio systems, some agent devices can also
be master devices.  They can issue requests into the system.

Signed-off-by: Liu Gang <Gang.Liu@freescale.com>
Acked-by: Alexandre Bounine <alexandre.bounine@idt.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c
index ebe77dd..0914f49 100644
--- a/drivers/rapidio/rio-scan.c
+++ b/drivers/rapidio/rio-scan.c
@@ -923,7 +923,7 @@ static int __devinit rio_enum_peer(struct rio_net *net, struct rio_mport *port,
  * rio_enum_complete- Tests if enumeration of a network is complete
  * @port: Master port to send transaction
  *
- * Tests the Component Tag CSR for non-zero value (enumeration
+ * Tests the PGCCSR discovered bit for non-zero value (enumeration
  * complete flag). Return %1 if enumeration is complete or %0 if
  * enumeration is incomplete.
  */
@@ -933,7 +933,7 @@ static int rio_enum_complete(struct rio_mport *port)
 
 	rio_local_read_config_32(port, port->phys_efptr + RIO_PORT_GEN_CTL_CSR,
 				 &regval);
-	return (regval & RIO_PORT_GEN_MASTER) ? 1 : 0;
+	return (regval & RIO_PORT_GEN_DISCOVERED) ? 1 : 0;
 }
 
 /**
-- 
cgit v0.10.2


From e80dd9a7bca4057d5a09d1ba94a7ba0791e7426a Mon Sep 17 00:00:00 2001
From: Liu Gang <Gang.Liu@freescale.com>
Date: Wed, 2 Nov 2011 13:39:07 -0700
Subject: arch/powerpc/sysdev/fsl_rio.c: release rapidio port I/O region
 resource if port failed to initialize

The "struct rio_mport" contains a member of master port I/O memory
resource structure "struct resource iores".  This resource will be read
from device tree and be used for rapidio R/W transaction memory space.
Rapidio requests the port I/O memory resource under the root resource
"iomem_resource".

			struct rio_mport *port;
			port = kzalloc(sizeof(struct rio_mport), GFP_KERNEL);

			request_resource(&iomem_resource, &port->iores);

When port failed to initialize, allocated "rio_mport" structure memory
will be freed, and the port I/O memory resource structure pointer
"&port->iores" will be invalid.  If other requests resource under
"iomem_resource", "&port->iores" node may be operated in the child
resources list and this will cause the system to crash.

So the requested port I/O memory resource should be released before
freeing allocated "rio_mport" structure.

Signed-off-by: Liu Gang <Gang.Liu@freescale.com>
Acked-by: Alexandre Bounine <alexandre.bounine@idt.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Grant Likely <grant.likely@secretlab.ca>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/arch/powerpc/sysdev/fsl_rio.c b/arch/powerpc/sysdev/fsl_rio.c
index c65f75a..22ffccd 100644
--- a/arch/powerpc/sysdev/fsl_rio.c
+++ b/arch/powerpc/sysdev/fsl_rio.c
@@ -1608,6 +1608,7 @@ int fsl_rio_setup(struct platform_device *dev)
 	return 0;
 err:
 	iounmap(priv->regs_win);
+	release_resource(&port->iores);
 err_res:
 	kfree(priv);
 err_priv:
-- 
cgit v0.10.2


From 48618fb4e522d9d02e217ac05f52749545c1af20 Mon Sep 17 00:00:00 2001
From: Alexandre Bounine <alexandre.bounine@idt.com>
Date: Wed, 2 Nov 2011 13:39:09 -0700
Subject: RapidIO: add mport driver for Tsi721 bridge

Add RapidIO mport driver for IDT TSI721 PCI Express-to-SRIO bridge device.
 The driver provides full set of callback functions defined for mport
devices in RapidIO subsystem.  It also is compatible with current version
of RIONET driver (Ethernet over RapidIO messaging services).

This patch is applicable to kernel versions starting from 2.6.39.

Signed-off-by: Alexandre Bounine <alexandre.bounine@idt.com>
Signed-off-by: Chul Kim <chul.kim@idt.com>
Cc: Kumar Gala <galak@kernel.crashing.org>
Cc: Matt Porter <mporter@kernel.crashing.org>
Cc: Li Yang <leoli@freescale.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/Documentation/rapidio/tsi721.txt b/Documentation/rapidio/tsi721.txt
new file mode 100644
index 0000000..335f3c6
--- /dev/null
+++ b/Documentation/rapidio/tsi721.txt
@@ -0,0 +1,49 @@
+RapidIO subsystem mport driver for IDT Tsi721 PCI Express-to-SRIO bridge.
+=========================================================================
+
+I. Overview
+
+This driver implements all currently defined RapidIO mport callback functions.
+It supports maintenance read and write operations, inbound and outbound RapidIO
+doorbells, inbound maintenance port-writes and RapidIO messaging.
+
+To generate SRIO maintenance transactions this driver uses one of Tsi721 DMA
+channels. This mechanism provides access to larger range of hop counts and
+destination IDs without need for changes in outbound window translation.
+
+RapidIO messaging support uses dedicated messaging channels for each mailbox.
+For inbound messages this driver uses destination ID matching to forward messages
+into the corresponding message queue. Messaging callbacks are implemented to be
+fully compatible with RIONET driver (Ethernet over RapidIO messaging services).
+
+II. Known problems
+
+  None.
+
+III. To do
+
+ Add DMA data transfers (non-messaging).
+ Add inbound region (SRIO-to-PCIe) mapping.
+
+IV. Version History
+
+  1.0.0 - Initial driver release.
+
+V.  License
+-----------------------------------------------
+
+  Copyright(c) 2011 Integrated Device Technology, Inc. All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the Free
+  Software Foundation; either version 2 of the License, or (at your option)
+  any later version.
+
+  This program is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+  more details.
+
+  You should have received a copy of the GNU General Public License along with
+  this program; if not, write to the Free Software Foundation, Inc.,
+  59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
diff --git a/drivers/rapidio/Kconfig b/drivers/rapidio/Kconfig
index 070211a..bc87192 100644
--- a/drivers/rapidio/Kconfig
+++ b/drivers/rapidio/Kconfig
@@ -1,6 +1,8 @@
 #
 # RapidIO configuration
 #
+source "drivers/rapidio/devices/Kconfig"
+
 config RAPIDIO_DISC_TIMEOUT
 	int "Discovery timeout duration (seconds)"
 	depends on RAPIDIO
@@ -20,8 +22,6 @@ config RAPIDIO_ENABLE_RX_TX_PORTS
 	  ports for Input/Output direction to allow other traffic
 	  than Maintenance transfers.
 
-source "drivers/rapidio/switches/Kconfig"
-
 config RAPIDIO_DEBUG
 	bool "RapidIO subsystem debug messages"
 	depends on RAPIDIO
@@ -32,3 +32,5 @@ config RAPIDIO_DEBUG
 	  going on.
 
 	  If you are unsure about this, say N here.
+
+source "drivers/rapidio/switches/Kconfig"
diff --git a/drivers/rapidio/Makefile b/drivers/rapidio/Makefile
index 89b8eca..ec3fb81 100644
--- a/drivers/rapidio/Makefile
+++ b/drivers/rapidio/Makefile
@@ -4,5 +4,6 @@
 obj-y += rio.o rio-access.o rio-driver.o rio-scan.o rio-sysfs.o
 
 obj-$(CONFIG_RAPIDIO)		+= switches/
+obj-$(CONFIG_RAPIDIO)		+= devices/
 
 subdir-ccflags-$(CONFIG_RAPIDIO_DEBUG) := -DDEBUG
diff --git a/drivers/rapidio/devices/Kconfig b/drivers/rapidio/devices/Kconfig
new file mode 100644
index 0000000..12a9d7f
--- /dev/null
+++ b/drivers/rapidio/devices/Kconfig
@@ -0,0 +1,10 @@
+#
+# RapidIO master port configuration
+#
+
+config RAPIDIO_TSI721
+	bool "IDT Tsi721 PCI Express SRIO Controller support"
+	depends on RAPIDIO && PCIEPORTBUS
+	default "n"
+	---help---
+	  Include support for IDT Tsi721 PCI Express Serial RapidIO controller.
diff --git a/drivers/rapidio/devices/Makefile b/drivers/rapidio/devices/Makefile
new file mode 100644
index 0000000..3b7b4e2
--- /dev/null
+++ b/drivers/rapidio/devices/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for RapidIO devices
+#
+
+obj-$(CONFIG_RAPIDIO_TSI721)	+= tsi721.o
diff --git a/drivers/rapidio/devices/tsi721.c b/drivers/rapidio/devices/tsi721.c
new file mode 100644
index 0000000..5225930
--- /dev/null
+++ b/drivers/rapidio/devices/tsi721.c
@@ -0,0 +1,2360 @@
+/*
+ * RapidIO mport driver for Tsi721 PCIExpress-to-SRIO bridge
+ *
+ * Copyright 2011 Integrated Device Technology, Inc.
+ * Alexandre Bounine <alexandre.bounine@idt.com>
+ * Chul Kim <chul.kim@idt.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ */
+
+#include <linux/io.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/rio.h>
+#include <linux/rio_drv.h>
+#include <linux/dma-mapping.h>
+#include <linux/interrupt.h>
+#include <linux/kfifo.h>
+#include <linux/delay.h>
+
+#include "tsi721.h"
+
+#define DEBUG_PW	/* Inbound Port-Write debugging */
+
+static void tsi721_omsg_handler(struct tsi721_device *priv, int ch);
+static void tsi721_imsg_handler(struct tsi721_device *priv, int ch);
+
+/**
+ * tsi721_lcread - read from local SREP config space
+ * @mport: RapidIO master port info
+ * @index: ID of RapdiIO interface
+ * @offset: Offset into configuration space
+ * @len: Length (in bytes) of the maintenance transaction
+ * @data: Value to be read into
+ *
+ * Generates a local SREP space read. Returns %0 on
+ * success or %-EINVAL on failure.
+ */
+static int tsi721_lcread(struct rio_mport *mport, int index, u32 offset,
+			 int len, u32 *data)
+{
+	struct tsi721_device *priv = mport->priv;
+
+	if (len != sizeof(u32))
+		return -EINVAL; /* only 32-bit access is supported */
+
+	*data = ioread32(priv->regs + offset);
+
+	return 0;
+}
+
+/**
+ * tsi721_lcwrite - write into local SREP config space
+ * @mport: RapidIO master port info
+ * @index: ID of RapdiIO interface
+ * @offset: Offset into configuration space
+ * @len: Length (in bytes) of the maintenance transaction
+ * @data: Value to be written
+ *
+ * Generates a local write into SREP configuration space. Returns %0 on
+ * success or %-EINVAL on failure.
+ */
+static int tsi721_lcwrite(struct rio_mport *mport, int index, u32 offset,
+			  int len, u32 data)
+{
+	struct tsi721_device *priv = mport->priv;
+
+	if (len != sizeof(u32))
+		return -EINVAL; /* only 32-bit access is supported */
+
+	iowrite32(data, priv->regs + offset);
+
+	return 0;
+}
+
+/**
+ * tsi721_maint_dma - Helper function to generate RapidIO maintenance
+ *                    transactions using designated Tsi721 DMA channel.
+ * @priv: pointer to tsi721 private data
+ * @sys_size: RapdiIO transport system size
+ * @destid: Destination ID of transaction
+ * @hopcount: Number of hops to target device
+ * @offset: Offset into configuration space
+ * @len: Length (in bytes) of the maintenance transaction
+ * @data: Location to be read from or write into
+ * @do_wr: Operation flag (1 == MAINT_WR)
+ *
+ * Generates a RapidIO maintenance transaction (Read or Write).
+ * Returns %0 on success and %-EINVAL or %-EFAULT on failure.
+ */
+static int tsi721_maint_dma(struct tsi721_device *priv, u32 sys_size,
+			u16 destid, u8 hopcount, u32 offset, int len,
+			u32 *data, int do_wr)
+{
+	struct tsi721_dma_desc *bd_ptr;
+	u32 rd_count, swr_ptr, ch_stat;
+	int i, err = 0;
+	u32 op = do_wr ? MAINT_WR : MAINT_RD;
+
+	if (offset > (RIO_MAINT_SPACE_SZ - len) || (len != sizeof(u32)))
+		return -EINVAL;
+
+	bd_ptr = priv->bdma[TSI721_DMACH_MAINT].bd_base;
+
+	rd_count = ioread32(
+			priv->regs + TSI721_DMAC_DRDCNT(TSI721_DMACH_MAINT));
+
+	/* Initialize DMA descriptor */
+	bd_ptr[0].type_id = cpu_to_le32((DTYPE2 << 29) | (op << 19) | destid);
+	bd_ptr[0].bcount = cpu_to_le32((sys_size << 26) | 0x04);
+	bd_ptr[0].raddr_lo = cpu_to_le32((hopcount << 24) | offset);
+	bd_ptr[0].raddr_hi = 0;
+	if (do_wr)
+		bd_ptr[0].data[0] = cpu_to_be32p(data);
+	else
+		bd_ptr[0].data[0] = 0xffffffff;
+
+	mb();
+
+	/* Start DMA operation */
+	iowrite32(rd_count + 2,
+		priv->regs + TSI721_DMAC_DWRCNT(TSI721_DMACH_MAINT));
+	ioread32(priv->regs + TSI721_DMAC_DWRCNT(TSI721_DMACH_MAINT));
+	i = 0;
+
+	/* Wait until DMA transfer is finished */
+	while ((ch_stat = ioread32(priv->regs +
+		TSI721_DMAC_STS(TSI721_DMACH_MAINT))) & TSI721_DMAC_STS_RUN) {
+		udelay(1);
+		if (++i >= 5000000) {
+			dev_dbg(&priv->pdev->dev,
+				"%s : DMA[%d] read timeout ch_status=%x\n",
+				__func__, TSI721_DMACH_MAINT, ch_stat);
+			if (!do_wr)
+				*data = 0xffffffff;
+			err = -EIO;
+			goto err_out;
+		}
+	}
+
+	if (ch_stat & TSI721_DMAC_STS_ABORT) {
+		/* If DMA operation aborted due to error,
+		 * reinitialize DMA channel
+		 */
+		dev_dbg(&priv->pdev->dev, "%s : DMA ABORT ch_stat=%x\n",
+			__func__, ch_stat);
+		dev_dbg(&priv->pdev->dev, "OP=%d : destid=%x hc=%x off=%x\n",
+			do_wr ? MAINT_WR : MAINT_RD, destid, hopcount, offset);
+		iowrite32(TSI721_DMAC_INT_ALL,
+			priv->regs + TSI721_DMAC_INT(TSI721_DMACH_MAINT));
+		iowrite32(TSI721_DMAC_CTL_INIT,
+			priv->regs + TSI721_DMAC_CTL(TSI721_DMACH_MAINT));
+		udelay(10);
+		iowrite32(0, priv->regs +
+				TSI721_DMAC_DWRCNT(TSI721_DMACH_MAINT));
+		udelay(1);
+		if (!do_wr)
+			*data = 0xffffffff;
+		err = -EIO;
+		goto err_out;
+	}
+
+	if (!do_wr)
+		*data = be32_to_cpu(bd_ptr[0].data[0]);
+
+	/*
+	 * Update descriptor status FIFO RD pointer.
+	 * NOTE: Skipping check and clear FIFO entries because we are waiting
+	 * for transfer to be completed.
+	 */
+	swr_ptr = ioread32(priv->regs + TSI721_DMAC_DSWP(TSI721_DMACH_MAINT));
+	iowrite32(swr_ptr, priv->regs + TSI721_DMAC_DSRP(TSI721_DMACH_MAINT));
+err_out:
+
+	return err;
+}
+
+/**
+ * tsi721_cread_dma - Generate a RapidIO maintenance read transaction
+ *                    using Tsi721 BDMA engine.
+ * @mport: RapidIO master port control structure
+ * @index: ID of RapdiIO interface
+ * @destid: Destination ID of transaction
+ * @hopcount: Number of hops to target device
+ * @offset: Offset into configuration space
+ * @len: Length (in bytes) of the maintenance transaction
+ * @val: Location to be read into
+ *
+ * Generates a RapidIO maintenance read transaction.
+ * Returns %0 on success and %-EINVAL or %-EFAULT on failure.
+ */
+static int tsi721_cread_dma(struct rio_mport *mport, int index, u16 destid,
+			u8 hopcount, u32 offset, int len, u32 *data)
+{
+	struct tsi721_device *priv = mport->priv;
+
+	return tsi721_maint_dma(priv, mport->sys_size, destid, hopcount,
+				offset, len, data, 0);
+}
+
+/**
+ * tsi721_cwrite_dma - Generate a RapidIO maintenance write transaction
+ *                     using Tsi721 BDMA engine
+ * @mport: RapidIO master port control structure
+ * @index: ID of RapdiIO interface
+ * @destid: Destination ID of transaction
+ * @hopcount: Number of hops to target device
+ * @offset: Offset into configuration space
+ * @len: Length (in bytes) of the maintenance transaction
+ * @val: Value to be written
+ *
+ * Generates a RapidIO maintenance write transaction.
+ * Returns %0 on success and %-EINVAL or %-EFAULT on failure.
+ */
+static int tsi721_cwrite_dma(struct rio_mport *mport, int index, u16 destid,
+			 u8 hopcount, u32 offset, int len, u32 data)
+{
+	struct tsi721_device *priv = mport->priv;
+	u32 temp = data;
+
+	return tsi721_maint_dma(priv, mport->sys_size, destid, hopcount,
+				offset, len, &temp, 1);
+}
+
+/**
+ * tsi721_pw_handler - Tsi721 inbound port-write interrupt handler
+ * @mport: RapidIO master port structure
+ *
+ * Handles inbound port-write interrupts. Copies PW message from an internal
+ * buffer into PW message FIFO and schedules deferred routine to process
+ * queued messages.
+ */
+static int
+tsi721_pw_handler(struct rio_mport *mport)
+{
+	struct tsi721_device *priv = mport->priv;
+	u32 pw_stat;
+	u32 pw_buf[TSI721_RIO_PW_MSG_SIZE/sizeof(u32)];
+
+
+	pw_stat = ioread32(priv->regs + TSI721_RIO_PW_RX_STAT);
+
+	if (pw_stat & TSI721_RIO_PW_RX_STAT_PW_VAL) {
+		pw_buf[0] = ioread32(priv->regs + TSI721_RIO_PW_RX_CAPT(0));
+		pw_buf[1] = ioread32(priv->regs + TSI721_RIO_PW_RX_CAPT(1));
+		pw_buf[2] = ioread32(priv->regs + TSI721_RIO_PW_RX_CAPT(2));
+		pw_buf[3] = ioread32(priv->regs + TSI721_RIO_PW_RX_CAPT(3));
+
+		/* Queue PW message (if there is room in FIFO),
+		 * otherwise discard it.
+		 */
+		spin_lock(&priv->pw_fifo_lock);
+		if (kfifo_avail(&priv->pw_fifo) >= TSI721_RIO_PW_MSG_SIZE)
+			kfifo_in(&priv->pw_fifo, pw_buf,
+						TSI721_RIO_PW_MSG_SIZE);
+		else
+			priv->pw_discard_count++;
+		spin_unlock(&priv->pw_fifo_lock);
+	}
+
+	/* Clear pending PW interrupts */
+	iowrite32(TSI721_RIO_PW_RX_STAT_PW_DISC | TSI721_RIO_PW_RX_STAT_PW_VAL,
+		  priv->regs + TSI721_RIO_PW_RX_STAT);
+
+	schedule_work(&priv->pw_work);
+
+	return 0;
+}
+
+static void tsi721_pw_dpc(struct work_struct *work)
+{
+	struct tsi721_device *priv = container_of(work, struct tsi721_device,
+						    pw_work);
+	u32 msg_buffer[RIO_PW_MSG_SIZE/sizeof(u32)]; /* Use full size PW message
+							buffer for RIO layer */
+
+	/*
+	 * Process port-write messages
+	 */
+	while (kfifo_out_spinlocked(&priv->pw_fifo, (unsigned char *)msg_buffer,
+			 TSI721_RIO_PW_MSG_SIZE, &priv->pw_fifo_lock)) {
+		/* Process one message */
+#ifdef DEBUG_PW
+		{
+		u32 i;
+		pr_debug("%s : Port-Write Message:", __func__);
+		for (i = 0; i < RIO_PW_MSG_SIZE/sizeof(u32); ) {
+			pr_debug("0x%02x: %08x %08x %08x %08x", i*4,
+				msg_buffer[i], msg_buffer[i + 1],
+				msg_buffer[i + 2], msg_buffer[i + 3]);
+			i += 4;
+		}
+		pr_debug("\n");
+		}
+#endif
+		/* Pass the port-write message to RIO core for processing */
+		rio_inb_pwrite_handler((union rio_pw_msg *)msg_buffer);
+	}
+}
+
+/**
+ * tsi721_pw_enable - enable/disable port-write interface init
+ * @mport: Master port implementing the port write unit
+ * @enable:    1=enable; 0=disable port-write message handling
+ */
+static int tsi721_pw_enable(struct rio_mport *mport, int enable)
+{
+	struct tsi721_device *priv = mport->priv;
+	u32 rval;
+
+	rval = ioread32(priv->regs + TSI721_RIO_EM_INT_ENABLE);
+
+	if (enable)
+		rval |= TSI721_RIO_EM_INT_ENABLE_PW_RX;
+	else
+		rval &= ~TSI721_RIO_EM_INT_ENABLE_PW_RX;
+
+	/* Clear pending PW interrupts */
+	iowrite32(TSI721_RIO_PW_RX_STAT_PW_DISC | TSI721_RIO_PW_RX_STAT_PW_VAL,
+		  priv->regs + TSI721_RIO_PW_RX_STAT);
+	/* Update enable bits */
+	iowrite32(rval, priv->regs + TSI721_RIO_EM_INT_ENABLE);
+
+	return 0;
+}
+
+/**
+ * tsi721_dsend - Send a RapidIO doorbell
+ * @mport: RapidIO master port info
+ * @index: ID of RapidIO interface
+ * @destid: Destination ID of target device
+ * @data: 16-bit info field of RapidIO doorbell
+ *
+ * Sends a RapidIO doorbell message. Always returns %0.
+ */
+static int tsi721_dsend(struct rio_mport *mport, int index,
+			u16 destid, u16 data)
+{
+	struct tsi721_device *priv = mport->priv;
+	u32 offset;
+
+	offset = (((mport->sys_size) ? RIO_TT_CODE_16 : RIO_TT_CODE_8) << 18) |
+		 (destid << 2);
+
+	dev_dbg(&priv->pdev->dev,
+		"Send Doorbell 0x%04x to destID 0x%x\n", data, destid);
+	iowrite16be(data, priv->odb_base + offset);
+
+	return 0;
+}
+
+/**
+ * tsi721_dbell_handler - Tsi721 doorbell interrupt handler
+ * @mport: RapidIO master port structure
+ *
+ * Handles inbound doorbell interrupts. Copies doorbell entry from an internal
+ * buffer into DB message FIFO and schedules deferred  routine to process
+ * queued DBs.
+ */
+static int
+tsi721_dbell_handler(struct rio_mport *mport)
+{
+	struct tsi721_device *priv = mport->priv;
+	u32 regval;
+
+	/* Disable IDB interrupts */
+	regval = ioread32(priv->regs + TSI721_SR_CHINTE(IDB_QUEUE));
+	regval &= ~TSI721_SR_CHINT_IDBQRCV;
+	iowrite32(regval,
+		priv->regs + TSI721_SR_CHINTE(IDB_QUEUE));
+
+	schedule_work(&priv->idb_work);
+
+	return 0;
+}
+
+static void tsi721_db_dpc(struct work_struct *work)
+{
+	struct tsi721_device *priv = container_of(work, struct tsi721_device,
+						    idb_work);
+	struct rio_mport *mport;
+	struct rio_dbell *dbell;
+	int found = 0;
+	u32 wr_ptr, rd_ptr;
+	u64 *idb_entry;
+	u32 regval;
+	union {
+		u64 msg;
+		u8  bytes[8];
+	} idb;
+
+	/*
+	 * Process queued inbound doorbells
+	 */
+	mport = priv->mport;
+
+	wr_ptr = ioread32(priv->regs + TSI721_IDQ_WP(IDB_QUEUE));
+	rd_ptr = ioread32(priv->regs + TSI721_IDQ_RP(IDB_QUEUE));
+
+	while (wr_ptr != rd_ptr) {
+		idb_entry = (u64 *)(priv->idb_base +
+					(TSI721_IDB_ENTRY_SIZE * rd_ptr));
+		rd_ptr++;
+		idb.msg = *idb_entry;
+		*idb_entry = 0;
+
+		/* Process one doorbell */
+		list_for_each_entry(dbell, &mport->dbells, node) {
+			if ((dbell->res->start <= DBELL_INF(idb.bytes)) &&
+			    (dbell->res->end >= DBELL_INF(idb.bytes))) {
+				found = 1;
+				break;
+			}
+		}
+
+		if (found) {
+			dbell->dinb(mport, dbell->dev_id, DBELL_SID(idb.bytes),
+				    DBELL_TID(idb.bytes), DBELL_INF(idb.bytes));
+		} else {
+			dev_dbg(&priv->pdev->dev,
+				"spurious inb doorbell, sid %2.2x tid %2.2x"
+				" info %4.4x\n", DBELL_SID(idb.bytes),
+				DBELL_TID(idb.bytes), DBELL_INF(idb.bytes));
+		}
+	}
+
+	iowrite32(rd_ptr & (IDB_QSIZE - 1),
+		priv->regs + TSI721_IDQ_RP(IDB_QUEUE));
+
+	/* Re-enable IDB interrupts */
+	regval = ioread32(priv->regs + TSI721_SR_CHINTE(IDB_QUEUE));
+	regval |= TSI721_SR_CHINT_IDBQRCV;
+	iowrite32(regval,
+		priv->regs + TSI721_SR_CHINTE(IDB_QUEUE));
+}
+
+/**
+ * tsi721_irqhandler - Tsi721 interrupt handler
+ * @irq: Linux interrupt number
+ * @ptr: Pointer to interrupt-specific data (mport structure)
+ *
+ * Handles Tsi721 interrupts signaled using MSI and INTA. Checks reported
+ * interrupt events and calls an event-specific handler(s).
+ */
+static irqreturn_t tsi721_irqhandler(int irq, void *ptr)
+{
+	struct rio_mport *mport = (struct rio_mport *)ptr;
+	struct tsi721_device *priv = mport->priv;
+	u32 dev_int;
+	u32 dev_ch_int;
+	u32 intval;
+	u32 ch_inte;
+
+	dev_int = ioread32(priv->regs + TSI721_DEV_INT);
+	if (!dev_int)
+		return IRQ_NONE;
+
+	dev_ch_int = ioread32(priv->regs + TSI721_DEV_CHAN_INT);
+
+	if (dev_int & TSI721_DEV_INT_SR2PC_CH) {
+		/* Service SR2PC Channel interrupts */
+		if (dev_ch_int & TSI721_INT_SR2PC_CHAN(IDB_QUEUE)) {
+			/* Service Inbound Doorbell interrupt */
+			intval = ioread32(priv->regs +
+						TSI721_SR_CHINT(IDB_QUEUE));
+			if (intval & TSI721_SR_CHINT_IDBQRCV)
+				tsi721_dbell_handler(mport);
+			else
+				dev_info(&priv->pdev->dev,
+					"Unsupported SR_CH_INT %x\n", intval);
+
+			/* Clear interrupts */
+			iowrite32(intval,
+				priv->regs + TSI721_SR_CHINT(IDB_QUEUE));
+			ioread32(priv->regs + TSI721_SR_CHINT(IDB_QUEUE));
+		}
+	}
+
+	if (dev_int & TSI721_DEV_INT_SMSG_CH) {
+		int ch;
+
+		/*
+		 * Service channel interrupts from Messaging Engine
+		 */
+
+		if (dev_ch_int & TSI721_INT_IMSG_CHAN_M) { /* Inbound Msg */
+			/* Disable signaled OB MSG Channel interrupts */
+			ch_inte = ioread32(priv->regs + TSI721_DEV_CHAN_INTE);
+			ch_inte &= ~(dev_ch_int & TSI721_INT_IMSG_CHAN_M);
+			iowrite32(ch_inte, priv->regs + TSI721_DEV_CHAN_INTE);
+
+			/*
+			 * Process Inbound Message interrupt for each MBOX
+			 */
+			for (ch = 4; ch < RIO_MAX_MBOX + 4; ch++) {
+				if (!(dev_ch_int & TSI721_INT_IMSG_CHAN(ch)))
+					continue;
+				tsi721_imsg_handler(priv, ch);
+			}
+		}
+
+		if (dev_ch_int & TSI721_INT_OMSG_CHAN_M) { /* Outbound Msg */
+			/* Disable signaled OB MSG Channel interrupts */
+			ch_inte = ioread32(priv->regs + TSI721_DEV_CHAN_INTE);
+			ch_inte &= ~(dev_ch_int & TSI721_INT_OMSG_CHAN_M);
+			iowrite32(ch_inte, priv->regs + TSI721_DEV_CHAN_INTE);
+
+			/*
+			 * Process Outbound Message interrupts for each MBOX
+			 */
+
+			for (ch = 0; ch < RIO_MAX_MBOX; ch++) {
+				if (!(dev_ch_int & TSI721_INT_OMSG_CHAN(ch)))
+					continue;
+				tsi721_omsg_handler(priv, ch);
+			}
+		}
+	}
+
+	if (dev_int & TSI721_DEV_INT_SRIO) {
+		/* Service SRIO MAC interrupts */
+		intval = ioread32(priv->regs + TSI721_RIO_EM_INT_STAT);
+		if (intval & TSI721_RIO_EM_INT_STAT_PW_RX)
+			tsi721_pw_handler(mport);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static void tsi721_interrupts_init(struct tsi721_device *priv)
+{
+	u32 intr;
+
+	/* Enable IDB interrupts */
+	iowrite32(TSI721_SR_CHINT_ALL,
+		priv->regs + TSI721_SR_CHINT(IDB_QUEUE));
+	iowrite32(TSI721_SR_CHINT_IDBQRCV,
+		priv->regs + TSI721_SR_CHINTE(IDB_QUEUE));
+	iowrite32(TSI721_INT_SR2PC_CHAN(IDB_QUEUE),
+		priv->regs + TSI721_DEV_CHAN_INTE);
+
+	/* Enable SRIO MAC interrupts */
+	iowrite32(TSI721_RIO_EM_DEV_INT_EN_INT,
+		priv->regs + TSI721_RIO_EM_DEV_INT_EN);
+
+	if (priv->flags & TSI721_USING_MSIX)
+		intr = TSI721_DEV_INT_SRIO;
+	else
+		intr = TSI721_DEV_INT_SR2PC_CH | TSI721_DEV_INT_SRIO |
+			TSI721_DEV_INT_SMSG_CH;
+
+	iowrite32(intr, priv->regs + TSI721_DEV_INTE);
+	ioread32(priv->regs + TSI721_DEV_INTE);
+}
+
+#ifdef CONFIG_PCI_MSI
+/**
+ * tsi721_omsg_msix - MSI-X interrupt handler for outbound messaging
+ * @irq: Linux interrupt number
+ * @ptr: Pointer to interrupt-specific data (mport structure)
+ *
+ * Handles outbound messaging interrupts signaled using MSI-X.
+ */
+static irqreturn_t tsi721_omsg_msix(int irq, void *ptr)
+{
+	struct tsi721_device *priv = ((struct rio_mport *)ptr)->priv;
+	int mbox;
+
+	mbox = (irq - priv->msix[TSI721_VECT_OMB0_DONE].vector) % RIO_MAX_MBOX;
+	tsi721_omsg_handler(priv, mbox);
+	return IRQ_HANDLED;
+}
+
+/**
+ * tsi721_imsg_msix - MSI-X interrupt handler for inbound messaging
+ * @irq: Linux interrupt number
+ * @ptr: Pointer to interrupt-specific data (mport structure)
+ *
+ * Handles inbound messaging interrupts signaled using MSI-X.
+ */
+static irqreturn_t tsi721_imsg_msix(int irq, void *ptr)
+{
+	struct tsi721_device *priv = ((struct rio_mport *)ptr)->priv;
+	int mbox;
+
+	mbox = (irq - priv->msix[TSI721_VECT_IMB0_RCV].vector) % RIO_MAX_MBOX;
+	tsi721_imsg_handler(priv, mbox + 4);
+	return IRQ_HANDLED;
+}
+
+/**
+ * tsi721_srio_msix - Tsi721 MSI-X SRIO MAC interrupt handler
+ * @irq: Linux interrupt number
+ * @ptr: Pointer to interrupt-specific data (mport structure)
+ *
+ * Handles Tsi721 interrupts from SRIO MAC.
+ */
+static irqreturn_t tsi721_srio_msix(int irq, void *ptr)
+{
+	struct tsi721_device *priv = ((struct rio_mport *)ptr)->priv;
+	u32 srio_int;
+
+	/* Service SRIO MAC interrupts */
+	srio_int = ioread32(priv->regs + TSI721_RIO_EM_INT_STAT);
+	if (srio_int & TSI721_RIO_EM_INT_STAT_PW_RX)
+		tsi721_pw_handler((struct rio_mport *)ptr);
+
+	return IRQ_HANDLED;
+}
+
+/**
+ * tsi721_sr2pc_ch_msix - Tsi721 MSI-X SR2PC Channel interrupt handler
+ * @irq: Linux interrupt number
+ * @ptr: Pointer to interrupt-specific data (mport structure)
+ *
+ * Handles Tsi721 interrupts from SR2PC Channel.
+ * NOTE: At this moment services only one SR2PC channel associated with inbound
+ * doorbells.
+ */
+static irqreturn_t tsi721_sr2pc_ch_msix(int irq, void *ptr)
+{
+	struct tsi721_device *priv = ((struct rio_mport *)ptr)->priv;
+	u32 sr_ch_int;
+
+	/* Service Inbound DB interrupt from SR2PC channel */
+	sr_ch_int = ioread32(priv->regs + TSI721_SR_CHINT(IDB_QUEUE));
+	if (sr_ch_int & TSI721_SR_CHINT_IDBQRCV)
+		tsi721_dbell_handler((struct rio_mport *)ptr);
+
+	/* Clear interrupts */
+	iowrite32(sr_ch_int, priv->regs + TSI721_SR_CHINT(IDB_QUEUE));
+	/* Read back to ensure that interrupt was cleared */
+	sr_ch_int = ioread32(priv->regs + TSI721_SR_CHINT(IDB_QUEUE));
+
+	return IRQ_HANDLED;
+}
+
+/**
+ * tsi721_request_msix - register interrupt service for MSI-X mode.
+ * @mport: RapidIO master port structure
+ *
+ * Registers MSI-X interrupt service routines for interrupts that are active
+ * immediately after mport initialization. Messaging interrupt service routines
+ * should be registered during corresponding open requests.
+ */
+static int tsi721_request_msix(struct rio_mport *mport)
+{
+	struct tsi721_device *priv = mport->priv;
+	int err = 0;
+
+	err = request_irq(priv->msix[TSI721_VECT_IDB].vector,
+			tsi721_sr2pc_ch_msix, 0,
+			priv->msix[TSI721_VECT_IDB].irq_name, (void *)mport);
+	if (err)
+		goto out;
+
+	err = request_irq(priv->msix[TSI721_VECT_PWRX].vector,
+			tsi721_srio_msix, 0,
+			priv->msix[TSI721_VECT_PWRX].irq_name, (void *)mport);
+	if (err)
+		free_irq(
+			priv->msix[TSI721_VECT_IDB].vector,
+			(void *)mport);
+out:
+	return err;
+}
+
+/**
+ * tsi721_enable_msix - Attempts to enable MSI-X support for Tsi721.
+ * @priv: pointer to tsi721 private data
+ *
+ * Configures MSI-X support for Tsi721. Supports only an exact number
+ * of requested vectors.
+ */
+static int tsi721_enable_msix(struct tsi721_device *priv)
+{
+	struct msix_entry entries[TSI721_VECT_MAX];
+	int err;
+	int i;
+
+	entries[TSI721_VECT_IDB].entry = TSI721_MSIX_SR2PC_IDBQ_RCV(IDB_QUEUE);
+	entries[TSI721_VECT_PWRX].entry = TSI721_MSIX_SRIO_MAC_INT;
+
+	/*
+	 * Initialize MSI-X entries for Messaging Engine:
+	 * this driver supports four RIO mailboxes (inbound and outbound)
+	 * NOTE: Inbound message MBOX 0...4 use IB channels 4...7. Therefore
+	 * offset +4 is added to IB MBOX number.
+	 */
+	for (i = 0; i < RIO_MAX_MBOX; i++) {
+		entries[TSI721_VECT_IMB0_RCV + i].entry =
+					TSI721_MSIX_IMSG_DQ_RCV(i + 4);
+		entries[TSI721_VECT_IMB0_INT + i].entry =
+					TSI721_MSIX_IMSG_INT(i + 4);
+		entries[TSI721_VECT_OMB0_DONE + i].entry =
+					TSI721_MSIX_OMSG_DONE(i);
+		entries[TSI721_VECT_OMB0_INT + i].entry =
+					TSI721_MSIX_OMSG_INT(i);
+	}
+
+	err = pci_enable_msix(priv->pdev, entries, ARRAY_SIZE(entries));
+	if (err) {
+		if (err > 0)
+			dev_info(&priv->pdev->dev,
+				 "Only %d MSI-X vectors available, "
+				 "not using MSI-X\n", err);
+		return err;
+	}
+
+	/*
+	 * Copy MSI-X vector information into tsi721 private structure
+	 */
+	priv->msix[TSI721_VECT_IDB].vector = entries[TSI721_VECT_IDB].vector;
+	snprintf(priv->msix[TSI721_VECT_IDB].irq_name, IRQ_DEVICE_NAME_MAX,
+		 DRV_NAME "-idb@pci:%s", pci_name(priv->pdev));
+	priv->msix[TSI721_VECT_PWRX].vector = entries[TSI721_VECT_PWRX].vector;
+	snprintf(priv->msix[TSI721_VECT_PWRX].irq_name, IRQ_DEVICE_NAME_MAX,
+		 DRV_NAME "-pwrx@pci:%s", pci_name(priv->pdev));
+
+	for (i = 0; i < RIO_MAX_MBOX; i++) {
+		priv->msix[TSI721_VECT_IMB0_RCV + i].vector =
+				entries[TSI721_VECT_IMB0_RCV + i].vector;
+		snprintf(priv->msix[TSI721_VECT_IMB0_RCV + i].irq_name,
+			 IRQ_DEVICE_NAME_MAX, DRV_NAME "-imbr%d@pci:%s",
+			 i, pci_name(priv->pdev));
+
+		priv->msix[TSI721_VECT_IMB0_INT + i].vector =
+				entries[TSI721_VECT_IMB0_INT + i].vector;
+		snprintf(priv->msix[TSI721_VECT_IMB0_INT + i].irq_name,
+			 IRQ_DEVICE_NAME_MAX, DRV_NAME "-imbi%d@pci:%s",
+			 i, pci_name(priv->pdev));
+
+		priv->msix[TSI721_VECT_OMB0_DONE + i].vector =
+				entries[TSI721_VECT_OMB0_DONE + i].vector;
+		snprintf(priv->msix[TSI721_VECT_OMB0_DONE + i].irq_name,
+			 IRQ_DEVICE_NAME_MAX, DRV_NAME "-ombd%d@pci:%s",
+			 i, pci_name(priv->pdev));
+
+		priv->msix[TSI721_VECT_OMB0_INT + i].vector =
+				entries[TSI721_VECT_OMB0_INT + i].vector;
+		snprintf(priv->msix[TSI721_VECT_OMB0_INT + i].irq_name,
+			 IRQ_DEVICE_NAME_MAX, DRV_NAME "-ombi%d@pci:%s",
+			 i, pci_name(priv->pdev));
+	}
+
+	return 0;
+}
+#endif /* CONFIG_PCI_MSI */
+
+static int tsi721_request_irq(struct rio_mport *mport)
+{
+	struct tsi721_device *priv = mport->priv;
+	int err;
+
+#ifdef CONFIG_PCI_MSI
+	if (priv->flags & TSI721_USING_MSIX)
+		err = tsi721_request_msix(mport);
+	else
+#endif
+		err = request_irq(priv->pdev->irq, tsi721_irqhandler,
+			  (priv->flags & TSI721_USING_MSI) ? 0 : IRQF_SHARED,
+			  DRV_NAME, (void *)mport);
+
+	if (err)
+		dev_err(&priv->pdev->dev,
+			"Unable to allocate interrupt, Error: %d\n", err);
+
+	return err;
+}
+
+/**
+ * tsi721_init_pc2sr_mapping - initializes outbound (PCIe->SRIO)
+ * translation regions.
+ * @priv: pointer to tsi721 private data
+ *
+ * Disables SREP translation regions.
+ */
+static void tsi721_init_pc2sr_mapping(struct tsi721_device *priv)
+{
+	int i;
+
+	/* Disable all PC2SR translation windows */
+	for (i = 0; i < TSI721_OBWIN_NUM; i++)
+		iowrite32(0, priv->regs + TSI721_OBWINLB(i));
+}
+
+/**
+ * tsi721_init_sr2pc_mapping - initializes inbound (SRIO->PCIe)
+ * translation regions.
+ * @priv: pointer to tsi721 private data
+ *
+ * Disables inbound windows.
+ */
+static void tsi721_init_sr2pc_mapping(struct tsi721_device *priv)
+{
+	int i;
+
+	/* Disable all SR2PC inbound windows */
+	for (i = 0; i < TSI721_IBWIN_NUM; i++)
+		iowrite32(0, priv->regs + TSI721_IBWINLB(i));
+}
+
+/**
+ * tsi721_port_write_init - Inbound port write interface init
+ * @priv: pointer to tsi721 private data
+ *
+ * Initializes inbound port write handler.
+ * Returns %0 on success or %-ENOMEM on failure.
+ */
+static int tsi721_port_write_init(struct tsi721_device *priv)
+{
+	priv->pw_discard_count = 0;
+	INIT_WORK(&priv->pw_work, tsi721_pw_dpc);
+	spin_lock_init(&priv->pw_fifo_lock);
+	if (kfifo_alloc(&priv->pw_fifo,
+			TSI721_RIO_PW_MSG_SIZE * 32, GFP_KERNEL)) {
+		dev_err(&priv->pdev->dev, "PW FIFO allocation failed\n");
+		return -ENOMEM;
+	}
+
+	/* Use reliable port-write capture mode */
+	iowrite32(TSI721_RIO_PW_CTL_PWC_REL, priv->regs + TSI721_RIO_PW_CTL);
+	return 0;
+}
+
+static int tsi721_doorbell_init(struct tsi721_device *priv)
+{
+	/* Outbound Doorbells do not require any setup.
+	 * Tsi721 uses dedicated PCI BAR1 to generate doorbells.
+	 * That BAR1 was mapped during the probe routine.
+	 */
+
+	/* Initialize Inbound Doorbell processing DPC and queue */
+	priv->db_discard_count = 0;
+	INIT_WORK(&priv->idb_work, tsi721_db_dpc);
+
+	/* Allocate buffer for inbound doorbells queue */
+	priv->idb_base = dma_alloc_coherent(&priv->pdev->dev,
+				IDB_QSIZE * TSI721_IDB_ENTRY_SIZE,
+				&priv->idb_dma, GFP_KERNEL);
+	if (!priv->idb_base)
+		return -ENOMEM;
+
+	memset(priv->idb_base, 0, IDB_QSIZE * TSI721_IDB_ENTRY_SIZE);
+
+	dev_dbg(&priv->pdev->dev, "Allocated IDB buffer @ %p (phys = %llx)\n",
+		priv->idb_base, (unsigned long long)priv->idb_dma);
+
+	iowrite32(TSI721_IDQ_SIZE_VAL(IDB_QSIZE),
+		priv->regs + TSI721_IDQ_SIZE(IDB_QUEUE));
+	iowrite32(((u64)priv->idb_dma >> 32),
+		priv->regs + TSI721_IDQ_BASEU(IDB_QUEUE));
+	iowrite32(((u64)priv->idb_dma & TSI721_IDQ_BASEL_ADDR),
+		priv->regs + TSI721_IDQ_BASEL(IDB_QUEUE));
+	/* Enable accepting all inbound doorbells */
+	iowrite32(0, priv->regs + TSI721_IDQ_MASK(IDB_QUEUE));
+
+	iowrite32(TSI721_IDQ_INIT, priv->regs + TSI721_IDQ_CTL(IDB_QUEUE));
+
+	iowrite32(0, priv->regs + TSI721_IDQ_RP(IDB_QUEUE));
+
+	return 0;
+}
+
+static void tsi721_doorbell_free(struct tsi721_device *priv)
+{
+	if (priv->idb_base == NULL)
+		return;
+
+	/* Free buffer allocated for inbound doorbell queue */
+	dma_free_coherent(&priv->pdev->dev, IDB_QSIZE * TSI721_IDB_ENTRY_SIZE,
+			  priv->idb_base, priv->idb_dma);
+	priv->idb_base = NULL;
+}
+
+static int tsi721_bdma_ch_init(struct tsi721_device *priv, int chnum)
+{
+	struct tsi721_dma_desc *bd_ptr;
+	u64		*sts_ptr;
+	dma_addr_t	bd_phys, sts_phys;
+	int		sts_size;
+	int		bd_num = priv->bdma[chnum].bd_num;
+
+	dev_dbg(&priv->pdev->dev, "Init Block DMA Engine, CH%d\n", chnum);
+
+	/*
+	 * Initialize DMA channel for maintenance requests
+	 */
+
+	/* Allocate space for DMA descriptors */
+	bd_ptr = dma_alloc_coherent(&priv->pdev->dev,
+					bd_num * sizeof(struct tsi721_dma_desc),
+					&bd_phys, GFP_KERNEL);
+	if (!bd_ptr)
+		return -ENOMEM;
+
+	priv->bdma[chnum].bd_phys = bd_phys;
+	priv->bdma[chnum].bd_base = bd_ptr;
+
+	memset(bd_ptr, 0, bd_num * sizeof(struct tsi721_dma_desc));
+
+	dev_dbg(&priv->pdev->dev, "DMA descriptors @ %p (phys = %llx)\n",
+		bd_ptr, (unsigned long long)bd_phys);
+
+	/* Allocate space for descriptor status FIFO */
+	sts_size = (bd_num >= TSI721_DMA_MINSTSSZ) ?
+					bd_num : TSI721_DMA_MINSTSSZ;
+	sts_size = roundup_pow_of_two(sts_size);
+	sts_ptr = dma_alloc_coherent(&priv->pdev->dev,
+				     sts_size * sizeof(struct tsi721_dma_sts),
+				     &sts_phys, GFP_KERNEL);
+	if (!sts_ptr) {
+		/* Free space allocated for DMA descriptors */
+		dma_free_coherent(&priv->pdev->dev,
+				  bd_num * sizeof(struct tsi721_dma_desc),
+				  bd_ptr, bd_phys);
+		priv->bdma[chnum].bd_base = NULL;
+		return -ENOMEM;
+	}
+
+	priv->bdma[chnum].sts_phys = sts_phys;
+	priv->bdma[chnum].sts_base = sts_ptr;
+	priv->bdma[chnum].sts_size = sts_size;
+
+	memset(sts_ptr, 0, sts_size);
+
+	dev_dbg(&priv->pdev->dev,
+		"desc status FIFO @ %p (phys = %llx) size=0x%x\n",
+		sts_ptr, (unsigned long long)sts_phys, sts_size);
+
+	/* Initialize DMA descriptors ring */
+	bd_ptr[bd_num - 1].type_id = cpu_to_le32(DTYPE3 << 29);
+	bd_ptr[bd_num - 1].next_lo = cpu_to_le32((u64)bd_phys &
+						 TSI721_DMAC_DPTRL_MASK);
+	bd_ptr[bd_num - 1].next_hi = cpu_to_le32((u64)bd_phys >> 32);
+
+	/* Setup DMA descriptor pointers */
+	iowrite32(((u64)bd_phys >> 32),
+		priv->regs + TSI721_DMAC_DPTRH(chnum));
+	iowrite32(((u64)bd_phys & TSI721_DMAC_DPTRL_MASK),
+		priv->regs + TSI721_DMAC_DPTRL(chnum));
+
+	/* Setup descriptor status FIFO */
+	iowrite32(((u64)sts_phys >> 32),
+		priv->regs + TSI721_DMAC_DSBH(chnum));
+	iowrite32(((u64)sts_phys & TSI721_DMAC_DSBL_MASK),
+		priv->regs + TSI721_DMAC_DSBL(chnum));
+	iowrite32(TSI721_DMAC_DSSZ_SIZE(sts_size),
+		priv->regs + TSI721_DMAC_DSSZ(chnum));
+
+	/* Clear interrupt bits */
+	iowrite32(TSI721_DMAC_INT_ALL,
+		priv->regs + TSI721_DMAC_INT(chnum));
+
+	ioread32(priv->regs + TSI721_DMAC_INT(chnum));
+
+	/* Toggle DMA channel initialization */
+	iowrite32(TSI721_DMAC_CTL_INIT,	priv->regs + TSI721_DMAC_CTL(chnum));
+	ioread32(priv->regs + TSI721_DMAC_CTL(chnum));
+	udelay(10);
+
+	return 0;
+}
+
+static int tsi721_bdma_ch_free(struct tsi721_device *priv, int chnum)
+{
+	u32 ch_stat;
+
+	if (priv->bdma[chnum].bd_base == NULL)
+		return 0;
+
+	/* Check if DMA channel still running */
+	ch_stat = ioread32(priv->regs +	TSI721_DMAC_STS(chnum));
+	if (ch_stat & TSI721_DMAC_STS_RUN)
+		return -EFAULT;
+
+	/* Put DMA channel into init state */
+	iowrite32(TSI721_DMAC_CTL_INIT,
+		priv->regs + TSI721_DMAC_CTL(chnum));
+
+	/* Free space allocated for DMA descriptors */
+	dma_free_coherent(&priv->pdev->dev,
+		priv->bdma[chnum].bd_num * sizeof(struct tsi721_dma_desc),
+		priv->bdma[chnum].bd_base, priv->bdma[chnum].bd_phys);
+	priv->bdma[chnum].bd_base = NULL;
+
+	/* Free space allocated for status FIFO */
+	dma_free_coherent(&priv->pdev->dev,
+		priv->bdma[chnum].sts_size * sizeof(struct tsi721_dma_sts),
+		priv->bdma[chnum].sts_base, priv->bdma[chnum].sts_phys);
+	priv->bdma[chnum].sts_base = NULL;
+	return 0;
+}
+
+static int tsi721_bdma_init(struct tsi721_device *priv)
+{
+	/* Initialize BDMA channel allocated for RapidIO maintenance read/write
+	 * request generation
+	 */
+	priv->bdma[TSI721_DMACH_MAINT].bd_num = 2;
+	if (tsi721_bdma_ch_init(priv, TSI721_DMACH_MAINT)) {
+		dev_err(&priv->pdev->dev, "Unable to initialize maintenance DMA"
+			" channel %d, aborting\n", TSI721_DMACH_MAINT);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void tsi721_bdma_free(struct tsi721_device *priv)
+{
+	tsi721_bdma_ch_free(priv, TSI721_DMACH_MAINT);
+}
+
+/* Enable Inbound Messaging Interrupts */
+static void
+tsi721_imsg_interrupt_enable(struct tsi721_device *priv, int ch,
+				  u32 inte_mask)
+{
+	u32 rval;
+
+	if (!inte_mask)
+		return;
+
+	/* Clear pending Inbound Messaging interrupts */
+	iowrite32(inte_mask, priv->regs + TSI721_IBDMAC_INT(ch));
+
+	/* Enable Inbound Messaging interrupts */
+	rval = ioread32(priv->regs + TSI721_IBDMAC_INTE(ch));
+	iowrite32(rval | inte_mask, priv->regs + TSI721_IBDMAC_INTE(ch));
+
+	if (priv->flags & TSI721_USING_MSIX)
+		return; /* Finished if we are in MSI-X mode */
+
+	/*
+	 * For MSI and INTA interrupt signalling we need to enable next levels
+	 */
+
+	/* Enable Device Channel Interrupt */
+	rval = ioread32(priv->regs + TSI721_DEV_CHAN_INTE);
+	iowrite32(rval | TSI721_INT_IMSG_CHAN(ch),
+		  priv->regs + TSI721_DEV_CHAN_INTE);
+}
+
+/* Disable Inbound Messaging Interrupts */
+static void
+tsi721_imsg_interrupt_disable(struct tsi721_device *priv, int ch,
+				   u32 inte_mask)
+{
+	u32 rval;
+
+	if (!inte_mask)
+		return;
+
+	/* Clear pending Inbound Messaging interrupts */
+	iowrite32(inte_mask, priv->regs + TSI721_IBDMAC_INT(ch));
+
+	/* Disable Inbound Messaging interrupts */
+	rval = ioread32(priv->regs + TSI721_IBDMAC_INTE(ch));
+	rval &= ~inte_mask;
+	iowrite32(rval, priv->regs + TSI721_IBDMAC_INTE(ch));
+
+	if (priv->flags & TSI721_USING_MSIX)
+		return; /* Finished if we are in MSI-X mode */
+
+	/*
+	 * For MSI and INTA interrupt signalling we need to disable next levels
+	 */
+
+	/* Disable Device Channel Interrupt */
+	rval = ioread32(priv->regs + TSI721_DEV_CHAN_INTE);
+	rval &= ~TSI721_INT_IMSG_CHAN(ch);
+	iowrite32(rval, priv->regs + TSI721_DEV_CHAN_INTE);
+}
+
+/* Enable Outbound Messaging interrupts */
+static void
+tsi721_omsg_interrupt_enable(struct tsi721_device *priv, int ch,
+				  u32 inte_mask)
+{
+	u32 rval;
+
+	if (!inte_mask)
+		return;
+
+	/* Clear pending Outbound Messaging interrupts */
+	iowrite32(inte_mask, priv->regs + TSI721_OBDMAC_INT(ch));
+
+	/* Enable Outbound Messaging channel interrupts */
+	rval = ioread32(priv->regs + TSI721_OBDMAC_INTE(ch));
+	iowrite32(rval | inte_mask, priv->regs + TSI721_OBDMAC_INTE(ch));
+
+	if (priv->flags & TSI721_USING_MSIX)
+		return; /* Finished if we are in MSI-X mode */
+
+	/*
+	 * For MSI and INTA interrupt signalling we need to enable next levels
+	 */
+
+	/* Enable Device Channel Interrupt */
+	rval = ioread32(priv->regs + TSI721_DEV_CHAN_INTE);
+	iowrite32(rval | TSI721_INT_OMSG_CHAN(ch),
+		  priv->regs + TSI721_DEV_CHAN_INTE);
+}
+
+/* Disable Outbound Messaging interrupts */
+static void
+tsi721_omsg_interrupt_disable(struct tsi721_device *priv, int ch,
+				   u32 inte_mask)
+{
+	u32 rval;
+
+	if (!inte_mask)
+		return;
+
+	/* Clear pending Outbound Messaging interrupts */
+	iowrite32(inte_mask, priv->regs + TSI721_OBDMAC_INT(ch));
+
+	/* Disable Outbound Messaging interrupts */
+	rval = ioread32(priv->regs + TSI721_OBDMAC_INTE(ch));
+	rval &= ~inte_mask;
+	iowrite32(rval, priv->regs + TSI721_OBDMAC_INTE(ch));
+
+	if (priv->flags & TSI721_USING_MSIX)
+		return; /* Finished if we are in MSI-X mode */
+
+	/*
+	 * For MSI and INTA interrupt signalling we need to disable next levels
+	 */
+
+	/* Disable Device Channel Interrupt */
+	rval = ioread32(priv->regs + TSI721_DEV_CHAN_INTE);
+	rval &= ~TSI721_INT_OMSG_CHAN(ch);
+	iowrite32(rval, priv->regs + TSI721_DEV_CHAN_INTE);
+}
+
+/**
+ * tsi721_add_outb_message - Add message to the Tsi721 outbound message queue
+ * @mport: Master port with outbound message queue
+ * @rdev: Target of outbound message
+ * @mbox: Outbound mailbox
+ * @buffer: Message to add to outbound queue
+ * @len: Length of message
+ */
+static int
+tsi721_add_outb_message(struct rio_mport *mport, struct rio_dev *rdev, int mbox,
+			void *buffer, size_t len)
+{
+	struct tsi721_device *priv = mport->priv;
+	struct tsi721_omsg_desc *desc;
+	u32 tx_slot;
+
+	if (!priv->omsg_init[mbox] ||
+	    len > TSI721_MSG_MAX_SIZE || len < 8)
+		return -EINVAL;
+
+	tx_slot = priv->omsg_ring[mbox].tx_slot;
+
+	/* Copy copy message into transfer buffer */
+	memcpy(priv->omsg_ring[mbox].omq_base[tx_slot], buffer, len);
+
+	if (len & 0x7)
+		len += 8;
+
+	/* Build descriptor associated with buffer */
+	desc = priv->omsg_ring[mbox].omd_base;
+	desc[tx_slot].type_id = cpu_to_le32((DTYPE4 << 29) | rdev->destid);
+	if (tx_slot % 4 == 0)
+		desc[tx_slot].type_id |= cpu_to_le32(TSI721_OMD_IOF);
+
+	desc[tx_slot].msg_info =
+		cpu_to_le32((mport->sys_size << 26) | (mbox << 22) |
+			    (0xe << 12) | (len & 0xff8));
+	desc[tx_slot].bufptr_lo =
+		cpu_to_le32((u64)priv->omsg_ring[mbox].omq_phys[tx_slot] &
+			    0xffffffff);
+	desc[tx_slot].bufptr_hi =
+		cpu_to_le32((u64)priv->omsg_ring[mbox].omq_phys[tx_slot] >> 32);
+
+	priv->omsg_ring[mbox].wr_count++;
+
+	/* Go to next descriptor */
+	if (++priv->omsg_ring[mbox].tx_slot == priv->omsg_ring[mbox].size) {
+		priv->omsg_ring[mbox].tx_slot = 0;
+		/* Move through the ring link descriptor at the end */
+		priv->omsg_ring[mbox].wr_count++;
+	}
+
+	mb();
+
+	/* Set new write count value */
+	iowrite32(priv->omsg_ring[mbox].wr_count,
+		priv->regs + TSI721_OBDMAC_DWRCNT(mbox));
+	ioread32(priv->regs + TSI721_OBDMAC_DWRCNT(mbox));
+
+	return 0;
+}
+
+/**
+ * tsi721_omsg_handler - Outbound Message Interrupt Handler
+ * @priv: pointer to tsi721 private data
+ * @ch:   number of OB MSG channel to service
+ *
+ * Services channel interrupts from outbound messaging engine.
+ */
+static void tsi721_omsg_handler(struct tsi721_device *priv, int ch)
+{
+	u32 omsg_int;
+
+	spin_lock(&priv->omsg_ring[ch].lock);
+
+	omsg_int = ioread32(priv->regs + TSI721_OBDMAC_INT(ch));
+
+	if (omsg_int & TSI721_OBDMAC_INT_ST_FULL)
+		dev_info(&priv->pdev->dev,
+			"OB MBOX%d: Status FIFO is full\n", ch);
+
+	if (omsg_int & (TSI721_OBDMAC_INT_DONE | TSI721_OBDMAC_INT_IOF_DONE)) {
+		u32 srd_ptr;
+		u64 *sts_ptr, last_ptr = 0, prev_ptr = 0;
+		int i, j;
+		u32 tx_slot;
+
+		/*
+		 * Find last successfully processed descriptor
+		 */
+
+		/* Check and clear descriptor status FIFO entries */
+		srd_ptr = priv->omsg_ring[ch].sts_rdptr;
+		sts_ptr = priv->omsg_ring[ch].sts_base;
+		j = srd_ptr * 8;
+		while (sts_ptr[j]) {
+			for (i = 0; i < 8 && sts_ptr[j]; i++, j++) {
+				prev_ptr = last_ptr;
+				last_ptr = le64_to_cpu(sts_ptr[j]);
+				sts_ptr[j] = 0;
+			}
+
+			++srd_ptr;
+			srd_ptr %= priv->omsg_ring[ch].sts_size;
+			j = srd_ptr * 8;
+		}
+
+		if (last_ptr == 0)
+			goto no_sts_update;
+
+		priv->omsg_ring[ch].sts_rdptr = srd_ptr;
+		iowrite32(srd_ptr, priv->regs + TSI721_OBDMAC_DSRP(ch));
+
+		if (!priv->mport->outb_msg[ch].mcback)
+			goto no_sts_update;
+
+		/* Inform upper layer about transfer completion */
+
+		tx_slot = (last_ptr - (u64)priv->omsg_ring[ch].omd_phys)/
+						sizeof(struct tsi721_omsg_desc);
+
+		/*
+		 * Check if this is a Link Descriptor (LD).
+		 * If yes, ignore LD and use descriptor processed
+		 * before LD.
+		 */
+		if (tx_slot == priv->omsg_ring[ch].size) {
+			if (prev_ptr)
+				tx_slot = (prev_ptr -
+					(u64)priv->omsg_ring[ch].omd_phys)/
+						sizeof(struct tsi721_omsg_desc);
+			else
+				goto no_sts_update;
+		}
+
+		/* Move slot index to the next message to be sent */
+		++tx_slot;
+		if (tx_slot == priv->omsg_ring[ch].size)
+			tx_slot = 0;
+		BUG_ON(tx_slot >= priv->omsg_ring[ch].size);
+		priv->mport->outb_msg[ch].mcback(priv->mport,
+				priv->omsg_ring[ch].dev_id, ch,
+				tx_slot);
+	}
+
+no_sts_update:
+
+	if (omsg_int & TSI721_OBDMAC_INT_ERROR) {
+		/*
+		* Outbound message operation aborted due to error,
+		* reinitialize OB MSG channel
+		*/
+
+		dev_dbg(&priv->pdev->dev, "OB MSG ABORT ch_stat=%x\n",
+			ioread32(priv->regs + TSI721_OBDMAC_STS(ch)));
+
+		iowrite32(TSI721_OBDMAC_INT_ERROR,
+				priv->regs + TSI721_OBDMAC_INT(ch));
+		iowrite32(TSI721_OBDMAC_CTL_INIT,
+				priv->regs + TSI721_OBDMAC_CTL(ch));
+		ioread32(priv->regs + TSI721_OBDMAC_CTL(ch));
+
+		/* Inform upper level to clear all pending tx slots */
+		if (priv->mport->outb_msg[ch].mcback)
+			priv->mport->outb_msg[ch].mcback(priv->mport,
+					priv->omsg_ring[ch].dev_id, ch,
+					priv->omsg_ring[ch].tx_slot);
+		/* Synch tx_slot tracking */
+		iowrite32(priv->omsg_ring[ch].tx_slot,
+			priv->regs + TSI721_OBDMAC_DRDCNT(ch));
+		ioread32(priv->regs + TSI721_OBDMAC_DRDCNT(ch));
+		priv->omsg_ring[ch].wr_count = priv->omsg_ring[ch].tx_slot;
+		priv->omsg_ring[ch].sts_rdptr = 0;
+	}
+
+	/* Clear channel interrupts */
+	iowrite32(omsg_int, priv->regs + TSI721_OBDMAC_INT(ch));
+
+	if (!(priv->flags & TSI721_USING_MSIX)) {
+		u32 ch_inte;
+
+		/* Re-enable channel interrupts */
+		ch_inte = ioread32(priv->regs + TSI721_DEV_CHAN_INTE);
+		ch_inte |= TSI721_INT_OMSG_CHAN(ch);
+		iowrite32(ch_inte, priv->regs + TSI721_DEV_CHAN_INTE);
+	}
+
+	spin_unlock(&priv->omsg_ring[ch].lock);
+}
+
+/**
+ * tsi721_open_outb_mbox - Initialize Tsi721 outbound mailbox
+ * @mport: Master port implementing Outbound Messaging Engine
+ * @dev_id: Device specific pointer to pass on event
+ * @mbox: Mailbox to open
+ * @entries: Number of entries in the outbound mailbox ring
+ */
+static int tsi721_open_outb_mbox(struct rio_mport *mport, void *dev_id,
+				 int mbox, int entries)
+{
+	struct tsi721_device *priv = mport->priv;
+	struct tsi721_omsg_desc *bd_ptr;
+	int i, rc = 0;
+
+	if ((entries < TSI721_OMSGD_MIN_RING_SIZE) ||
+	    (entries > (TSI721_OMSGD_RING_SIZE)) ||
+	    (!is_power_of_2(entries)) || mbox >= RIO_MAX_MBOX) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	priv->omsg_ring[mbox].dev_id = dev_id;
+	priv->omsg_ring[mbox].size = entries;
+	priv->omsg_ring[mbox].sts_rdptr = 0;
+	spin_lock_init(&priv->omsg_ring[mbox].lock);
+
+	/* Outbound Msg Buffer allocation based on
+	   the number of maximum descriptor entries */
+	for (i = 0; i < entries; i++) {
+		priv->omsg_ring[mbox].omq_base[i] =
+			dma_alloc_coherent(
+				&priv->pdev->dev, TSI721_MSG_BUFFER_SIZE,
+				&priv->omsg_ring[mbox].omq_phys[i],
+				GFP_KERNEL);
+		if (priv->omsg_ring[mbox].omq_base[i] == NULL) {
+			dev_dbg(&priv->pdev->dev,
+				"Unable to allocate OB MSG data buffer for"
+				" MBOX%d\n", mbox);
+			rc = -ENOMEM;
+			goto out_buf;
+		}
+	}
+
+	/* Outbound message descriptor allocation */
+	priv->omsg_ring[mbox].omd_base = dma_alloc_coherent(
+				&priv->pdev->dev,
+				(entries + 1) * sizeof(struct tsi721_omsg_desc),
+				&priv->omsg_ring[mbox].omd_phys, GFP_KERNEL);
+	if (priv->omsg_ring[mbox].omd_base == NULL) {
+		dev_dbg(&priv->pdev->dev,
+			"Unable to allocate OB MSG descriptor memory "
+			"for MBOX%d\n", mbox);
+		rc = -ENOMEM;
+		goto out_buf;
+	}
+
+	priv->omsg_ring[mbox].tx_slot = 0;
+
+	/* Outbound message descriptor status FIFO allocation */
+	priv->omsg_ring[mbox].sts_size = roundup_pow_of_two(entries + 1);
+	priv->omsg_ring[mbox].sts_base = dma_alloc_coherent(&priv->pdev->dev,
+			priv->omsg_ring[mbox].sts_size *
+						sizeof(struct tsi721_dma_sts),
+			&priv->omsg_ring[mbox].sts_phys, GFP_KERNEL);
+	if (priv->omsg_ring[mbox].sts_base == NULL) {
+		dev_dbg(&priv->pdev->dev,
+			"Unable to allocate OB MSG descriptor status FIFO "
+			"for MBOX%d\n", mbox);
+		rc = -ENOMEM;
+		goto out_desc;
+	}
+
+	memset(priv->omsg_ring[mbox].sts_base, 0,
+		entries * sizeof(struct tsi721_dma_sts));
+
+	/*
+	 * Configure Outbound Messaging Engine
+	 */
+
+	/* Setup Outbound Message descriptor pointer */
+	iowrite32(((u64)priv->omsg_ring[mbox].omd_phys >> 32),
+			priv->regs + TSI721_OBDMAC_DPTRH(mbox));
+	iowrite32(((u64)priv->omsg_ring[mbox].omd_phys &
+					TSI721_OBDMAC_DPTRL_MASK),
+			priv->regs + TSI721_OBDMAC_DPTRL(mbox));
+
+	/* Setup Outbound Message descriptor status FIFO */
+	iowrite32(((u64)priv->omsg_ring[mbox].sts_phys >> 32),
+			priv->regs + TSI721_OBDMAC_DSBH(mbox));
+	iowrite32(((u64)priv->omsg_ring[mbox].sts_phys &
+					TSI721_OBDMAC_DSBL_MASK),
+			priv->regs + TSI721_OBDMAC_DSBL(mbox));
+	iowrite32(TSI721_DMAC_DSSZ_SIZE(priv->omsg_ring[mbox].sts_size),
+		priv->regs + (u32)TSI721_OBDMAC_DSSZ(mbox));
+
+	/* Enable interrupts */
+
+#ifdef CONFIG_PCI_MSI
+	if (priv->flags & TSI721_USING_MSIX) {
+		/* Request interrupt service if we are in MSI-X mode */
+		rc = request_irq(
+			priv->msix[TSI721_VECT_OMB0_DONE + mbox].vector,
+			tsi721_omsg_msix, 0,
+			priv->msix[TSI721_VECT_OMB0_DONE + mbox].irq_name,
+			(void *)mport);
+
+		if (rc) {
+			dev_dbg(&priv->pdev->dev,
+				"Unable to allocate MSI-X interrupt for "
+				"OBOX%d-DONE\n", mbox);
+			goto out_stat;
+		}
+
+		rc = request_irq(priv->msix[TSI721_VECT_OMB0_INT + mbox].vector,
+			tsi721_omsg_msix, 0,
+			priv->msix[TSI721_VECT_OMB0_INT + mbox].irq_name,
+			(void *)mport);
+
+		if (rc)	{
+			dev_dbg(&priv->pdev->dev,
+				"Unable to allocate MSI-X interrupt for "
+				"MBOX%d-INT\n", mbox);
+			free_irq(
+				priv->msix[TSI721_VECT_OMB0_DONE + mbox].vector,
+				(void *)mport);
+			goto out_stat;
+		}
+	}
+#endif /* CONFIG_PCI_MSI */
+
+	tsi721_omsg_interrupt_enable(priv, mbox, TSI721_OBDMAC_INT_ALL);
+
+	/* Initialize Outbound Message descriptors ring */
+	bd_ptr = priv->omsg_ring[mbox].omd_base;
+	bd_ptr[entries].type_id = cpu_to_le32(DTYPE5 << 29);
+	bd_ptr[entries].msg_info = 0;
+	bd_ptr[entries].next_lo =
+		cpu_to_le32((u64)priv->omsg_ring[mbox].omd_phys &
+		TSI721_OBDMAC_DPTRL_MASK);
+	bd_ptr[entries].next_hi =
+		cpu_to_le32((u64)priv->omsg_ring[mbox].omd_phys >> 32);
+	priv->omsg_ring[mbox].wr_count = 0;
+	mb();
+
+	/* Initialize Outbound Message engine */
+	iowrite32(TSI721_OBDMAC_CTL_INIT, priv->regs + TSI721_OBDMAC_CTL(mbox));
+	ioread32(priv->regs + TSI721_OBDMAC_DWRCNT(mbox));
+	udelay(10);
+
+	priv->omsg_init[mbox] = 1;
+
+	return 0;
+
+#ifdef CONFIG_PCI_MSI
+out_stat:
+	dma_free_coherent(&priv->pdev->dev,
+		priv->omsg_ring[mbox].sts_size * sizeof(struct tsi721_dma_sts),
+		priv->omsg_ring[mbox].sts_base,
+		priv->omsg_ring[mbox].sts_phys);
+
+	priv->omsg_ring[mbox].sts_base = NULL;
+#endif /* CONFIG_PCI_MSI */
+
+out_desc:
+	dma_free_coherent(&priv->pdev->dev,
+		(entries + 1) * sizeof(struct tsi721_omsg_desc),
+		priv->omsg_ring[mbox].omd_base,
+		priv->omsg_ring[mbox].omd_phys);
+
+	priv->omsg_ring[mbox].omd_base = NULL;
+
+out_buf:
+	for (i = 0; i < priv->omsg_ring[mbox].size; i++) {
+		if (priv->omsg_ring[mbox].omq_base[i]) {
+			dma_free_coherent(&priv->pdev->dev,
+				TSI721_MSG_BUFFER_SIZE,
+				priv->omsg_ring[mbox].omq_base[i],
+				priv->omsg_ring[mbox].omq_phys[i]);
+
+			priv->omsg_ring[mbox].omq_base[i] = NULL;
+		}
+	}
+
+out:
+	return rc;
+}
+
+/**
+ * tsi721_close_outb_mbox - Close Tsi721 outbound mailbox
+ * @mport: Master port implementing the outbound message unit
+ * @mbox: Mailbox to close
+ */
+static void tsi721_close_outb_mbox(struct rio_mport *mport, int mbox)
+{
+	struct tsi721_device *priv = mport->priv;
+	u32 i;
+
+	if (!priv->omsg_init[mbox])
+		return;
+	priv->omsg_init[mbox] = 0;
+
+	/* Disable Interrupts */
+
+	tsi721_omsg_interrupt_disable(priv, mbox, TSI721_OBDMAC_INT_ALL);
+
+#ifdef CONFIG_PCI_MSI
+	if (priv->flags & TSI721_USING_MSIX) {
+		free_irq(priv->msix[TSI721_VECT_OMB0_DONE + mbox].vector,
+			 (void *)mport);
+		free_irq(priv->msix[TSI721_VECT_OMB0_INT + mbox].vector,
+			 (void *)mport);
+	}
+#endif /* CONFIG_PCI_MSI */
+
+	/* Free OMSG Descriptor Status FIFO */
+	dma_free_coherent(&priv->pdev->dev,
+		priv->omsg_ring[mbox].sts_size * sizeof(struct tsi721_dma_sts),
+		priv->omsg_ring[mbox].sts_base,
+		priv->omsg_ring[mbox].sts_phys);
+
+	priv->omsg_ring[mbox].sts_base = NULL;
+
+	/* Free OMSG descriptors */
+	dma_free_coherent(&priv->pdev->dev,
+		(priv->omsg_ring[mbox].size + 1) *
+			sizeof(struct tsi721_omsg_desc),
+		priv->omsg_ring[mbox].omd_base,
+		priv->omsg_ring[mbox].omd_phys);
+
+	priv->omsg_ring[mbox].omd_base = NULL;
+
+	/* Free message buffers */
+	for (i = 0; i < priv->omsg_ring[mbox].size; i++) {
+		if (priv->omsg_ring[mbox].omq_base[i]) {
+			dma_free_coherent(&priv->pdev->dev,
+				TSI721_MSG_BUFFER_SIZE,
+				priv->omsg_ring[mbox].omq_base[i],
+				priv->omsg_ring[mbox].omq_phys[i]);
+
+			priv->omsg_ring[mbox].omq_base[i] = NULL;
+		}
+	}
+}
+
+/**
+ * tsi721_imsg_handler - Inbound Message Interrupt Handler
+ * @priv: pointer to tsi721 private data
+ * @ch: inbound message channel number to service
+ *
+ * Services channel interrupts from inbound messaging engine.
+ */
+static void tsi721_imsg_handler(struct tsi721_device *priv, int ch)
+{
+	u32 mbox = ch - 4;
+	u32 imsg_int;
+
+	spin_lock(&priv->imsg_ring[mbox].lock);
+
+	imsg_int = ioread32(priv->regs + TSI721_IBDMAC_INT(ch));
+
+	if (imsg_int & TSI721_IBDMAC_INT_SRTO)
+		dev_info(&priv->pdev->dev, "IB MBOX%d SRIO timeout\n",
+			mbox);
+
+	if (imsg_int & TSI721_IBDMAC_INT_PC_ERROR)
+		dev_info(&priv->pdev->dev, "IB MBOX%d PCIe error\n",
+			mbox);
+
+	if (imsg_int & TSI721_IBDMAC_INT_FQ_LOW)
+		dev_info(&priv->pdev->dev,
+			"IB MBOX%d IB free queue low\n", mbox);
+
+	/* Clear IB channel interrupts */
+	iowrite32(imsg_int, priv->regs + TSI721_IBDMAC_INT(ch));
+
+	/* If an IB Msg is received notify the upper layer */
+	if (imsg_int & TSI721_IBDMAC_INT_DQ_RCV &&
+		priv->mport->inb_msg[mbox].mcback)
+		priv->mport->inb_msg[mbox].mcback(priv->mport,
+				priv->imsg_ring[mbox].dev_id, mbox, -1);
+
+	if (!(priv->flags & TSI721_USING_MSIX)) {
+		u32 ch_inte;
+
+		/* Re-enable channel interrupts */
+		ch_inte = ioread32(priv->regs + TSI721_DEV_CHAN_INTE);
+		ch_inte |= TSI721_INT_IMSG_CHAN(ch);
+		iowrite32(ch_inte, priv->regs + TSI721_DEV_CHAN_INTE);
+	}
+
+	spin_unlock(&priv->imsg_ring[mbox].lock);
+}
+
+/**
+ * tsi721_open_inb_mbox - Initialize Tsi721 inbound mailbox
+ * @mport: Master port implementing the Inbound Messaging Engine
+ * @dev_id: Device specific pointer to pass on event
+ * @mbox: Mailbox to open
+ * @entries: Number of entries in the inbound mailbox ring
+ */
+static int tsi721_open_inb_mbox(struct rio_mport *mport, void *dev_id,
+				int mbox, int entries)
+{
+	struct tsi721_device *priv = mport->priv;
+	int ch = mbox + 4;
+	int i;
+	u64 *free_ptr;
+	int rc = 0;
+
+	if ((entries < TSI721_IMSGD_MIN_RING_SIZE) ||
+	    (entries > TSI721_IMSGD_RING_SIZE) ||
+	    (!is_power_of_2(entries)) || mbox >= RIO_MAX_MBOX) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	/* Initialize IB Messaging Ring */
+	priv->imsg_ring[mbox].dev_id = dev_id;
+	priv->imsg_ring[mbox].size = entries;
+	priv->imsg_ring[mbox].rx_slot = 0;
+	priv->imsg_ring[mbox].desc_rdptr = 0;
+	priv->imsg_ring[mbox].fq_wrptr = 0;
+	for (i = 0; i < priv->imsg_ring[mbox].size; i++)
+		priv->imsg_ring[mbox].imq_base[i] = NULL;
+	spin_lock_init(&priv->imsg_ring[mbox].lock);
+
+	/* Allocate buffers for incoming messages */
+	priv->imsg_ring[mbox].buf_base =
+		dma_alloc_coherent(&priv->pdev->dev,
+				   entries * TSI721_MSG_BUFFER_SIZE,
+				   &priv->imsg_ring[mbox].buf_phys,
+				   GFP_KERNEL);
+
+	if (priv->imsg_ring[mbox].buf_base == NULL) {
+		dev_err(&priv->pdev->dev,
+			"Failed to allocate buffers for IB MBOX%d\n", mbox);
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	/* Allocate memory for circular free list */
+	priv->imsg_ring[mbox].imfq_base =
+		dma_alloc_coherent(&priv->pdev->dev,
+				   entries * 8,
+				   &priv->imsg_ring[mbox].imfq_phys,
+				   GFP_KERNEL);
+
+	if (priv->imsg_ring[mbox].imfq_base == NULL) {
+		dev_err(&priv->pdev->dev,
+			"Failed to allocate free queue for IB MBOX%d\n", mbox);
+		rc = -ENOMEM;
+		goto out_buf;
+	}
+
+	/* Allocate memory for Inbound message descriptors */
+	priv->imsg_ring[mbox].imd_base =
+		dma_alloc_coherent(&priv->pdev->dev,
+				   entries * sizeof(struct tsi721_imsg_desc),
+				   &priv->imsg_ring[mbox].imd_phys, GFP_KERNEL);
+
+	if (priv->imsg_ring[mbox].imd_base == NULL) {
+		dev_err(&priv->pdev->dev,
+			"Failed to allocate descriptor memory for IB MBOX%d\n",
+			mbox);
+		rc = -ENOMEM;
+		goto out_dma;
+	}
+
+	/* Fill free buffer pointer list */
+	free_ptr = priv->imsg_ring[mbox].imfq_base;
+	for (i = 0; i < entries; i++)
+		free_ptr[i] = cpu_to_le64(
+				(u64)(priv->imsg_ring[mbox].buf_phys) +
+				i * 0x1000);
+
+	mb();
+
+	/*
+	 * For mapping of inbound SRIO Messages into appropriate queues we need
+	 * to set Inbound Device ID register in the messaging engine. We do it
+	 * once when first inbound mailbox is requested.
+	 */
+	if (!(priv->flags & TSI721_IMSGID_SET)) {
+		iowrite32((u32)priv->mport->host_deviceid,
+			priv->regs + TSI721_IB_DEVID);
+		priv->flags |= TSI721_IMSGID_SET;
+	}
+
+	/*
+	 * Configure Inbound Messaging channel (ch = mbox + 4)
+	 */
+
+	/* Setup Inbound Message free queue */
+	iowrite32(((u64)priv->imsg_ring[mbox].imfq_phys >> 32),
+		priv->regs + TSI721_IBDMAC_FQBH(ch));
+	iowrite32(((u64)priv->imsg_ring[mbox].imfq_phys &
+			TSI721_IBDMAC_FQBL_MASK),
+		priv->regs+TSI721_IBDMAC_FQBL(ch));
+	iowrite32(TSI721_DMAC_DSSZ_SIZE(entries),
+		priv->regs + TSI721_IBDMAC_FQSZ(ch));
+
+	/* Setup Inbound Message descriptor queue */
+	iowrite32(((u64)priv->imsg_ring[mbox].imd_phys >> 32),
+		priv->regs + TSI721_IBDMAC_DQBH(ch));
+	iowrite32(((u32)priv->imsg_ring[mbox].imd_phys &
+		   (u32)TSI721_IBDMAC_DQBL_MASK),
+		priv->regs+TSI721_IBDMAC_DQBL(ch));
+	iowrite32(TSI721_DMAC_DSSZ_SIZE(entries),
+		priv->regs + TSI721_IBDMAC_DQSZ(ch));
+
+	/* Enable interrupts */
+
+#ifdef CONFIG_PCI_MSI
+	if (priv->flags & TSI721_USING_MSIX) {
+		/* Request interrupt service if we are in MSI-X mode */
+		rc = request_irq(priv->msix[TSI721_VECT_IMB0_RCV + mbox].vector,
+			tsi721_imsg_msix, 0,
+			priv->msix[TSI721_VECT_IMB0_RCV + mbox].irq_name,
+			(void *)mport);
+
+		if (rc) {
+			dev_dbg(&priv->pdev->dev,
+				"Unable to allocate MSI-X interrupt for "
+				"IBOX%d-DONE\n", mbox);
+			goto out_desc;
+		}
+
+		rc = request_irq(priv->msix[TSI721_VECT_IMB0_INT + mbox].vector,
+			tsi721_imsg_msix, 0,
+			priv->msix[TSI721_VECT_IMB0_INT + mbox].irq_name,
+			(void *)mport);
+
+		if (rc)	{
+			dev_dbg(&priv->pdev->dev,
+				"Unable to allocate MSI-X interrupt for "
+				"IBOX%d-INT\n", mbox);
+			free_irq(
+				priv->msix[TSI721_VECT_IMB0_RCV + mbox].vector,
+				(void *)mport);
+			goto out_desc;
+		}
+	}
+#endif /* CONFIG_PCI_MSI */
+
+	tsi721_imsg_interrupt_enable(priv, ch, TSI721_IBDMAC_INT_ALL);
+
+	/* Initialize Inbound Message Engine */
+	iowrite32(TSI721_IBDMAC_CTL_INIT, priv->regs + TSI721_IBDMAC_CTL(ch));
+	ioread32(priv->regs + TSI721_IBDMAC_CTL(ch));
+	udelay(10);
+	priv->imsg_ring[mbox].fq_wrptr = entries - 1;
+	iowrite32(entries - 1, priv->regs + TSI721_IBDMAC_FQWP(ch));
+
+	priv->imsg_init[mbox] = 1;
+	return 0;
+
+#ifdef CONFIG_PCI_MSI
+out_desc:
+	dma_free_coherent(&priv->pdev->dev,
+		priv->imsg_ring[mbox].size * sizeof(struct tsi721_imsg_desc),
+		priv->imsg_ring[mbox].imd_base,
+		priv->imsg_ring[mbox].imd_phys);
+
+	priv->imsg_ring[mbox].imd_base = NULL;
+#endif /* CONFIG_PCI_MSI */
+
+out_dma:
+	dma_free_coherent(&priv->pdev->dev,
+		priv->imsg_ring[mbox].size * 8,
+		priv->imsg_ring[mbox].imfq_base,
+		priv->imsg_ring[mbox].imfq_phys);
+
+	priv->imsg_ring[mbox].imfq_base = NULL;
+
+out_buf:
+	dma_free_coherent(&priv->pdev->dev,
+		priv->imsg_ring[mbox].size * TSI721_MSG_BUFFER_SIZE,
+		priv->imsg_ring[mbox].buf_base,
+		priv->imsg_ring[mbox].buf_phys);
+
+	priv->imsg_ring[mbox].buf_base = NULL;
+
+out:
+	return rc;
+}
+
+/**
+ * tsi721_close_inb_mbox - Shut down Tsi721 inbound mailbox
+ * @mport: Master port implementing the Inbound Messaging Engine
+ * @mbox: Mailbox to close
+ */
+static void tsi721_close_inb_mbox(struct rio_mport *mport, int mbox)
+{
+	struct tsi721_device *priv = mport->priv;
+	u32 rx_slot;
+	int ch = mbox + 4;
+
+	if (!priv->imsg_init[mbox]) /* mbox isn't initialized yet */
+		return;
+	priv->imsg_init[mbox] = 0;
+
+	/* Disable Inbound Messaging Engine */
+
+	/* Disable Interrupts */
+	tsi721_imsg_interrupt_disable(priv, ch, TSI721_OBDMAC_INT_MASK);
+
+#ifdef CONFIG_PCI_MSI
+	if (priv->flags & TSI721_USING_MSIX) {
+		free_irq(priv->msix[TSI721_VECT_IMB0_RCV + mbox].vector,
+				(void *)mport);
+		free_irq(priv->msix[TSI721_VECT_IMB0_INT + mbox].vector,
+				(void *)mport);
+	}
+#endif /* CONFIG_PCI_MSI */
+
+	/* Clear Inbound Buffer Queue */
+	for (rx_slot = 0; rx_slot < priv->imsg_ring[mbox].size; rx_slot++)
+		priv->imsg_ring[mbox].imq_base[rx_slot] = NULL;
+
+	/* Free memory allocated for message buffers */
+	dma_free_coherent(&priv->pdev->dev,
+		priv->imsg_ring[mbox].size * TSI721_MSG_BUFFER_SIZE,
+		priv->imsg_ring[mbox].buf_base,
+		priv->imsg_ring[mbox].buf_phys);
+
+	priv->imsg_ring[mbox].buf_base = NULL;
+
+	/* Free memory allocated for free pointr list */
+	dma_free_coherent(&priv->pdev->dev,
+		priv->imsg_ring[mbox].size * 8,
+		priv->imsg_ring[mbox].imfq_base,
+		priv->imsg_ring[mbox].imfq_phys);
+
+	priv->imsg_ring[mbox].imfq_base = NULL;
+
+	/* Free memory allocated for RX descriptors */
+	dma_free_coherent(&priv->pdev->dev,
+		priv->imsg_ring[mbox].size * sizeof(struct tsi721_imsg_desc),
+		priv->imsg_ring[mbox].imd_base,
+		priv->imsg_ring[mbox].imd_phys);
+
+	priv->imsg_ring[mbox].imd_base = NULL;
+}
+
+/**
+ * tsi721_add_inb_buffer - Add buffer to the Tsi721 inbound message queue
+ * @mport: Master port implementing the Inbound Messaging Engine
+ * @mbox: Inbound mailbox number
+ * @buf: Buffer to add to inbound queue
+ */
+static int tsi721_add_inb_buffer(struct rio_mport *mport, int mbox, void *buf)
+{
+	struct tsi721_device *priv = mport->priv;
+	u32 rx_slot;
+	int rc = 0;
+
+	rx_slot = priv->imsg_ring[mbox].rx_slot;
+	if (priv->imsg_ring[mbox].imq_base[rx_slot]) {
+		dev_err(&priv->pdev->dev,
+			"Error adding inbound buffer %d, buffer exists\n",
+			rx_slot);
+		rc = -EINVAL;
+		goto out;
+	}
+
+	priv->imsg_ring[mbox].imq_base[rx_slot] = buf;
+
+	if (++priv->imsg_ring[mbox].rx_slot == priv->imsg_ring[mbox].size)
+		priv->imsg_ring[mbox].rx_slot = 0;
+
+out:
+	return rc;
+}
+
+/**
+ * tsi721_get_inb_message - Fetch inbound message from the Tsi721 MSG Queue
+ * @mport: Master port implementing the Inbound Messaging Engine
+ * @mbox: Inbound mailbox number
+ *
+ * Returns pointer to the message on success or NULL on failure.
+ */
+static void *tsi721_get_inb_message(struct rio_mport *mport, int mbox)
+{
+	struct tsi721_device *priv = mport->priv;
+	struct tsi721_imsg_desc *desc;
+	u32 rx_slot;
+	void *rx_virt = NULL;
+	u64 rx_phys;
+	void *buf = NULL;
+	u64 *free_ptr;
+	int ch = mbox + 4;
+	int msg_size;
+
+	if (!priv->imsg_init[mbox])
+		return NULL;
+
+	desc = priv->imsg_ring[mbox].imd_base;
+	desc += priv->imsg_ring[mbox].desc_rdptr;
+
+	if (!(le32_to_cpu(desc->msg_info) & TSI721_IMD_HO))
+		goto out;
+
+	rx_slot = priv->imsg_ring[mbox].rx_slot;
+	while (priv->imsg_ring[mbox].imq_base[rx_slot] == NULL) {
+		if (++rx_slot == priv->imsg_ring[mbox].size)
+			rx_slot = 0;
+	}
+
+	rx_phys = ((u64)le32_to_cpu(desc->bufptr_hi) << 32) |
+			le32_to_cpu(desc->bufptr_lo);
+
+	rx_virt = priv->imsg_ring[mbox].buf_base +
+		  (rx_phys - (u64)priv->imsg_ring[mbox].buf_phys);
+
+	buf = priv->imsg_ring[mbox].imq_base[rx_slot];
+	msg_size = le32_to_cpu(desc->msg_info) & TSI721_IMD_BCOUNT;
+	if (msg_size == 0)
+		msg_size = RIO_MAX_MSG_SIZE;
+
+	memcpy(buf, rx_virt, msg_size);
+	priv->imsg_ring[mbox].imq_base[rx_slot] = NULL;
+
+	desc->msg_info &= cpu_to_le32(~TSI721_IMD_HO);
+	if (++priv->imsg_ring[mbox].desc_rdptr == priv->imsg_ring[mbox].size)
+		priv->imsg_ring[mbox].desc_rdptr = 0;
+
+	iowrite32(priv->imsg_ring[mbox].desc_rdptr,
+		priv->regs + TSI721_IBDMAC_DQRP(ch));
+
+	/* Return free buffer into the pointer list */
+	free_ptr = priv->imsg_ring[mbox].imfq_base;
+	free_ptr[priv->imsg_ring[mbox].fq_wrptr] = cpu_to_le64(rx_phys);
+
+	if (++priv->imsg_ring[mbox].fq_wrptr == priv->imsg_ring[mbox].size)
+		priv->imsg_ring[mbox].fq_wrptr = 0;
+
+	iowrite32(priv->imsg_ring[mbox].fq_wrptr,
+		priv->regs + TSI721_IBDMAC_FQWP(ch));
+out:
+	return buf;
+}
+
+/**
+ * tsi721_messages_init - Initialization of Messaging Engine
+ * @priv: pointer to tsi721 private data
+ *
+ * Configures Tsi721 messaging engine.
+ */
+static int tsi721_messages_init(struct tsi721_device *priv)
+{
+	int	ch;
+
+	iowrite32(0, priv->regs + TSI721_SMSG_ECC_LOG);
+	iowrite32(0, priv->regs + TSI721_RETRY_GEN_CNT);
+	iowrite32(0, priv->regs + TSI721_RETRY_RX_CNT);
+
+	/* Set SRIO Message Request/Response Timeout */
+	iowrite32(TSI721_RQRPTO_VAL, priv->regs + TSI721_RQRPTO);
+
+	/* Initialize Inbound Messaging Engine Registers */
+	for (ch = 0; ch < TSI721_IMSG_CHNUM; ch++) {
+		/* Clear interrupt bits */
+		iowrite32(TSI721_IBDMAC_INT_MASK,
+			priv->regs + TSI721_IBDMAC_INT(ch));
+		/* Clear Status */
+		iowrite32(0, priv->regs + TSI721_IBDMAC_STS(ch));
+
+		iowrite32(TSI721_SMSG_ECC_COR_LOG_MASK,
+				priv->regs + TSI721_SMSG_ECC_COR_LOG(ch));
+		iowrite32(TSI721_SMSG_ECC_NCOR_MASK,
+				priv->regs + TSI721_SMSG_ECC_NCOR(ch));
+	}
+
+	return 0;
+}
+
+/**
+ * tsi721_disable_ints - disables all device interrupts
+ * @priv: pointer to tsi721 private data
+ */
+static void tsi721_disable_ints(struct tsi721_device *priv)
+{
+	int ch;
+
+	/* Disable all device level interrupts */
+	iowrite32(0, priv->regs + TSI721_DEV_INTE);
+
+	/* Disable all Device Channel interrupts */
+	iowrite32(0, priv->regs + TSI721_DEV_CHAN_INTE);
+
+	/* Disable all Inbound Msg Channel interrupts */
+	for (ch = 0; ch < TSI721_IMSG_CHNUM; ch++)
+		iowrite32(0, priv->regs + TSI721_IBDMAC_INTE(ch));
+
+	/* Disable all Outbound Msg Channel interrupts */
+	for (ch = 0; ch < TSI721_OMSG_CHNUM; ch++)
+		iowrite32(0, priv->regs + TSI721_OBDMAC_INTE(ch));
+
+	/* Disable all general messaging interrupts */
+	iowrite32(0, priv->regs + TSI721_SMSG_INTE);
+
+	/* Disable all BDMA Channel interrupts */
+	for (ch = 0; ch < TSI721_DMA_MAXCH; ch++)
+		iowrite32(0, priv->regs + TSI721_DMAC_INTE(ch));
+
+	/* Disable all general BDMA interrupts */
+	iowrite32(0, priv->regs + TSI721_BDMA_INTE);
+
+	/* Disable all SRIO Channel interrupts */
+	for (ch = 0; ch < TSI721_SRIO_MAXCH; ch++)
+		iowrite32(0, priv->regs + TSI721_SR_CHINTE(ch));
+
+	/* Disable all general SR2PC interrupts */
+	iowrite32(0, priv->regs + TSI721_SR2PC_GEN_INTE);
+
+	/* Disable all PC2SR interrupts */
+	iowrite32(0, priv->regs + TSI721_PC2SR_INTE);
+
+	/* Disable all I2C interrupts */
+	iowrite32(0, priv->regs + TSI721_I2C_INT_ENABLE);
+
+	/* Disable SRIO MAC interrupts */
+	iowrite32(0, priv->regs + TSI721_RIO_EM_INT_ENABLE);
+	iowrite32(0, priv->regs + TSI721_RIO_EM_DEV_INT_EN);
+}
+
+/**
+ * tsi721_setup_mport - Setup Tsi721 as RapidIO subsystem master port
+ * @priv: pointer to tsi721 private data
+ *
+ * Configures Tsi721 as RapidIO master port.
+ */
+static int __devinit tsi721_setup_mport(struct tsi721_device *priv)
+{
+	struct pci_dev *pdev = priv->pdev;
+	int err = 0;
+	struct rio_ops *ops;
+
+	struct rio_mport *mport;
+
+	ops = kzalloc(sizeof(struct rio_ops), GFP_KERNEL);
+	if (!ops) {
+		dev_dbg(&pdev->dev, "Unable to allocate memory for rio_ops\n");
+		return -ENOMEM;
+	}
+
+	ops->lcread = tsi721_lcread;
+	ops->lcwrite = tsi721_lcwrite;
+	ops->cread = tsi721_cread_dma;
+	ops->cwrite = tsi721_cwrite_dma;
+	ops->dsend = tsi721_dsend;
+	ops->open_inb_mbox = tsi721_open_inb_mbox;
+	ops->close_inb_mbox = tsi721_close_inb_mbox;
+	ops->open_outb_mbox = tsi721_open_outb_mbox;
+	ops->close_outb_mbox = tsi721_close_outb_mbox;
+	ops->add_outb_message = tsi721_add_outb_message;
+	ops->add_inb_buffer = tsi721_add_inb_buffer;
+	ops->get_inb_message = tsi721_get_inb_message;
+
+	mport = kzalloc(sizeof(struct rio_mport), GFP_KERNEL);
+	if (!mport) {
+		kfree(ops);
+		dev_dbg(&pdev->dev, "Unable to allocate memory for mport\n");
+		return -ENOMEM;
+	}
+
+	mport->ops = ops;
+	mport->index = 0;
+	mport->sys_size = 0; /* small system */
+	mport->phy_type = RIO_PHY_SERIAL;
+	mport->priv = (void *)priv;
+	mport->phys_efptr = 0x100;
+
+	INIT_LIST_HEAD(&mport->dbells);
+
+	rio_init_dbell_res(&mport->riores[RIO_DOORBELL_RESOURCE], 0, 0xffff);
+	rio_init_mbox_res(&mport->riores[RIO_INB_MBOX_RESOURCE], 0, 0);
+	rio_init_mbox_res(&mport->riores[RIO_OUTB_MBOX_RESOURCE], 0, 0);
+	strcpy(mport->name, "Tsi721 mport");
+
+	/* Hook up interrupt handler */
+
+#ifdef CONFIG_PCI_MSI
+	if (!tsi721_enable_msix(priv))
+		priv->flags |= TSI721_USING_MSIX;
+	else if (!pci_enable_msi(pdev))
+		priv->flags |= TSI721_USING_MSI;
+	else
+		dev_info(&pdev->dev,
+			 "MSI/MSI-X is not available. Using legacy INTx.\n");
+#endif /* CONFIG_PCI_MSI */
+
+	err = tsi721_request_irq(mport);
+
+	if (!err) {
+		tsi721_interrupts_init(priv);
+		ops->pwenable = tsi721_pw_enable;
+	} else
+		dev_err(&pdev->dev, "Unable to get assigned PCI IRQ "
+			"vector %02X err=0x%x\n", pdev->irq, err);
+
+	/* Enable SRIO link */
+	iowrite32(ioread32(priv->regs + TSI721_DEVCTL) |
+		  TSI721_DEVCTL_SRBOOT_CMPL,
+		  priv->regs + TSI721_DEVCTL);
+
+	rio_register_mport(mport);
+	priv->mport = mport;
+
+	if (mport->host_deviceid >= 0)
+		iowrite32(RIO_PORT_GEN_HOST | RIO_PORT_GEN_MASTER |
+			  RIO_PORT_GEN_DISCOVERED,
+			  priv->regs + (0x100 + RIO_PORT_GEN_CTL_CSR));
+	else
+		iowrite32(0, priv->regs + (0x100 + RIO_PORT_GEN_CTL_CSR));
+
+	return 0;
+}
+
+static int __devinit tsi721_probe(struct pci_dev *pdev,
+				  const struct pci_device_id *id)
+{
+	struct tsi721_device *priv;
+	int i;
+	int err;
+	u32 regval;
+
+	priv = kzalloc(sizeof(struct tsi721_device), GFP_KERNEL);
+	if (priv == NULL) {
+		dev_err(&pdev->dev, "Failed to allocate memory for device\n");
+		err = -ENOMEM;
+		goto err_exit;
+	}
+
+	err = pci_enable_device(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to enable PCI device\n");
+		goto err_clean;
+	}
+
+	priv->pdev = pdev;
+
+#ifdef DEBUG
+	for (i = 0; i <= PCI_STD_RESOURCE_END; i++) {
+		dev_dbg(&pdev->dev, "res[%d] @ 0x%llx (0x%lx, 0x%lx)\n",
+			i, (unsigned long long)pci_resource_start(pdev, i),
+			(unsigned long)pci_resource_len(pdev, i),
+			pci_resource_flags(pdev, i));
+	}
+#endif
+	/*
+	 * Verify BAR configuration
+	 */
+
+	/* BAR_0 (registers) must be 512KB+ in 32-bit address space */
+	if (!(pci_resource_flags(pdev, BAR_0) & IORESOURCE_MEM) ||
+	    pci_resource_flags(pdev, BAR_0) & IORESOURCE_MEM_64 ||
+	    pci_resource_len(pdev, BAR_0) < TSI721_REG_SPACE_SIZE) {
+		dev_err(&pdev->dev,
+			"Missing or misconfigured CSR BAR0, aborting.\n");
+		err = -ENODEV;
+		goto err_disable_pdev;
+	}
+
+	/* BAR_1 (outbound doorbells) must be 16MB+ in 32-bit address space */
+	if (!(pci_resource_flags(pdev, BAR_1) & IORESOURCE_MEM) ||
+	    pci_resource_flags(pdev, BAR_1) & IORESOURCE_MEM_64 ||
+	    pci_resource_len(pdev, BAR_1) < TSI721_DB_WIN_SIZE) {
+		dev_err(&pdev->dev,
+			"Missing or misconfigured Doorbell BAR1, aborting.\n");
+		err = -ENODEV;
+		goto err_disable_pdev;
+	}
+
+	/*
+	 * BAR_2 and BAR_4 (outbound translation) must be in 64-bit PCIe address
+	 * space.
+	 * NOTE: BAR_2 and BAR_4 are not used by this version of driver.
+	 * It may be a good idea to keep them disabled using HW configuration
+	 * to save PCI memory space.
+	 */
+	if ((pci_resource_flags(pdev, BAR_2) & IORESOURCE_MEM) &&
+	    (pci_resource_flags(pdev, BAR_2) & IORESOURCE_MEM_64)) {
+		dev_info(&pdev->dev, "Outbound BAR2 is not used but enabled.\n");
+	}
+
+	if ((pci_resource_flags(pdev, BAR_4) & IORESOURCE_MEM) &&
+	    (pci_resource_flags(pdev, BAR_4) & IORESOURCE_MEM_64)) {
+		dev_info(&pdev->dev, "Outbound BAR4 is not used but enabled.\n");
+	}
+
+	err = pci_request_regions(pdev, DRV_NAME);
+	if (err) {
+		dev_err(&pdev->dev, "Cannot obtain PCI resources, "
+			"aborting.\n");
+		goto err_disable_pdev;
+	}
+
+	pci_set_master(pdev);
+
+	priv->regs = pci_ioremap_bar(pdev, BAR_0);
+	if (!priv->regs) {
+		dev_err(&pdev->dev,
+			"Unable to map device registers space, aborting\n");
+		err = -ENOMEM;
+		goto err_free_res;
+	}
+
+	priv->odb_base = pci_ioremap_bar(pdev, BAR_1);
+	if (!priv->odb_base) {
+		dev_err(&pdev->dev,
+			"Unable to map outbound doorbells space, aborting\n");
+		err = -ENOMEM;
+		goto err_unmap_bars;
+	}
+
+	/* Configure DMA attributes. */
+	if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
+		if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) {
+			dev_info(&pdev->dev, "Unable to set DMA mask\n");
+			goto err_unmap_bars;
+		}
+
+		if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)))
+			dev_info(&pdev->dev, "Unable to set consistent DMA mask\n");
+	} else {
+		err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+		if (err)
+			dev_info(&pdev->dev, "Unable to set consistent DMA mask\n");
+	}
+
+	/* Clear "no snoop" and "relaxed ordering" bits. */
+	pci_read_config_dword(pdev, 0x40 + PCI_EXP_DEVCTL, &regval);
+	regval &= ~(PCI_EXP_DEVCTL_RELAX_EN | PCI_EXP_DEVCTL_NOSNOOP_EN);
+	pci_write_config_dword(pdev, 0x40 + PCI_EXP_DEVCTL, regval);
+
+	/*
+	 * FIXUP: correct offsets of MSI-X tables in the MSI-X Capability Block
+	 */
+	pci_write_config_dword(pdev, TSI721_PCIECFG_EPCTL, 0x01);
+	pci_write_config_dword(pdev, TSI721_PCIECFG_MSIXTBL,
+						TSI721_MSIXTBL_OFFSET);
+	pci_write_config_dword(pdev, TSI721_PCIECFG_MSIXPBA,
+						TSI721_MSIXPBA_OFFSET);
+	pci_write_config_dword(pdev, TSI721_PCIECFG_EPCTL, 0);
+	/* End of FIXUP */
+
+	tsi721_disable_ints(priv);
+
+	tsi721_init_pc2sr_mapping(priv);
+	tsi721_init_sr2pc_mapping(priv);
+
+	if (tsi721_bdma_init(priv)) {
+		dev_err(&pdev->dev, "BDMA initialization failed, aborting\n");
+		err = -ENOMEM;
+		goto err_unmap_bars;
+	}
+
+	err = tsi721_doorbell_init(priv);
+	if (err)
+		goto err_free_bdma;
+
+	tsi721_port_write_init(priv);
+
+	err = tsi721_messages_init(priv);
+	if (err)
+		goto err_free_consistent;
+
+	err = tsi721_setup_mport(priv);
+	if (err)
+		goto err_free_consistent;
+
+	return 0;
+
+err_free_consistent:
+	tsi721_doorbell_free(priv);
+err_free_bdma:
+	tsi721_bdma_free(priv);
+err_unmap_bars:
+	if (priv->regs)
+		iounmap(priv->regs);
+	if (priv->odb_base)
+		iounmap(priv->odb_base);
+err_free_res:
+	pci_release_regions(pdev);
+	pci_clear_master(pdev);
+err_disable_pdev:
+	pci_disable_device(pdev);
+err_clean:
+	kfree(priv);
+err_exit:
+	return err;
+}
+
+static DEFINE_PCI_DEVICE_TABLE(tsi721_pci_tbl) = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_IDT, PCI_DEVICE_ID_TSI721) },
+	{ 0, }	/* terminate list */
+};
+
+MODULE_DEVICE_TABLE(pci, tsi721_pci_tbl);
+
+static struct pci_driver tsi721_driver = {
+	.name		= "tsi721",
+	.id_table	= tsi721_pci_tbl,
+	.probe		= tsi721_probe,
+};
+
+static int __init tsi721_init(void)
+{
+	return pci_register_driver(&tsi721_driver);
+}
+
+static void __exit tsi721_exit(void)
+{
+	pci_unregister_driver(&tsi721_driver);
+}
+
+device_initcall(tsi721_init);
diff --git a/drivers/rapidio/devices/tsi721.h b/drivers/rapidio/devices/tsi721.h
new file mode 100644
index 0000000..58be4de
--- /dev/null
+++ b/drivers/rapidio/devices/tsi721.h
@@ -0,0 +1,766 @@
+/*
+ * Tsi721 PCIExpress-to-SRIO bridge definitions
+ *
+ * Copyright 2011, Integrated Device Technology, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ */
+
+#ifndef __TSI721_H
+#define __TSI721_H
+
+#define DRV_NAME	"tsi721"
+
+#define DEFAULT_HOPCOUNT	0xff
+#define DEFAULT_DESTID		0xff
+
+/* PCI device ID */
+#define PCI_DEVICE_ID_TSI721		0x80ab
+
+#define BAR_0	0
+#define BAR_1	1
+#define BAR_2	2
+#define BAR_4	4
+
+#define TSI721_PC2SR_BARS	2
+#define TSI721_PC2SR_WINS	8
+#define TSI721_PC2SR_ZONES	8
+#define TSI721_MAINT_WIN	0 /* Window for outbound maintenance requests */
+#define IDB_QUEUE		0 /* Inbound Doorbell Queue to use */
+#define IDB_QSIZE		512 /* Inbound Doorbell Queue size */
+
+/* Memory space sizes */
+#define TSI721_REG_SPACE_SIZE		(512 * 1024) /* 512K */
+#define TSI721_DB_WIN_SIZE		(16 * 1024 * 1024) /* 16MB */
+
+#define  RIO_TT_CODE_8		0x00000000
+#define  RIO_TT_CODE_16		0x00000001
+
+#define TSI721_DMA_MAXCH	8
+#define TSI721_DMA_MINSTSSZ	32
+#define TSI721_DMA_STSBLKSZ	8
+
+#define TSI721_SRIO_MAXCH	8
+
+#define DBELL_SID(buf)		(((u8)buf[2] << 8) | (u8)buf[3])
+#define DBELL_TID(buf)		(((u8)buf[4] << 8) | (u8)buf[5])
+#define DBELL_INF(buf)		(((u8)buf[0] << 8) | (u8)buf[1])
+
+#define TSI721_RIO_PW_MSG_SIZE	16  /* Tsi721 saves only 16 bytes of PW msg */
+
+/* Register definitions */
+
+/*
+ * Registers in PCIe configuration space
+ */
+
+#define TSI721_PCIECFG_MSIXTBL	0x0a4
+#define TSI721_MSIXTBL_OFFSET	0x2c000
+#define TSI721_PCIECFG_MSIXPBA	0x0a8
+#define TSI721_MSIXPBA_OFFSET	0x2a000
+#define TSI721_PCIECFG_EPCTL	0x400
+
+/*
+ * Event Management Registers
+ */
+
+#define TSI721_RIO_EM_INT_STAT		0x10910
+#define TSI721_RIO_EM_INT_STAT_PW_RX	0x00010000
+
+#define TSI721_RIO_EM_INT_ENABLE	0x10914
+#define TSI721_RIO_EM_INT_ENABLE_PW_RX	0x00010000
+
+#define TSI721_RIO_EM_DEV_INT_EN	0x10930
+#define TSI721_RIO_EM_DEV_INT_EN_INT	0x00000001
+
+/*
+ * Port-Write Block Registers
+ */
+
+#define TSI721_RIO_PW_CTL		0x10a04
+#define TSI721_RIO_PW_CTL_PW_TIMER	0xf0000000
+#define TSI721_RIO_PW_CTL_PWT_DIS	(0 << 28)
+#define TSI721_RIO_PW_CTL_PWT_103	(1 << 28)
+#define TSI721_RIO_PW_CTL_PWT_205	(1 << 29)
+#define TSI721_RIO_PW_CTL_PWT_410	(1 << 30)
+#define TSI721_RIO_PW_CTL_PWT_820	(1 << 31)
+#define TSI721_RIO_PW_CTL_PWC_MODE	0x01000000
+#define TSI721_RIO_PW_CTL_PWC_CONT	0x00000000
+#define TSI721_RIO_PW_CTL_PWC_REL	0x01000000
+
+#define TSI721_RIO_PW_RX_STAT		0x10a10
+#define TSI721_RIO_PW_RX_STAT_WR_SIZE	0x0000f000
+#define TSI_RIO_PW_RX_STAT_WDPTR	0x00000100
+#define TSI721_RIO_PW_RX_STAT_PW_SHORT	0x00000008
+#define TSI721_RIO_PW_RX_STAT_PW_TRUNC	0x00000004
+#define TSI721_RIO_PW_RX_STAT_PW_DISC	0x00000002
+#define TSI721_RIO_PW_RX_STAT_PW_VAL	0x00000001
+
+#define TSI721_RIO_PW_RX_CAPT(x)	(0x10a20 + (x)*4)
+
+/*
+ * Inbound Doorbells
+ */
+
+#define TSI721_IDB_ENTRY_SIZE	64
+
+#define TSI721_IDQ_CTL(x)	(0x20000 + (x) * 1000)
+#define TSI721_IDQ_SUSPEND	0x00000002
+#define TSI721_IDQ_INIT		0x00000001
+
+#define TSI721_IDQ_STS(x)	(0x20004 + (x) * 1000)
+#define TSI721_IDQ_RUN		0x00200000
+
+#define TSI721_IDQ_MASK(x)	(0x20008 + (x) * 1000)
+#define TSI721_IDQ_MASK_MASK	0xffff0000
+#define TSI721_IDQ_MASK_PATT	0x0000ffff
+
+#define TSI721_IDQ_RP(x)	(0x2000c + (x) * 1000)
+#define TSI721_IDQ_RP_PTR	0x0007ffff
+
+#define TSI721_IDQ_WP(x)	(0x20010 + (x) * 1000)
+#define TSI721_IDQ_WP_PTR	0x0007ffff
+
+#define TSI721_IDQ_BASEL(x)	(0x20014 + (x) * 1000)
+#define TSI721_IDQ_BASEL_ADDR	0xffffffc0
+#define TSI721_IDQ_BASEU(x)	(0x20018 + (x) * 1000)
+#define TSI721_IDQ_SIZE(x)	(0x2001c + (x) * 1000)
+#define TSI721_IDQ_SIZE_VAL(size)	(__fls(size) - 4)
+#define TSI721_IDQ_SIZE_MIN	512
+#define TSI721_IDQ_SIZE_MAX	(512 * 1024)
+
+#define TSI721_SR_CHINT(x)	(0x20040 + (x) * 1000)
+#define TSI721_SR_CHINTE(x)	(0x20044 + (x) * 1000)
+#define TSI721_SR_CHINTSET(x)	(0x20048 + (x) * 1000)
+#define TSI721_SR_CHINT_ODBOK	0x00000020
+#define TSI721_SR_CHINT_IDBQRCV	0x00000010
+#define TSI721_SR_CHINT_SUSP	0x00000008
+#define TSI721_SR_CHINT_ODBTO	0x00000004
+#define TSI721_SR_CHINT_ODBRTRY	0x00000002
+#define TSI721_SR_CHINT_ODBERR	0x00000001
+#define TSI721_SR_CHINT_ALL	0x0000003f
+
+#define TSI721_IBWIN_NUM	8
+
+#define TSI721_IBWINLB(x)	(0x29000 + (x) * 20)
+#define TSI721_IBWINLB_BA	0xfffff000
+#define TSI721_IBWINLB_WEN	0x00000001
+
+#define TSI721_SR2PC_GEN_INTE	0x29800
+#define TSI721_SR2PC_PWE	0x29804
+#define TSI721_SR2PC_GEN_INT	0x29808
+
+#define TSI721_DEV_INTE		0x29840
+#define TSI721_DEV_INT		0x29844
+#define TSI721_DEV_INTSET	0x29848
+#define TSI721_DEV_INT_SMSG_CH	0x00000800
+#define TSI721_DEV_INT_SMSG_NCH	0x00000400
+#define TSI721_DEV_INT_SR2PC_CH	0x00000200
+#define TSI721_DEV_INT_SRIO	0x00000020
+
+#define TSI721_DEV_CHAN_INTE	0x2984c
+#define TSI721_DEV_CHAN_INT	0x29850
+
+#define TSI721_INT_SR2PC_CHAN_M	0xff000000
+#define TSI721_INT_SR2PC_CHAN(x) (1 << (24 + (x)))
+#define TSI721_INT_IMSG_CHAN_M	0x00ff0000
+#define TSI721_INT_IMSG_CHAN(x)	(1 << (16 + (x)))
+#define TSI721_INT_OMSG_CHAN_M	0x0000ff00
+#define TSI721_INT_OMSG_CHAN(x)	(1 << (8 + (x)))
+
+/*
+ * PC2SR block registers
+ */
+#define TSI721_OBWIN_NUM	TSI721_PC2SR_WINS
+
+#define TSI721_OBWINLB(x)	(0x40000 + (x) * 20)
+#define TSI721_OBWINLB_BA	0xffff8000
+#define TSI721_OBWINLB_WEN	0x00000001
+
+#define TSI721_OBWINUB(x)	(0x40004 + (x) * 20)
+
+#define TSI721_OBWINSZ(x)	(0x40008 + (x) * 20)
+#define TSI721_OBWINSZ_SIZE	0x00001f00
+#define TSI721_OBWIN_SIZE(size)	(__fls(size) - 15)
+
+#define TSI721_ZONE_SEL		0x41300
+#define TSI721_ZONE_SEL_RD_WRB	0x00020000
+#define TSI721_ZONE_SEL_GO	0x00010000
+#define TSI721_ZONE_SEL_WIN	0x00000038
+#define TSI721_ZONE_SEL_ZONE	0x00000007
+
+#define TSI721_LUT_DATA0	0x41304
+#define TSI721_LUT_DATA0_ADD	0xfffff000
+#define TSI721_LUT_DATA0_RDTYPE	0x00000f00
+#define TSI721_LUT_DATA0_NREAD	0x00000100
+#define TSI721_LUT_DATA0_MNTRD	0x00000200
+#define TSI721_LUT_DATA0_RDCRF	0x00000020
+#define TSI721_LUT_DATA0_WRCRF	0x00000010
+#define TSI721_LUT_DATA0_WRTYPE	0x0000000f
+#define TSI721_LUT_DATA0_NWR	0x00000001
+#define TSI721_LUT_DATA0_MNTWR	0x00000002
+#define TSI721_LUT_DATA0_NWR_R	0x00000004
+
+#define TSI721_LUT_DATA1	0x41308
+
+#define TSI721_LUT_DATA2	0x4130c
+#define TSI721_LUT_DATA2_HC	0xff000000
+#define TSI721_LUT_DATA2_ADD65	0x000c0000
+#define TSI721_LUT_DATA2_TT	0x00030000
+#define TSI721_LUT_DATA2_DSTID	0x0000ffff
+
+#define TSI721_PC2SR_INTE	0x41310
+
+#define TSI721_DEVCTL		0x48004
+#define TSI721_DEVCTL_SRBOOT_CMPL	0x00000004
+
+#define TSI721_I2C_INT_ENABLE	0x49120
+
+/*
+ * Block DMA Engine Registers
+ *   x = 0..7
+ */
+
+#define TSI721_DMAC_DWRCNT(x)	(0x51000 + (x) * 0x1000)
+#define TSI721_DMAC_DRDCNT(x)	(0x51004 + (x) * 0x1000)
+
+#define TSI721_DMAC_CTL(x)	(0x51008 + (x) * 0x1000)
+#define TSI721_DMAC_CTL_SUSP	0x00000002
+#define TSI721_DMAC_CTL_INIT	0x00000001
+
+#define TSI721_DMAC_INT(x)	(0x5100c + (x) * 0x1000)
+#define TSI721_DMAC_INT_STFULL	0x00000010
+#define TSI721_DMAC_INT_DONE	0x00000008
+#define TSI721_DMAC_INT_SUSP	0x00000004
+#define TSI721_DMAC_INT_ERR	0x00000002
+#define TSI721_DMAC_INT_IOFDONE	0x00000001
+#define TSI721_DMAC_INT_ALL	0x0000001f
+
+#define TSI721_DMAC_INTSET(x)	(0x51010 + (x) * 0x1000)
+
+#define TSI721_DMAC_STS(x)	(0x51014 + (x) * 0x1000)
+#define TSI721_DMAC_STS_ABORT	0x00400000
+#define TSI721_DMAC_STS_RUN	0x00200000
+#define TSI721_DMAC_STS_CS	0x001f0000
+
+#define TSI721_DMAC_INTE(x)	(0x51018 + (x) * 0x1000)
+
+#define TSI721_DMAC_DPTRL(x)	(0x51024 + (x) * 0x1000)
+#define TSI721_DMAC_DPTRL_MASK	0xffffffe0
+
+#define TSI721_DMAC_DPTRH(x)	(0x51028 + (x) * 0x1000)
+
+#define TSI721_DMAC_DSBL(x)	(0x5102c + (x) * 0x1000)
+#define TSI721_DMAC_DSBL_MASK	0xffffffc0
+
+#define TSI721_DMAC_DSBH(x)	(0x51030 + (x) * 0x1000)
+
+#define TSI721_DMAC_DSSZ(x)	(0x51034 + (x) * 0x1000)
+#define TSI721_DMAC_DSSZ_SIZE_M	0x0000000f
+#define TSI721_DMAC_DSSZ_SIZE(size)	(__fls(size) - 4)
+
+
+#define TSI721_DMAC_DSRP(x)	(0x51038 + (x) * 0x1000)
+#define TSI721_DMAC_DSRP_MASK	0x0007ffff
+
+#define TSI721_DMAC_DSWP(x)	(0x5103c + (x) * 0x1000)
+#define TSI721_DMAC_DSWP_MASK	0x0007ffff
+
+#define TSI721_BDMA_INTE	0x5f000
+
+/*
+ * Messaging definitions
+ */
+#define TSI721_MSG_BUFFER_SIZE		RIO_MAX_MSG_SIZE
+#define TSI721_MSG_MAX_SIZE		RIO_MAX_MSG_SIZE
+#define TSI721_IMSG_MAXCH		8
+#define TSI721_IMSG_CHNUM		TSI721_IMSG_MAXCH
+#define TSI721_IMSGD_MIN_RING_SIZE	32
+#define TSI721_IMSGD_RING_SIZE		512
+
+#define TSI721_OMSG_CHNUM		4 /* One channel per MBOX */
+#define TSI721_OMSGD_MIN_RING_SIZE	32
+#define TSI721_OMSGD_RING_SIZE		512
+
+/*
+ * Outbound Messaging Engine Registers
+ *   x = 0..7
+ */
+
+#define TSI721_OBDMAC_DWRCNT(x)		(0x61000 + (x) * 0x1000)
+
+#define TSI721_OBDMAC_DRDCNT(x)		(0x61004 + (x) * 0x1000)
+
+#define TSI721_OBDMAC_CTL(x)		(0x61008 + (x) * 0x1000)
+#define TSI721_OBDMAC_CTL_MASK		0x00000007
+#define TSI721_OBDMAC_CTL_RETRY_THR	0x00000004
+#define TSI721_OBDMAC_CTL_SUSPEND	0x00000002
+#define TSI721_OBDMAC_CTL_INIT		0x00000001
+
+#define TSI721_OBDMAC_INT(x)		(0x6100c + (x) * 0x1000)
+#define TSI721_OBDMAC_INTSET(x)		(0x61010 + (x) * 0x1000)
+#define TSI721_OBDMAC_INTE(x)		(0x61018 + (x) * 0x1000)
+#define TSI721_OBDMAC_INT_MASK		0x0000001F
+#define TSI721_OBDMAC_INT_ST_FULL	0x00000010
+#define TSI721_OBDMAC_INT_DONE		0x00000008
+#define TSI721_OBDMAC_INT_SUSPENDED	0x00000004
+#define TSI721_OBDMAC_INT_ERROR		0x00000002
+#define TSI721_OBDMAC_INT_IOF_DONE	0x00000001
+#define TSI721_OBDMAC_INT_ALL		TSI721_OBDMAC_INT_MASK
+
+#define TSI721_OBDMAC_STS(x)		(0x61014 + (x) * 0x1000)
+#define TSI721_OBDMAC_STS_MASK		0x007f0000
+#define TSI721_OBDMAC_STS_ABORT		0x00400000
+#define TSI721_OBDMAC_STS_RUN		0x00200000
+#define TSI721_OBDMAC_STS_CS		0x001f0000
+
+#define TSI721_OBDMAC_PWE(x)		(0x6101c + (x) * 0x1000)
+#define TSI721_OBDMAC_PWE_MASK		0x00000002
+#define TSI721_OBDMAC_PWE_ERROR_EN	0x00000002
+
+#define TSI721_OBDMAC_DPTRL(x)		(0x61020 + (x) * 0x1000)
+#define TSI721_OBDMAC_DPTRL_MASK	0xfffffff0
+
+#define TSI721_OBDMAC_DPTRH(x)		(0x61024 + (x) * 0x1000)
+#define TSI721_OBDMAC_DPTRH_MASK	0xffffffff
+
+#define TSI721_OBDMAC_DSBL(x)		(0x61040 + (x) * 0x1000)
+#define TSI721_OBDMAC_DSBL_MASK		0xffffffc0
+
+#define TSI721_OBDMAC_DSBH(x)		(0x61044 + (x) * 0x1000)
+#define TSI721_OBDMAC_DSBH_MASK		0xffffffff
+
+#define TSI721_OBDMAC_DSSZ(x)		(0x61048 + (x) * 0x1000)
+#define TSI721_OBDMAC_DSSZ_MASK		0x0000000f
+
+#define TSI721_OBDMAC_DSRP(x)		(0x6104c + (x) * 0x1000)
+#define TSI721_OBDMAC_DSRP_MASK		0x0007ffff
+
+#define TSI721_OBDMAC_DSWP(x)		(0x61050 + (x) * 0x1000)
+#define TSI721_OBDMAC_DSWP_MASK		0x0007ffff
+
+#define TSI721_RQRPTO			0x60010
+#define TSI721_RQRPTO_MASK		0x00ffffff
+#define TSI721_RQRPTO_VAL		400	/* Response TO value */
+
+/*
+ * Inbound Messaging Engine Registers
+ *   x = 0..7
+ */
+
+#define TSI721_IB_DEVID_GLOBAL		0xffff
+#define TSI721_IBDMAC_FQBL(x)		(0x61200 + (x) * 0x1000)
+#define TSI721_IBDMAC_FQBL_MASK		0xffffffc0
+
+#define TSI721_IBDMAC_FQBH(x)		(0x61204 + (x) * 0x1000)
+#define TSI721_IBDMAC_FQBH_MASK		0xffffffff
+
+#define TSI721_IBDMAC_FQSZ_ENTRY_INX	TSI721_IMSGD_RING_SIZE
+#define TSI721_IBDMAC_FQSZ(x)		(0x61208 + (x) * 0x1000)
+#define TSI721_IBDMAC_FQSZ_MASK		0x0000000f
+
+#define TSI721_IBDMAC_FQRP(x)		(0x6120c + (x) * 0x1000)
+#define TSI721_IBDMAC_FQRP_MASK		0x0007ffff
+
+#define TSI721_IBDMAC_FQWP(x)		(0x61210 + (x) * 0x1000)
+#define TSI721_IBDMAC_FQWP_MASK		0x0007ffff
+
+#define TSI721_IBDMAC_FQTH(x)		(0x61214 + (x) * 0x1000)
+#define TSI721_IBDMAC_FQTH_MASK		0x0007ffff
+
+#define TSI721_IB_DEVID			0x60020
+#define TSI721_IB_DEVID_MASK		0x0000ffff
+
+#define TSI721_IBDMAC_CTL(x)		(0x61240 + (x) * 0x1000)
+#define TSI721_IBDMAC_CTL_MASK		0x00000003
+#define TSI721_IBDMAC_CTL_SUSPEND	0x00000002
+#define TSI721_IBDMAC_CTL_INIT		0x00000001
+
+#define TSI721_IBDMAC_STS(x)		(0x61244 + (x) * 0x1000)
+#define TSI721_IBDMAC_STS_MASK		0x007f0000
+#define TSI721_IBSMAC_STS_ABORT		0x00400000
+#define TSI721_IBSMAC_STS_RUN		0x00200000
+#define TSI721_IBSMAC_STS_CS		0x001f0000
+
+#define TSI721_IBDMAC_INT(x)		(0x61248 + (x) * 0x1000)
+#define TSI721_IBDMAC_INTSET(x)		(0x6124c + (x) * 0x1000)
+#define TSI721_IBDMAC_INTE(x)		(0x61250 + (x) * 0x1000)
+#define TSI721_IBDMAC_INT_MASK		0x0000100f
+#define TSI721_IBDMAC_INT_SRTO		0x00001000
+#define TSI721_IBDMAC_INT_SUSPENDED	0x00000008
+#define TSI721_IBDMAC_INT_PC_ERROR	0x00000004
+#define TSI721_IBDMAC_INT_FQ_LOW	0x00000002
+#define TSI721_IBDMAC_INT_DQ_RCV	0x00000001
+#define TSI721_IBDMAC_INT_ALL		TSI721_IBDMAC_INT_MASK
+
+#define TSI721_IBDMAC_PWE(x)		(0x61254 + (x) * 0x1000)
+#define TSI721_IBDMAC_PWE_MASK		0x00001700
+#define TSI721_IBDMAC_PWE_SRTO		0x00001000
+#define TSI721_IBDMAC_PWE_ILL_FMT	0x00000400
+#define TSI721_IBDMAC_PWE_ILL_DEC	0x00000200
+#define TSI721_IBDMAC_PWE_IMP_SP	0x00000100
+
+#define TSI721_IBDMAC_DQBL(x)		(0x61300 + (x) * 0x1000)
+#define TSI721_IBDMAC_DQBL_MASK		0xffffffc0
+#define TSI721_IBDMAC_DQBL_ADDR		0xffffffc0
+
+#define TSI721_IBDMAC_DQBH(x)		(0x61304 + (x) * 0x1000)
+#define TSI721_IBDMAC_DQBH_MASK		0xffffffff
+
+#define TSI721_IBDMAC_DQRP(x)		(0x61308 + (x) * 0x1000)
+#define TSI721_IBDMAC_DQRP_MASK		0x0007ffff
+
+#define TSI721_IBDMAC_DQWR(x)		(0x6130c + (x) * 0x1000)
+#define TSI721_IBDMAC_DQWR_MASK		0x0007ffff
+
+#define TSI721_IBDMAC_DQSZ(x)		(0x61314 + (x) * 0x1000)
+#define TSI721_IBDMAC_DQSZ_MASK		0x0000000f
+
+/*
+ * Messaging Engine Interrupts
+ */
+
+#define TSI721_SMSG_PWE			0x6a004
+
+#define TSI721_SMSG_INTE		0x6a000
+#define TSI721_SMSG_INT			0x6a008
+#define TSI721_SMSG_INTSET		0x6a010
+#define TSI721_SMSG_INT_MASK		0x0086ffff
+#define TSI721_SMSG_INT_UNS_RSP		0x00800000
+#define TSI721_SMSG_INT_ECC_NCOR	0x00040000
+#define TSI721_SMSG_INT_ECC_COR		0x00020000
+#define TSI721_SMSG_INT_ECC_NCOR_CH	0x0000ff00
+#define TSI721_SMSG_INT_ECC_COR_CH	0x000000ff
+
+#define TSI721_SMSG_ECC_LOG		0x6a014
+#define TSI721_SMSG_ECC_LOG_MASK	0x00070007
+#define TSI721_SMSG_ECC_LOG_ECC_NCOR_M	0x00070000
+#define TSI721_SMSG_ECC_LOG_ECC_COR_M	0x00000007
+
+#define TSI721_RETRY_GEN_CNT		0x6a100
+#define TSI721_RETRY_GEN_CNT_MASK	0xffffffff
+
+#define TSI721_RETRY_RX_CNT		0x6a104
+#define TSI721_RETRY_RX_CNT_MASK	0xffffffff
+
+#define TSI721_SMSG_ECC_COR_LOG(x)	(0x6a300 + (x) * 4)
+#define TSI721_SMSG_ECC_COR_LOG_MASK	0x000000ff
+
+#define TSI721_SMSG_ECC_NCOR(x)		(0x6a340 + (x) * 4)
+#define TSI721_SMSG_ECC_NCOR_MASK	0x000000ff
+
+/*
+ * Block DMA Descriptors
+ */
+
+struct tsi721_dma_desc {
+	__le32 type_id;
+
+#define TSI721_DMAD_DEVID	0x0000ffff
+#define TSI721_DMAD_CRF		0x00010000
+#define TSI721_DMAD_PRIO	0x00060000
+#define TSI721_DMAD_RTYPE	0x00780000
+#define TSI721_DMAD_IOF		0x08000000
+#define TSI721_DMAD_DTYPE	0xe0000000
+
+	__le32 bcount;
+
+#define TSI721_DMAD_BCOUNT1	0x03ffffff /* if DTYPE == 1 */
+#define TSI721_DMAD_BCOUNT2	0x0000000f /* if DTYPE == 2 */
+#define TSI721_DMAD_TT		0x0c000000
+#define TSI721_DMAD_RADDR0	0xc0000000
+
+	union {
+		__le32 raddr_lo;	   /* if DTYPE == (1 || 2) */
+		__le32 next_lo;		   /* if DTYPE == 3 */
+	};
+
+#define TSI721_DMAD_CFGOFF	0x00ffffff
+#define TSI721_DMAD_HOPCNT	0xff000000
+
+	union {
+		__le32 raddr_hi;	   /* if DTYPE == (1 || 2) */
+		__le32 next_hi;		   /* if DTYPE == 3 */
+	};
+
+	union {
+		struct {		   /* if DTYPE == 1 */
+			__le32 bufptr_lo;
+			__le32 bufptr_hi;
+			__le32 s_dist;
+			__le32 s_size;
+		} t1;
+		__le32 data[4];		   /* if DTYPE == 2 */
+		u32    reserved[4];	   /* if DTYPE == 3 */
+	};
+} __aligned(32);
+
+/*
+ * Inbound Messaging Descriptor
+ */
+struct tsi721_imsg_desc {
+	__le32 type_id;
+
+#define TSI721_IMD_DEVID	0x0000ffff
+#define TSI721_IMD_CRF		0x00010000
+#define TSI721_IMD_PRIO		0x00060000
+#define TSI721_IMD_TT		0x00180000
+#define TSI721_IMD_DTYPE	0xe0000000
+
+	__le32 msg_info;
+
+#define TSI721_IMD_BCOUNT	0x00000ff8
+#define TSI721_IMD_SSIZE	0x0000f000
+#define TSI721_IMD_LETER	0x00030000
+#define TSI721_IMD_XMBOX	0x003c0000
+#define TSI721_IMD_MBOX		0x00c00000
+#define TSI721_IMD_CS		0x78000000
+#define TSI721_IMD_HO		0x80000000
+
+	__le32 bufptr_lo;
+	__le32 bufptr_hi;
+	u32    reserved[12];
+
+} __aligned(64);
+
+/*
+ * Outbound Messaging Descriptor
+ */
+struct tsi721_omsg_desc {
+	__le32 type_id;
+
+#define TSI721_OMD_DEVID	0x0000ffff
+#define TSI721_OMD_CRF		0x00010000
+#define TSI721_OMD_PRIO		0x00060000
+#define TSI721_OMD_IOF		0x08000000
+#define TSI721_OMD_DTYPE	0xe0000000
+#define TSI721_OMD_RSRVD	0x17f80000
+
+	__le32 msg_info;
+
+#define TSI721_OMD_BCOUNT	0x00000ff8
+#define TSI721_OMD_SSIZE	0x0000f000
+#define TSI721_OMD_LETER	0x00030000
+#define TSI721_OMD_XMBOX	0x003c0000
+#define TSI721_OMD_MBOX		0x00c00000
+#define TSI721_OMD_TT		0x0c000000
+
+	union {
+		__le32 bufptr_lo;	/* if DTYPE == 4 */
+		__le32 next_lo;		/* if DTYPE == 5 */
+	};
+
+	union {
+		__le32 bufptr_hi;	/* if DTYPE == 4 */
+		__le32 next_hi;		/* if DTYPE == 5 */
+	};
+
+} __aligned(16);
+
+struct tsi721_dma_sts {
+	__le64	desc_sts[8];
+} __aligned(64);
+
+struct tsi721_desc_sts_fifo {
+	union {
+		__le64	da64;
+		struct {
+			__le32	lo;
+			__le32	hi;
+		} da32;
+	} stat[8];
+} __aligned(64);
+
+/* Descriptor types for BDMA and Messaging blocks */
+enum dma_dtype {
+	DTYPE1 = 1, /* Data Transfer DMA Descriptor */
+	DTYPE2 = 2, /* Immediate Data Transfer DMA Descriptor */
+	DTYPE3 = 3, /* Block Pointer DMA Descriptor */
+	DTYPE4 = 4, /* Outbound Msg DMA Descriptor */
+	DTYPE5 = 5, /* OB Messaging Block Pointer Descriptor */
+	DTYPE6 = 6  /* Inbound Messaging Descriptor */
+};
+
+enum dma_rtype {
+	NREAD = 0,
+	LAST_NWRITE_R = 1,
+	ALL_NWRITE = 2,
+	ALL_NWRITE_R = 3,
+	MAINT_RD = 4,
+	MAINT_WR = 5
+};
+
+/*
+ * mport Driver Definitions
+ */
+#define TSI721_DMA_CHNUM	TSI721_DMA_MAXCH
+
+#define TSI721_DMACH_MAINT	0	/* DMA channel for maint requests */
+#define TSI721_DMACH_MAINT_NBD	32	/* Number of BDs for maint requests */
+
+#define MSG_DMA_ENTRY_INX_TO_SIZE(x)	((0x10 << (x)) & 0xFFFF0)
+
+enum tsi721_smsg_int_flag {
+	SMSG_INT_NONE		= 0x00000000,
+	SMSG_INT_ECC_COR_CH	= 0x000000ff,
+	SMSG_INT_ECC_NCOR_CH	= 0x0000ff00,
+	SMSG_INT_ECC_COR	= 0x00020000,
+	SMSG_INT_ECC_NCOR	= 0x00040000,
+	SMSG_INT_UNS_RSP	= 0x00800000,
+	SMSG_INT_ALL		= 0x0006ffff
+};
+
+/* Structures */
+
+struct tsi721_bdma_chan {
+	int		bd_num;		/* number of buffer descriptors */
+	void		*bd_base;	/* start of DMA descriptors */
+	dma_addr_t	bd_phys;
+	void		*sts_base;	/* start of DMA BD status FIFO */
+	dma_addr_t	sts_phys;
+	int		sts_size;
+};
+
+struct tsi721_imsg_ring {
+	u32		size;
+	/* VA/PA of data buffers for incoming messages */
+	void		*buf_base;
+	dma_addr_t	buf_phys;
+	/* VA/PA of circular free buffer list */
+	void		*imfq_base;
+	dma_addr_t	imfq_phys;
+	/* VA/PA of Inbound message descriptors */
+	void		*imd_base;
+	dma_addr_t	imd_phys;
+	 /* Inbound Queue buffer pointers */
+	void		*imq_base[TSI721_IMSGD_RING_SIZE];
+
+	u32		rx_slot;
+	void		*dev_id;
+	u32		fq_wrptr;
+	u32		desc_rdptr;
+	spinlock_t	lock;
+};
+
+struct tsi721_omsg_ring {
+	u32		size;
+	/* VA/PA of OB Msg descriptors */
+	void		*omd_base;
+	dma_addr_t	omd_phys;
+	/* VA/PA of OB Msg data buffers */
+	void		*omq_base[TSI721_OMSGD_RING_SIZE];
+	dma_addr_t	omq_phys[TSI721_OMSGD_RING_SIZE];
+	/* VA/PA of OB Msg descriptor status FIFO */
+	void		*sts_base;
+	dma_addr_t	sts_phys;
+	u32		sts_size; /* # of allocated status entries */
+	u32		sts_rdptr;
+
+	u32		tx_slot;
+	void		*dev_id;
+	u32		wr_count;
+	spinlock_t	lock;
+};
+
+enum tsi721_flags {
+	TSI721_USING_MSI	= (1 << 0),
+	TSI721_USING_MSIX	= (1 << 1),
+	TSI721_IMSGID_SET	= (1 << 2),
+};
+
+#ifdef CONFIG_PCI_MSI
+/*
+ * MSI-X Table Entries (0 ... 69)
+ */
+#define TSI721_MSIX_DMACH_DONE(x)	(0 + (x))
+#define TSI721_MSIX_DMACH_INT(x)	(8 + (x))
+#define TSI721_MSIX_BDMA_INT		16
+#define TSI721_MSIX_OMSG_DONE(x)	(17 + (x))
+#define TSI721_MSIX_OMSG_INT(x)		(25 + (x))
+#define TSI721_MSIX_IMSG_DQ_RCV(x)	(33 + (x))
+#define TSI721_MSIX_IMSG_INT(x)		(41 + (x))
+#define TSI721_MSIX_MSG_INT		49
+#define TSI721_MSIX_SR2PC_IDBQ_RCV(x)	(50 + (x))
+#define TSI721_MSIX_SR2PC_CH_INT(x)	(58 + (x))
+#define TSI721_MSIX_SR2PC_INT		66
+#define TSI721_MSIX_PC2SR_INT		67
+#define TSI721_MSIX_SRIO_MAC_INT	68
+#define TSI721_MSIX_I2C_INT		69
+
+/* MSI-X vector and init table entry indexes */
+enum tsi721_msix_vect {
+	TSI721_VECT_IDB,
+	TSI721_VECT_PWRX, /* PW_RX is part of SRIO MAC Interrupt reporting */
+	TSI721_VECT_OMB0_DONE,
+	TSI721_VECT_OMB1_DONE,
+	TSI721_VECT_OMB2_DONE,
+	TSI721_VECT_OMB3_DONE,
+	TSI721_VECT_OMB0_INT,
+	TSI721_VECT_OMB1_INT,
+	TSI721_VECT_OMB2_INT,
+	TSI721_VECT_OMB3_INT,
+	TSI721_VECT_IMB0_RCV,
+	TSI721_VECT_IMB1_RCV,
+	TSI721_VECT_IMB2_RCV,
+	TSI721_VECT_IMB3_RCV,
+	TSI721_VECT_IMB0_INT,
+	TSI721_VECT_IMB1_INT,
+	TSI721_VECT_IMB2_INT,
+	TSI721_VECT_IMB3_INT,
+	TSI721_VECT_MAX
+};
+
+#define IRQ_DEVICE_NAME_MAX	64
+
+struct msix_irq {
+	u16	vector;
+	char	irq_name[IRQ_DEVICE_NAME_MAX];
+};
+#endif /* CONFIG_PCI_MSI */
+
+struct tsi721_device {
+	struct pci_dev	*pdev;
+	struct rio_mport *mport;
+	u32		flags;
+	void __iomem	*regs;
+#ifdef CONFIG_PCI_MSI
+	struct msix_irq	msix[TSI721_VECT_MAX];
+#endif
+	/* Doorbells */
+	void __iomem	*odb_base;
+	void		*idb_base;
+	dma_addr_t	idb_dma;
+	struct work_struct idb_work;
+	u32		db_discard_count;
+
+	/* Inbound Port-Write */
+	struct work_struct pw_work;
+	struct kfifo	pw_fifo;
+	spinlock_t	pw_fifo_lock;
+	u32		pw_discard_count;
+
+	/* BDMA Engine */
+	struct tsi721_bdma_chan bdma[TSI721_DMA_CHNUM];
+
+	/* Inbound Messaging */
+	int		imsg_init[TSI721_IMSG_CHNUM];
+	struct tsi721_imsg_ring imsg_ring[TSI721_IMSG_CHNUM];
+
+	/* Outbound Messaging */
+	int		omsg_init[TSI721_OMSG_CHNUM];
+	struct tsi721_omsg_ring	omsg_ring[TSI721_OMSG_CHNUM];
+};
+
+#endif
diff --git a/include/linux/rio_ids.h b/include/linux/rio_ids.h
index 0cee015..b66d13d 100644
--- a/include/linux/rio_ids.h
+++ b/include/linux/rio_ids.h
@@ -39,5 +39,6 @@
 #define RIO_DID_IDTCPS1616		0x0379
 #define RIO_DID_IDTVPS1616		0x0377
 #define RIO_DID_IDTSPS1616		0x0378
+#define RIO_DID_TSI721			0x80ab
 
 #endif				/* LINUX_RIO_IDS_H */
-- 
cgit v0.10.2


From 166c050bda80bb5dd627a287b6efcdfb68d172b4 Mon Sep 17 00:00:00 2001
From: Alexandre Bounine <alexandre.bounine@idt.com>
Date: Wed, 2 Nov 2011 13:39:11 -0700
Subject: RapidIO: fix potential null deref in rio_setup_device()

The "goto cleanup" path can deference "rswitch" when it is NULL.

Reported-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Alexandre Bounine <alexandre.bounine@idt.com>
Cc: Dan Carpenter <error27@gmail.com>
Cc: Kumar Gala <galak@kernel.crashing.org>
Cc: Matt Porter <mporter@kernel.crashing.org>
Cc: Chul Kim <chul.kim@idt.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c
index 0914f49..2bebd79 100644
--- a/drivers/rapidio/rio-scan.c
+++ b/drivers/rapidio/rio-scan.c
@@ -516,7 +516,7 @@ static struct rio_dev __devinit *rio_setup_device(struct rio_net *net,
 	return rdev;
 
 cleanup:
-	if (rio_is_switch(rdev))
+	if (rswitch)
 		kfree(rswitch->route_table);
 
 	kfree(rdev);
-- 
cgit v0.10.2


From e0c87bd95e8dad455c23bc56513af8dcb1737e55 Mon Sep 17 00:00:00 2001
From: Alexandre Bounine <alexandre.bounine@idt.com>
Date: Wed, 2 Nov 2011 13:39:15 -0700
Subject: drivers/net/rionet.c: fix ethernet address macros for LE platforms

Modify Ethernet addess macros to be compatible with BE/LE platforms

Signed-off-by: Alexandre Bounine <alexandre.bounine@idt.com>
Cc: Chul Kim <chul.kim@idt.com>
Cc: Kumar Gala <galak@kernel.crashing.org>
Cc: Matt Porter <mporter@kernel.crashing.org>
Cc: Li Yang <leoli@freescale.com>
Cc: <stable@kernel.org>		[2.6.39+]
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c
index 3bb1311..7145714 100644
--- a/drivers/net/rionet.c
+++ b/drivers/net/rionet.c
@@ -88,8 +88,8 @@ static struct rio_dev **rionet_active;
 #define dev_rionet_capable(dev) \
 	is_rionet_capable(dev->src_ops, dev->dst_ops)
 
-#define RIONET_MAC_MATCH(x)	(*(u32 *)x == 0x00010001)
-#define RIONET_GET_DESTID(x)	(*(u16 *)(x + 4))
+#define RIONET_MAC_MATCH(x)	(!memcmp((x), "\00\01\00\01", 4))
+#define RIONET_GET_DESTID(x)	((*((u8 *)x + 4) << 8) | *((u8 *)x + 5))
 
 static int rionet_rx_clean(struct net_device *ndev)
 {
-- 
cgit v0.10.2


From 088024b1deee206cd37eff980138e918837aabdb Mon Sep 17 00:00:00 2001
From: Alexandre Bounine <alexandre.bounine@idt.com>
Date: Wed, 2 Nov 2011 13:39:19 -0700
Subject: RapidIO: documentation update

Update rapidio.txt to reflect changes from recent patch.
See http://marc.info/?l=linux-kernel&m=131285620113589&w=2 for details.

Signed-off-by: Alexandre Bounine <alexandre.bounine@idt.com>
Cc: Liu Gang <Gang.Liu@freescale.com>
Cc: Micha Nelissen <micha@neli.hopto.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/Documentation/rapidio/rapidio.txt b/Documentation/rapidio/rapidio.txt
index be70ee1..c75694b 100644
--- a/Documentation/rapidio/rapidio.txt
+++ b/Documentation/rapidio/rapidio.txt
@@ -144,7 +144,7 @@ and the default device ID in order to access the device on the active port.
 
 After the host has completed enumeration of the entire network it releases
 devices by clearing device ID locks (calls rio_clear_locks()). For each endpoint
-in the system, it sets the Master Enable bit in the Port General Control CSR
+in the system, it sets the Discovered bit in the Port General Control CSR
 to indicate that enumeration is completed and agents are allowed to execute
 passive discovery of the network.
 
-- 
cgit v0.10.2


From f1ecf06854a66ee663f4d4cf029c78cd62a15e04 Mon Sep 17 00:00:00 2001
From: Lucas De Marchi <lucas.demarchi@profusion.mobi>
Date: Wed, 2 Nov 2011 13:39:22 -0700
Subject: sysctl: add support for poll()

Adding support for poll() in sysctl fs allows userspace to receive
notifications of changes in sysctl entries.  This adds a infrastructure to
allow files in sysctl fs to be pollable and implements it for hostname and
domainname.

[akpm@linux-foundation.org: s/declare/define/ for definitions]
Signed-off-by: Lucas De Marchi <lucas.demarchi@profusion.mobi>
Cc: Greg KH <gregkh@suse.de>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index dacd840..df59480 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -3,6 +3,7 @@
  */
 #include <linux/init.h>
 #include <linux/sysctl.h>
+#include <linux/poll.h>
 #include <linux/proc_fs.h>
 #include <linux/security.h>
 #include <linux/namei.h>
@@ -14,6 +15,15 @@ static const struct inode_operations proc_sys_inode_operations;
 static const struct file_operations proc_sys_dir_file_operations;
 static const struct inode_operations proc_sys_dir_operations;
 
+void proc_sys_poll_notify(struct ctl_table_poll *poll)
+{
+	if (!poll)
+		return;
+
+	atomic_inc(&poll->event);
+	wake_up_interruptible(&poll->wait);
+}
+
 static struct inode *proc_sys_make_inode(struct super_block *sb,
 		struct ctl_table_header *head, struct ctl_table *table)
 {
@@ -176,6 +186,39 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
 	return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
 }
 
+static int proc_sys_open(struct inode *inode, struct file *filp)
+{
+	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+
+	if (table->poll)
+		filp->private_data = proc_sys_poll_event(table->poll);
+
+	return 0;
+}
+
+static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+	unsigned long event = (unsigned long)filp->private_data;
+	unsigned int ret = DEFAULT_POLLMASK;
+
+	if (!table->proc_handler)
+		goto out;
+
+	if (!table->poll)
+		goto out;
+
+	poll_wait(filp, &table->poll->wait, wait);
+
+	if (event != atomic_read(&table->poll->event)) {
+		filp->private_data = proc_sys_poll_event(table->poll);
+		ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI;
+	}
+
+out:
+	return ret;
+}
 
 static int proc_sys_fill_cache(struct file *filp, void *dirent,
 				filldir_t filldir,
@@ -364,6 +407,8 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
 }
 
 static const struct file_operations proc_sys_file_operations = {
+	.open		= proc_sys_open,
+	.poll		= proc_sys_poll,
 	.read		= proc_sys_read,
 	.write		= proc_sys_write,
 	.llseek		= default_llseek,
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 9a1ec10..703cfa3 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -931,6 +931,7 @@ enum
 #ifdef __KERNEL__
 #include <linux/list.h>
 #include <linux/rcupdate.h>
+#include <linux/wait.h>
 
 /* For the /proc/sys support */
 struct ctl_table;
@@ -1011,6 +1012,26 @@ extern int proc_do_large_bitmap(struct ctl_table *, int,
  * cover common cases.
  */
 
+/* Support for userspace poll() to watch for changes */
+struct ctl_table_poll {
+	atomic_t event;
+	wait_queue_head_t wait;
+};
+
+static inline void *proc_sys_poll_event(struct ctl_table_poll *poll)
+{
+	return (void *)(unsigned long)atomic_read(&poll->event);
+}
+
+void proc_sys_poll_notify(struct ctl_table_poll *poll);
+
+#define __CTL_TABLE_POLL_INITIALIZER(name) {				\
+	.event = ATOMIC_INIT(0),					\
+	.wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.wait) }
+
+#define DEFINE_CTL_TABLE_POLL(name)					\
+	struct ctl_table_poll name = __CTL_TABLE_POLL_INITIALIZER(name)
+
 /* A sysctl table is an array of struct ctl_table: */
 struct ctl_table 
 {
@@ -1021,6 +1042,7 @@ struct ctl_table
 	struct ctl_table *child;
 	struct ctl_table *parent;	/* Automatically set */
 	proc_handler *proc_handler;	/* Callback for text formatting */
+	struct ctl_table_poll *poll;
 	void *extra1;
 	void *extra2;
 };
diff --git a/include/linux/utsname.h b/include/linux/utsname.h
index 4e5b021..c714ed7 100644
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -37,6 +37,14 @@ struct new_utsname {
 #include <linux/nsproxy.h>
 #include <linux/err.h>
 
+enum uts_proc {
+	UTS_PROC_OSTYPE,
+	UTS_PROC_OSRELEASE,
+	UTS_PROC_VERSION,
+	UTS_PROC_HOSTNAME,
+	UTS_PROC_DOMAINNAME,
+};
+
 struct user_namespace;
 extern struct user_namespace init_user_ns;
 
@@ -80,6 +88,14 @@ static inline struct uts_namespace *copy_utsname(unsigned long flags,
 }
 #endif
 
+#ifdef CONFIG_PROC_SYSCTL
+extern void uts_proc_notify(enum uts_proc proc);
+#else
+static inline void uts_proc_notify(enum uts_proc proc)
+{
+}
+#endif
+
 static inline struct new_utsname *utsname(void)
 {
 	return &current->nsproxy->uts_ns->name;
diff --git a/kernel/sys.c b/kernel/sys.c
index 5845950..d06c091 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1286,6 +1286,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
 		memset(u->nodename + len, 0, sizeof(u->nodename) - len);
 		errno = 0;
 	}
+	uts_proc_notify(UTS_PROC_HOSTNAME);
 	up_write(&uts_sem);
 	return errno;
 }
@@ -1336,6 +1337,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
 		memset(u->domainname + len, 0, sizeof(u->domainname) - len);
 		errno = 0;
 	}
+	uts_proc_notify(UTS_PROC_DOMAINNAME);
 	up_write(&uts_sem);
 	return errno;
 }
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index a2cd77e..3b0d48e 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -13,6 +13,7 @@
 #include <linux/uts.h>
 #include <linux/utsname.h>
 #include <linux/sysctl.h>
+#include <linux/wait.h>
 
 static void *get_uts(ctl_table *table, int write)
 {
@@ -51,12 +52,19 @@ static int proc_do_uts_string(ctl_table *table, int write,
 	uts_table.data = get_uts(table, write);
 	r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
 	put_uts(table, write, uts_table.data);
+
+	if (write)
+		proc_sys_poll_notify(table->poll);
+
 	return r;
 }
 #else
 #define proc_do_uts_string NULL
 #endif
 
+static DEFINE_CTL_TABLE_POLL(hostname_poll);
+static DEFINE_CTL_TABLE_POLL(domainname_poll);
+
 static struct ctl_table uts_kern_table[] = {
 	{
 		.procname	= "ostype",
@@ -85,6 +93,7 @@ static struct ctl_table uts_kern_table[] = {
 		.maxlen		= sizeof(init_uts_ns.name.nodename),
 		.mode		= 0644,
 		.proc_handler	= proc_do_uts_string,
+		.poll		= &hostname_poll,
 	},
 	{
 		.procname	= "domainname",
@@ -92,6 +101,7 @@ static struct ctl_table uts_kern_table[] = {
 		.maxlen		= sizeof(init_uts_ns.name.domainname),
 		.mode		= 0644,
 		.proc_handler	= proc_do_uts_string,
+		.poll		= &domainname_poll,
 	},
 	{}
 };
@@ -105,6 +115,19 @@ static struct ctl_table uts_root_table[] = {
 	{}
 };
 
+#ifdef CONFIG_PROC_SYSCTL
+/*
+ * Notify userspace about a change in a certain entry of uts_kern_table,
+ * identified by the parameter proc.
+ */
+void uts_proc_notify(enum uts_proc proc)
+{
+	struct ctl_table *table = &uts_kern_table[proc];
+
+	proc_sys_poll_notify(table->poll);
+}
+#endif
+
 static int __init utsname_sysctl_init(void)
 {
 	register_sysctl_table(uts_root_table);
-- 
cgit v0.10.2


From c736de60aed869df8a9aba512cdaf89e32545b00 Mon Sep 17 00:00:00 2001
From: WANG Cong <amwang@redhat.com>
Date: Wed, 2 Nov 2011 13:39:25 -0700
Subject: sysctl: make CONFIG_SYSCTL_SYSCALL default to n

When I tried to send a patch to remove it, Andi told me we still need to
keep compabitlies for old libc, so we can't remove this completely.  Then
just make it default to n and remove the doc from
feature-removal-schedule.txt.

Signed-off-by: WANG Cong <amwang@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 7c799fc..3d84912 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -133,41 +133,6 @@ Who:	Pavel Machek <pavel@ucw.cz>
 
 ---------------------------
 
-What:	sys_sysctl
-When:	September 2010
-Option: CONFIG_SYSCTL_SYSCALL
-Why:	The same information is available in a more convenient from
-	/proc/sys, and none of the sysctl variables appear to be
-	important performance wise.
-
-	Binary sysctls are a long standing source of subtle kernel
-	bugs and security issues.
-
-	When I looked several months ago all I could find after
-	searching several distributions were 5 user space programs and
-	glibc (which falls back to /proc/sys) using this syscall.
-
-	The man page for sysctl(2) documents it as unusable for user
-	space programs.
-
-	sysctl(2) is not generally ABI compatible to a 32bit user
-	space application on a 64bit and a 32bit kernel.
-
-	For the last several months the policy has been no new binary
-	sysctls and no one has put forward an argument to use them.
-
-	Binary sysctls issues seem to keep happening appearing so
-	properly deprecating them (with a warning to user space) and a
-	2 year grace warning period will mean eventually we can kill
-	them and end the pain.
-
-	In the mean time individual binary sysctls can be dealt with
-	in a piecewise fashion.
-
-Who:	Eric Biederman <ebiederm@xmission.com>
-
----------------------------
-
 What:	/proc/<pid>/oom_adj
 When:	August 2012
 Why:	/proc/<pid>/oom_adj allows userspace to influence the oom killer's
diff --git a/init/Kconfig b/init/Kconfig
index 31ba0fd..43298f9 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -947,7 +947,7 @@ config UID16
 config SYSCTL_SYSCALL
 	bool "Sysctl syscall support" if EXPERT
 	depends on PROC_SYSCTL
-	default y
+	default n
 	select SYSCTL
 	---help---
 	  sys_sysctl uses binary paths that have been found challenging
@@ -959,7 +959,7 @@ config SYSCTL_SYSCALL
 	  trying to save some space it is probably safe to disable this,
 	  making your kernel marginally smaller.
 
-	  If unsure say Y here.
+	  If unsure say N here.
 
 config KALLSYMS
 	 bool "Load all symbols for debugging/ksymoops" if EXPERT
-- 
cgit v0.10.2


From 842fa69f3e0c9a178b294e7af7c07f4c9d9e7af2 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 2 Nov 2011 13:39:33 -0700
Subject: include/linux/dma-mapping.h: add dma_zalloc_coherent()

Lots of driver code does a dma_alloc_coherent() and then zeroes out the
memory with a memset.  Make it easy for them.

Cc: Alexandre Bounine <alexandre.bounine@idt.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index fe23269..66bd97a 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -50,6 +50,13 @@ specify the GFP_ flags (see kmalloc) for the allocation (the
 implementation may choose to ignore flags that affect the location of
 the returned memory, like GFP_DMA).
 
+void *
+dma_zalloc_coherent(struct device *dev, size_t size,
+			     dma_addr_t *dma_handle, gfp_t flag)
+
+Wraps dma_alloc_coherent() and also zeroes the returned memory if the
+allocation attempt succeeded.
+
 void
 dma_free_coherent(struct device *dev, size_t size, void *cpu_addr,
 			   dma_addr_t dma_handle)
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 347fdc3..be86ae1 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -1,6 +1,7 @@
 #ifndef _LINUX_DMA_MAPPING_H
 #define _LINUX_DMA_MAPPING_H
 
+#include <linux/string.h>
 #include <linux/device.h>
 #include <linux/err.h>
 #include <linux/dma-attrs.h>
@@ -117,6 +118,15 @@ static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask)
 		return -EIO;
 }
 
+static inline void *dma_zalloc_coherent(struct device *dev, size_t size,
+					dma_addr_t *dma_handle, gfp_t flag)
+{
+	void *ret = dma_alloc_coherent(dev, size, dma_handle, flag);
+	if (ret)
+		memset(ret, 0, size);
+	return ret;
+}
+
 #ifdef CONFIG_HAS_DMA
 static inline int dma_get_cache_alignment(void)
 {
-- 
cgit v0.10.2


From 437c53418616973071fd2d7c87497780944d8fdb Mon Sep 17 00:00:00 2001
From: James Nuss <jamesnuss@nanometrics.ca>
Date: Wed, 2 Nov 2011 13:39:34 -0700
Subject: pps: default echo function

A default echo function has been provided so it is no longer an error when
you specify PPS_ECHOASSERT or PPS_ECHOCLEAR without an explicit echo
function.  This allows some code re-use and also makes it easier to write
client drivers since the default echo function does not normally need to
change.

Signed-off-by: James Nuss <jamesnuss@nanometrics.ca>
Reviewed-by: Ben Gardiner <bengardiner@nanometrics.ca>
Acked-by: Rodolfo Giometti <giometti@linux.it>
Cc: Ricardo Martins <rasm@fe.up.pt>
Cc: Alexander Gordeev <lasaine@lvk.cs.msu.su>
Cc: Igor Plyatov <plyatov@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/pps/clients/pps-ktimer.c b/drivers/pps/clients/pps-ktimer.c
index 82583b0..436b4e4 100644
--- a/drivers/pps/clients/pps-ktimer.c
+++ b/drivers/pps/clients/pps-ktimer.c
@@ -52,17 +52,6 @@ static void pps_ktimer_event(unsigned long ptr)
 }
 
 /*
- * The echo function
- */
-
-static void pps_ktimer_echo(struct pps_device *pps, int event, void *data)
-{
-	dev_info(pps->dev, "echo %s %s\n",
-		event & PPS_CAPTUREASSERT ? "assert" : "",
-		event & PPS_CAPTURECLEAR ? "clear" : "");
-}
-
-/*
  * The PPS info struct
  */
 
@@ -72,7 +61,6 @@ static struct pps_source_info pps_ktimer_info = {
 	.mode		= PPS_CAPTUREASSERT | PPS_OFFSETASSERT |
 			  PPS_ECHOASSERT |
 			  PPS_CANWAIT | PPS_TSFMT_TSPEC,
-	.echo		= pps_ktimer_echo,
 	.owner		= THIS_MODULE,
 };
 
diff --git a/drivers/pps/clients/pps_parport.c b/drivers/pps/clients/pps_parport.c
index c571d6d..e1b4705 100644
--- a/drivers/pps/clients/pps_parport.c
+++ b/drivers/pps/clients/pps_parport.c
@@ -133,14 +133,6 @@ out_both:
 	return;
 }
 
-/* the PPS echo function */
-static void pps_echo(struct pps_device *pps, int event, void *data)
-{
-	dev_info(pps->dev, "echo %s %s\n",
-		event & PPS_CAPTUREASSERT ? "assert" : "",
-		event & PPS_CAPTURECLEAR ? "clear" : "");
-}
-
 static void parport_attach(struct parport *port)
 {
 	struct pps_client_pp *device;
@@ -151,7 +143,6 @@ static void parport_attach(struct parport *port)
 				  PPS_OFFSETASSERT | PPS_OFFSETCLEAR | \
 				  PPS_ECHOASSERT | PPS_ECHOCLEAR | \
 				  PPS_CANWAIT | PPS_TSFMT_TSPEC,
-		.echo		= pps_echo,
 		.owner		= THIS_MODULE,
 		.dev		= NULL
 	};
diff --git a/drivers/pps/kapi.c b/drivers/pps/kapi.c
index a4e8eb9..f197e8e 100644
--- a/drivers/pps/kapi.c
+++ b/drivers/pps/kapi.c
@@ -52,6 +52,14 @@ static void pps_add_offset(struct pps_ktime *ts, struct pps_ktime *offset)
 	ts->sec += offset->sec;
 }
 
+static void pps_echo_client_default(struct pps_device *pps, int event,
+		void *data)
+{
+	dev_info(pps->dev, "echo %s %s\n",
+		event & PPS_CAPTUREASSERT ? "assert" : "",
+		event & PPS_CAPTURECLEAR ? "clear" : "");
+}
+
 /*
  * Exported functions
  */
@@ -80,13 +88,6 @@ struct pps_device *pps_register_source(struct pps_source_info *info,
 		err = -EINVAL;
 		goto pps_register_source_exit;
 	}
-	if ((info->mode & (PPS_ECHOASSERT | PPS_ECHOCLEAR)) != 0 &&
-			info->echo == NULL) {
-		pr_err("%s: echo function is not defined\n",
-					info->name);
-		err = -EINVAL;
-		goto pps_register_source_exit;
-	}
 	if ((info->mode & (PPS_TSFMT_TSPEC | PPS_TSFMT_NTPFP)) == 0) {
 		pr_err("%s: unspecified time format\n",
 					info->name);
@@ -108,6 +109,11 @@ struct pps_device *pps_register_source(struct pps_source_info *info,
 	pps->params.mode = default_params;
 	pps->info = *info;
 
+	/* check for default echo function */
+	if ((pps->info.mode & (PPS_ECHOASSERT | PPS_ECHOCLEAR)) &&
+			pps->info.echo == NULL)
+		pps->info.echo = pps_echo_client_default;
+
 	init_waitqueue_head(&pps->queue);
 	spin_lock_init(&pps->lock);
 
-- 
cgit v0.10.2


From 161520451dfacd0eb79d501933f47d3fb7464938 Mon Sep 17 00:00:00 2001
From: James Nuss <jamesnuss@nanometrics.ca>
Date: Wed, 2 Nov 2011 13:39:38 -0700
Subject: pps: new client driver using GPIO

This client driver allows you to use a GPIO pin as a source for PPS
signals.  Platform data [1] are used to specify the GPIO pin number,
label, assert event edge type, and whether clear events are captured.

This driver is based on the work by Ricardo Martins who submitted an
initial implementation [2] of a PPS IRQ client driver to the linuxpps
mailing-list on Dec 3 2010.

[1] include/linux/pps-gpio.h
[2] http://ml.enneenne.com/pipermail/linuxpps/2010-December/004155.html

[akpm@linux-foundation.org: remove unneeded cast of void*]
Signed-off-by: James Nuss <jamesnuss@nanometrics.ca>
Cc: Ricardo Martins <rasm@fe.up.pt>
Acked-by: Rodolfo Giometti <giometti@linux.it>
Signed-off-by: Ricardo Martins <rasm@fe.up.pt>
Cc: Alexander Gordeev <lasaine@lvk.cs.msu.su>
Cc: Igor Plyatov <plyatov@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/pps/clients/Kconfig b/drivers/pps/clients/Kconfig
index 8520a7f..c2e0f1e 100644
--- a/drivers/pps/clients/Kconfig
+++ b/drivers/pps/clients/Kconfig
@@ -29,4 +29,13 @@ config PPS_CLIENT_PARPORT
 	  If you say yes here you get support for a PPS source connected
 	  with the interrupt pin of your parallel port.
 
+config PPS_CLIENT_GPIO
+	tristate "PPS client using GPIO"
+	depends on PPS
+	help
+	  If you say yes here you get support for a PPS source using
+	  GPIO. To be useful you must also register a platform device
+	  specifying the GPIO pin and other options, usually in your board
+	  setup.
+
 endif
diff --git a/drivers/pps/clients/Makefile b/drivers/pps/clients/Makefile
index 4feb7e9..a461d15 100644
--- a/drivers/pps/clients/Makefile
+++ b/drivers/pps/clients/Makefile
@@ -5,5 +5,6 @@
 obj-$(CONFIG_PPS_CLIENT_KTIMER)	+= pps-ktimer.o
 obj-$(CONFIG_PPS_CLIENT_LDISC)	+= pps-ldisc.o
 obj-$(CONFIG_PPS_CLIENT_PARPORT) += pps_parport.o
+obj-$(CONFIG_PPS_CLIENT_GPIO)	+= pps-gpio.o
 
 ccflags-$(CONFIG_PPS_DEBUG) := -DDEBUG
diff --git a/drivers/pps/clients/pps-gpio.c b/drivers/pps/clients/pps-gpio.c
new file mode 100644
index 0000000..6550555
--- /dev/null
+++ b/drivers/pps/clients/pps-gpio.c
@@ -0,0 +1,227 @@
+/*
+ * pps-gpio.c -- PPS client driver using GPIO
+ *
+ *
+ * Copyright (C) 2010 Ricardo Martins <rasm@fe.up.pt>
+ * Copyright (C) 2011 James Nuss <jamesnuss@nanometrics.ca>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define PPS_GPIO_NAME "pps-gpio"
+#define pr_fmt(fmt) PPS_GPIO_NAME ": " fmt
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/pps_kernel.h>
+#include <linux/pps-gpio.h>
+#include <linux/gpio.h>
+#include <linux/list.h>
+
+/* Info for each registered platform device */
+struct pps_gpio_device_data {
+	int irq;			/* IRQ used as PPS source */
+	struct pps_device *pps;		/* PPS source device */
+	struct pps_source_info info;	/* PPS source information */
+	const struct pps_gpio_platform_data *pdata;
+};
+
+/*
+ * Report the PPS event
+ */
+
+static irqreturn_t pps_gpio_irq_handler(int irq, void *data)
+{
+	const struct pps_gpio_device_data *info;
+	struct pps_event_time ts;
+	int rising_edge;
+
+	/* Get the time stamp first */
+	pps_get_ts(&ts);
+
+	info = data;
+
+	rising_edge = gpio_get_value(info->pdata->gpio_pin);
+	if ((rising_edge && !info->pdata->assert_falling_edge) ||
+			(!rising_edge && info->pdata->assert_falling_edge))
+		pps_event(info->pps, &ts, PPS_CAPTUREASSERT, NULL);
+	else if (info->pdata->capture_clear &&
+			((rising_edge && info->pdata->assert_falling_edge) ||
+			 (!rising_edge && !info->pdata->assert_falling_edge)))
+		pps_event(info->pps, &ts, PPS_CAPTURECLEAR, NULL);
+
+	return IRQ_HANDLED;
+}
+
+static int pps_gpio_setup(struct platform_device *pdev)
+{
+	int ret;
+	const struct pps_gpio_platform_data *pdata = pdev->dev.platform_data;
+
+	ret = gpio_request(pdata->gpio_pin, pdata->gpio_label);
+	if (ret) {
+		pr_warning("failed to request GPIO %u\n", pdata->gpio_pin);
+		return -EINVAL;
+	}
+
+	ret = gpio_direction_input(pdata->gpio_pin);
+	if (ret) {
+		pr_warning("failed to set pin direction\n");
+		gpio_free(pdata->gpio_pin);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static unsigned long
+get_irqf_trigger_flags(const struct pps_gpio_platform_data *pdata)
+{
+	unsigned long flags = pdata->assert_falling_edge ?
+		IRQF_TRIGGER_FALLING : IRQF_TRIGGER_RISING;
+
+	if (pdata->capture_clear) {
+		flags |= ((flags & IRQF_TRIGGER_RISING) ?
+				IRQF_TRIGGER_FALLING : IRQF_TRIGGER_RISING);
+	}
+
+	return flags;
+}
+
+static int pps_gpio_probe(struct platform_device *pdev)
+{
+	struct pps_gpio_device_data *data;
+	int irq;
+	int ret;
+	int err;
+	int pps_default_params;
+	const struct pps_gpio_platform_data *pdata = pdev->dev.platform_data;
+
+
+	/* GPIO setup */
+	ret = pps_gpio_setup(pdev);
+	if (ret)
+		return -EINVAL;
+
+	/* IRQ setup */
+	irq = gpio_to_irq(pdata->gpio_pin);
+	if (irq < 0) {
+		pr_err("failed to map GPIO to IRQ: %d\n", irq);
+		err = -EINVAL;
+		goto return_error;
+	}
+
+	/* allocate space for device info */
+	data = kzalloc(sizeof(struct pps_gpio_device_data), GFP_KERNEL);
+	if (data == NULL) {
+		err = -ENOMEM;
+		goto return_error;
+	}
+
+	/* initialize PPS specific parts of the bookkeeping data structure. */
+	data->info.mode = PPS_CAPTUREASSERT | PPS_OFFSETASSERT |
+		PPS_ECHOASSERT | PPS_CANWAIT | PPS_TSFMT_TSPEC;
+	if (pdata->capture_clear)
+		data->info.mode |= PPS_CAPTURECLEAR | PPS_OFFSETCLEAR |
+			PPS_ECHOCLEAR;
+	data->info.owner = THIS_MODULE;
+	snprintf(data->info.name, PPS_MAX_NAME_LEN - 1, "%s.%d",
+		 pdev->name, pdev->id);
+
+	/* register PPS source */
+	pps_default_params = PPS_CAPTUREASSERT | PPS_OFFSETASSERT;
+	if (pdata->capture_clear)
+		pps_default_params |= PPS_CAPTURECLEAR | PPS_OFFSETCLEAR;
+	data->pps = pps_register_source(&data->info, pps_default_params);
+	if (data->pps == NULL) {
+		kfree(data);
+		pr_err("failed to register IRQ %d as PPS source\n", irq);
+		err = -EINVAL;
+		goto return_error;
+	}
+
+	data->irq = irq;
+	data->pdata = pdata;
+
+	/* register IRQ interrupt handler */
+	ret = request_irq(irq, pps_gpio_irq_handler,
+			get_irqf_trigger_flags(pdata), data->info.name, data);
+	if (ret) {
+		pps_unregister_source(data->pps);
+		kfree(data);
+		pr_err("failed to acquire IRQ %d\n", irq);
+		err = -EINVAL;
+		goto return_error;
+	}
+
+	platform_set_drvdata(pdev, data);
+	dev_info(data->pps->dev, "Registered IRQ %d as PPS source\n", irq);
+
+	return 0;
+
+return_error:
+	gpio_free(pdata->gpio_pin);
+	return err;
+}
+
+static int pps_gpio_remove(struct platform_device *pdev)
+{
+	struct pps_gpio_device_data *data = platform_get_drvdata(pdev);
+	const struct pps_gpio_platform_data *pdata = data->pdata;
+
+	platform_set_drvdata(pdev, NULL);
+	free_irq(data->irq, data);
+	gpio_free(pdata->gpio_pin);
+	pps_unregister_source(data->pps);
+	pr_info("removed IRQ %d as PPS source\n", data->irq);
+	kfree(data);
+	return 0;
+}
+
+static struct platform_driver pps_gpio_driver = {
+	.probe		= pps_gpio_probe,
+	.remove		=  __devexit_p(pps_gpio_remove),
+	.driver		= {
+		.name	= PPS_GPIO_NAME,
+		.owner	= THIS_MODULE
+	},
+};
+
+static int __init pps_gpio_init(void)
+{
+	int ret = platform_driver_register(&pps_gpio_driver);
+	if (ret < 0)
+		pr_err("failed to register platform driver\n");
+	return ret;
+}
+
+static void __exit pps_gpio_exit(void)
+{
+	platform_driver_unregister(&pps_gpio_driver);
+	pr_debug("unregistered platform driver\n");
+}
+
+module_init(pps_gpio_init);
+module_exit(pps_gpio_exit);
+
+MODULE_AUTHOR("Ricardo Martins <rasm@fe.up.pt>");
+MODULE_AUTHOR("James Nuss <jamesnuss@nanometrics.ca>");
+MODULE_DESCRIPTION("Use GPIO pin as PPS source");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1.0.0");
diff --git a/include/linux/pps-gpio.h b/include/linux/pps-gpio.h
new file mode 100644
index 0000000..0035abe
--- /dev/null
+++ b/include/linux/pps-gpio.h
@@ -0,0 +1,32 @@
+/*
+ * pps-gpio.h -- PPS client for GPIOs
+ *
+ *
+ * Copyright (C) 2011 James Nuss <jamesnuss@nanometrics.ca>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _PPS_GPIO_H
+#define _PPS_GPIO_H
+
+struct pps_gpio_platform_data {
+	bool assert_falling_edge;
+	bool capture_clear;
+	unsigned int gpio_pin;
+	const char *gpio_label;
+};
+
+#endif
-- 
cgit v0.10.2


From 79bc57463be2ad5020a53accbf26898e8ac04550 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 2 Nov 2011 13:39:41 -0700
Subject: pps gpio client: add missing dependency

Add "depends on GENERIC_HARDIRQS" to avoid compile breakage on s390:

drivers/built-in.o: In function `pps_gpio_remove':
linux-next/drivers/pps/clients/pps-gpio.c:189: undefined reference to `free_irq'

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: James Nuss <jamesnuss@nanometrics.ca>
Cc: Rodolfo Giometti <giometti@enneenne.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/pps/clients/Kconfig b/drivers/pps/clients/Kconfig
index c2e0f1e..445197d 100644
--- a/drivers/pps/clients/Kconfig
+++ b/drivers/pps/clients/Kconfig
@@ -31,7 +31,7 @@ config PPS_CLIENT_PARPORT
 
 config PPS_CLIENT_GPIO
 	tristate "PPS client using GPIO"
-	depends on PPS
+	depends on PPS && GENERIC_HARDIRQS
 	help
 	  If you say yes here you get support for a PPS source using
 	  GPIO. To be useful you must also register a platform device
-- 
cgit v0.10.2


From 3e5428177c74df7f3b8c59b2f27f46b82b077e94 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <jic23@cam.ac.uk>
Date: Wed, 2 Nov 2011 13:39:43 -0700
Subject: w1: ds2760 and ds2780, use ida for id and ida_simple_get() to get it

Straightforward.  As an aside, the ida_init calls are not needed as far as
I can see needed.  (DEFINE_IDA does the same already).

Signed-off-by: Jonathan Cameron <jic23@cam.ac.uk>
Cc: Evgeniy Polyakov <zbr@ioremap.net>
Acked-by: Clifton Barnes <cabarnes@indesign-llc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/w1/slaves/w1_ds2760.c b/drivers/w1/slaves/w1_ds2760.c
index 483d451..5754c9a 100644
--- a/drivers/w1/slaves/w1_ds2760.c
+++ b/drivers/w1/slaves/w1_ds2760.c
@@ -114,43 +114,7 @@ static struct bin_attribute w1_ds2760_bin_attr = {
 	.read = w1_ds2760_read_bin,
 };
 
-static DEFINE_IDR(bat_idr);
-static DEFINE_MUTEX(bat_idr_lock);
-
-static int new_bat_id(void)
-{
-	int ret;
-
-	while (1) {
-		int id;
-
-		ret = idr_pre_get(&bat_idr, GFP_KERNEL);
-		if (ret == 0)
-			return -ENOMEM;
-
-		mutex_lock(&bat_idr_lock);
-		ret = idr_get_new(&bat_idr, NULL, &id);
-		mutex_unlock(&bat_idr_lock);
-
-		if (ret == 0) {
-			ret = id & MAX_ID_MASK;
-			break;
-		} else if (ret == -EAGAIN) {
-			continue;
-		} else {
-			break;
-		}
-	}
-
-	return ret;
-}
-
-static void release_bat_id(int id)
-{
-	mutex_lock(&bat_idr_lock);
-	idr_remove(&bat_idr, id);
-	mutex_unlock(&bat_idr_lock);
-}
+static DEFINE_IDA(bat_ida);
 
 static int w1_ds2760_add_slave(struct w1_slave *sl)
 {
@@ -158,7 +122,7 @@ static int w1_ds2760_add_slave(struct w1_slave *sl)
 	int id;
 	struct platform_device *pdev;
 
-	id = new_bat_id();
+	id = ida_simple_get(&bat_ida, 0, 0, GFP_KERNEL);
 	if (id < 0) {
 		ret = id;
 		goto noid;
@@ -187,7 +151,7 @@ bin_attr_failed:
 pdev_add_failed:
 	platform_device_unregister(pdev);
 pdev_alloc_failed:
-	release_bat_id(id);
+	ida_simple_remove(&bat_ida, id);
 noid:
 success:
 	return ret;
@@ -199,7 +163,7 @@ static void w1_ds2760_remove_slave(struct w1_slave *sl)
 	int id = pdev->id;
 
 	platform_device_unregister(pdev);
-	release_bat_id(id);
+	ida_simple_remove(&bat_ida, id);
 	sysfs_remove_bin_file(&sl->dev.kobj, &w1_ds2760_bin_attr);
 }
 
@@ -217,14 +181,14 @@ static int __init w1_ds2760_init(void)
 {
 	printk(KERN_INFO "1-Wire driver for the DS2760 battery monitor "
 	       " chip  - (c) 2004-2005, Szabolcs Gyurko\n");
-	idr_init(&bat_idr);
+	ida_init(&bat_ida);
 	return w1_register_family(&w1_ds2760_family);
 }
 
 static void __exit w1_ds2760_exit(void)
 {
 	w1_unregister_family(&w1_ds2760_family);
-	idr_destroy(&bat_idr);
+	ida_destroy(&bat_ida);
 }
 
 EXPORT_SYMBOL(w1_ds2760_read);
diff --git a/drivers/w1/slaves/w1_ds2780.c b/drivers/w1/slaves/w1_ds2780.c
index 274c8f3..a134b38 100644
--- a/drivers/w1/slaves/w1_ds2780.c
+++ b/drivers/w1/slaves/w1_ds2780.c
@@ -99,43 +99,7 @@ static struct bin_attribute w1_ds2780_bin_attr = {
 	.read = w1_ds2780_read_bin,
 };
 
-static DEFINE_IDR(bat_idr);
-static DEFINE_MUTEX(bat_idr_lock);
-
-static int new_bat_id(void)
-{
-	int ret;
-
-	while (1) {
-		int id;
-
-		ret = idr_pre_get(&bat_idr, GFP_KERNEL);
-		if (ret == 0)
-			return -ENOMEM;
-
-		mutex_lock(&bat_idr_lock);
-		ret = idr_get_new(&bat_idr, NULL, &id);
-		mutex_unlock(&bat_idr_lock);
-
-		if (ret == 0) {
-			ret = id & MAX_ID_MASK;
-			break;
-		} else if (ret == -EAGAIN) {
-			continue;
-		} else {
-			break;
-		}
-	}
-
-	return ret;
-}
-
-static void release_bat_id(int id)
-{
-	mutex_lock(&bat_idr_lock);
-	idr_remove(&bat_idr, id);
-	mutex_unlock(&bat_idr_lock);
-}
+static DEFINE_IDA(bat_ida);
 
 static int w1_ds2780_add_slave(struct w1_slave *sl)
 {
@@ -143,7 +107,7 @@ static int w1_ds2780_add_slave(struct w1_slave *sl)
 	int id;
 	struct platform_device *pdev;
 
-	id = new_bat_id();
+	id = ida_simple_get(&bat_ida, 0, 0, GFP_KERNEL);
 	if (id < 0) {
 		ret = id;
 		goto noid;
@@ -172,7 +136,7 @@ bin_attr_failed:
 pdev_add_failed:
 	platform_device_unregister(pdev);
 pdev_alloc_failed:
-	release_bat_id(id);
+	ida_simple_remove(&bat_ida, id);
 noid:
 	return ret;
 }
@@ -183,7 +147,7 @@ static void w1_ds2780_remove_slave(struct w1_slave *sl)
 	int id = pdev->id;
 
 	platform_device_unregister(pdev);
-	release_bat_id(id);
+	ida_simple_remove(&bat_ida, id);
 	sysfs_remove_bin_file(&sl->dev.kobj, &w1_ds2780_bin_attr);
 }
 
@@ -199,14 +163,14 @@ static struct w1_family w1_ds2780_family = {
 
 static int __init w1_ds2780_init(void)
 {
-	idr_init(&bat_idr);
+	ida_init(&bat_ida);
 	return w1_register_family(&w1_ds2780_family);
 }
 
 static void __exit w1_ds2780_exit(void)
 {
 	w1_unregister_family(&w1_ds2780_family);
-	idr_destroy(&bat_idr);
+	ida_destroy(&bat_ida);
 }
 
 module_init(w1_ds2780_init);
-- 
cgit v0.10.2


From 853eee72f74f449797f0500ea19fc1bf497428d8 Mon Sep 17 00:00:00 2001
From: Clifton Barnes <cabarnes@indesign-llc.com>
Date: Wed, 2 Nov 2011 13:39:50 -0700
Subject: drivers/power/ds2780_battery.c: create central point for calling w1
 interface

Simply creates one point to call the w1 interface.

Signed-off-by: Clifton Barnes <cabarnes@indesign-llc.com>
Cc: Evgeniy Polyakov <zbr@ioremap.net>
Cc: <stable@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/power/ds2780_battery.c b/drivers/power/ds2780_battery.c
index 1fefe82..2571b41 100644
--- a/drivers/power/ds2780_battery.c
+++ b/drivers/power/ds2780_battery.c
@@ -49,8 +49,8 @@ enum current_types {
 static const char model[] = "DS2780";
 static const char manufacturer[] = "Maxim/Dallas";
 
-static inline struct ds2780_device_info *to_ds2780_device_info(
-	struct power_supply *psy)
+static inline struct ds2780_device_info *
+to_ds2780_device_info(struct power_supply *psy)
 {
 	return container_of(psy, struct ds2780_device_info, bat);
 }
@@ -60,17 +60,25 @@ static inline struct power_supply *to_power_supply(struct device *dev)
 	return dev_get_drvdata(dev);
 }
 
-static inline int ds2780_read8(struct device *dev, u8 *val, int addr)
+static inline int ds2780_battery_io(struct ds2780_device_info *dev_info,
+	char *buf, int addr, size_t count, int io)
 {
-	return w1_ds2780_io(dev, val, addr, sizeof(u8), 0);
+		return w1_ds2780_io(dev_info->w1_dev, buf, addr, count, io);
 }
 
-static int ds2780_read16(struct device *dev, s16 *val, int addr)
+static inline int ds2780_read8(struct ds2780_device_info *dev_info, u8 *val,
+	int addr)
+{
+	return ds2780_battery_io(dev_info, val, addr, sizeof(u8), 0);
+}
+
+static int ds2780_read16(struct ds2780_device_info *dev_info, s16 *val,
+	int addr)
 {
 	int ret;
 	u8 raw[2];
 
-	ret = w1_ds2780_io(dev, raw, addr, sizeof(u8) * 2, 0);
+	ret = ds2780_battery_io(dev_info, raw, addr, sizeof(raw), 0);
 	if (ret < 0)
 		return ret;
 
@@ -79,16 +87,16 @@ static int ds2780_read16(struct device *dev, s16 *val, int addr)
 	return 0;
 }
 
-static inline int ds2780_read_block(struct device *dev, u8 *val, int addr,
-	size_t count)
+static inline int ds2780_read_block(struct ds2780_device_info *dev_info,
+	u8 *val, int addr, size_t count)
 {
-	return w1_ds2780_io(dev, val, addr, count, 0);
+	return ds2780_battery_io(dev_info, val, addr, count, 0);
 }
 
-static inline int ds2780_write(struct device *dev, u8 *val, int addr,
-	size_t count)
+static inline int ds2780_write(struct ds2780_device_info *dev_info, u8 *val,
+	int addr, size_t count)
 {
-	return w1_ds2780_io(dev, val, addr, count, 1);
+	return ds2780_battery_io(dev_info, val, addr, count, 1);
 }
 
 static inline int ds2780_store_eeprom(struct device *dev, int addr)
@@ -122,7 +130,7 @@ static int ds2780_set_sense_register(struct ds2780_device_info *dev_info,
 {
 	int ret;
 
-	ret = ds2780_write(dev_info->w1_dev, &conductance,
+	ret = ds2780_write(dev_info, &conductance,
 				DS2780_RSNSP_REG, sizeof(u8));
 	if (ret < 0)
 		return ret;
@@ -134,7 +142,7 @@ static int ds2780_set_sense_register(struct ds2780_device_info *dev_info,
 static int ds2780_get_rsgain_register(struct ds2780_device_info *dev_info,
 	u16 *rsgain)
 {
-	return ds2780_read16(dev_info->w1_dev, rsgain, DS2780_RSGAIN_MSB_REG);
+	return ds2780_read16(dev_info, rsgain, DS2780_RSGAIN_MSB_REG);
 }
 
 /* Set RSGAIN value from 0 to 1.999 in steps of 0.001 */
@@ -144,8 +152,8 @@ static int ds2780_set_rsgain_register(struct ds2780_device_info *dev_info,
 	int ret;
 	u8 raw[] = {rsgain >> 8, rsgain & 0xFF};
 
-	ret = ds2780_write(dev_info->w1_dev, raw,
-				DS2780_RSGAIN_MSB_REG, sizeof(u8) * 2);
+	ret = ds2780_write(dev_info, raw,
+				DS2780_RSGAIN_MSB_REG, sizeof(raw));
 	if (ret < 0)
 		return ret;
 
@@ -167,7 +175,7 @@ static int ds2780_get_voltage(struct ds2780_device_info *dev_info,
 	 * Bits 2 - 0 of the voltage value are in bits 7 - 5 of the
 	 * voltage LSB register
 	 */
-	ret = ds2780_read16(dev_info->w1_dev, &voltage_raw,
+	ret = ds2780_read16(dev_info, &voltage_raw,
 				DS2780_VOLT_MSB_REG);
 	if (ret < 0)
 		return ret;
@@ -196,7 +204,7 @@ static int ds2780_get_temperature(struct ds2780_device_info *dev_info,
 	 * Bits 2 - 0 of the temperature value are in bits 7 - 5 of the
 	 * temperature LSB register
 	 */
-	ret = ds2780_read16(dev_info->w1_dev, &temperature_raw,
+	ret = ds2780_read16(dev_info, &temperature_raw,
 				DS2780_TEMP_MSB_REG);
 	if (ret < 0)
 		return ret;
@@ -222,13 +230,13 @@ static int ds2780_get_current(struct ds2780_device_info *dev_info,
 	 * The units of measurement for current are dependent on the value of
 	 * the sense resistor.
 	 */
-	ret = ds2780_read8(dev_info->w1_dev, &sense_res_raw, DS2780_RSNSP_REG);
+	ret = ds2780_read8(dev_info, &sense_res_raw, DS2780_RSNSP_REG);
 	if (ret < 0)
 		return ret;
 
 	if (sense_res_raw == 0) {
 		dev_err(dev_info->dev, "sense resistor value is 0\n");
-		return -ENXIO;
+		return -EINVAL;
 	}
 	sense_res = 1000 / sense_res_raw;
 
@@ -248,7 +256,7 @@ static int ds2780_get_current(struct ds2780_device_info *dev_info,
 	 * Bits 7 - 0 of the current value are in bits 7 - 0 of the current
 	 * LSB register
 	 */
-	ret = ds2780_read16(dev_info->w1_dev, &current_raw, reg_msb);
+	ret = ds2780_read16(dev_info, &current_raw, reg_msb);
 	if (ret < 0)
 		return ret;
 
@@ -267,7 +275,7 @@ static int ds2780_get_accumulated_current(struct ds2780_device_info *dev_info,
 	 * The units of measurement for accumulated current are dependent on
 	 * the value of the sense resistor.
 	 */
-	ret = ds2780_read8(dev_info->w1_dev, &sense_res_raw, DS2780_RSNSP_REG);
+	ret = ds2780_read8(dev_info, &sense_res_raw, DS2780_RSNSP_REG);
 	if (ret < 0)
 		return ret;
 
@@ -285,7 +293,7 @@ static int ds2780_get_accumulated_current(struct ds2780_device_info *dev_info,
 	 * Bits 7 - 0 of the ACR value are in bits 7 - 0 of the ACR
 	 * LSB register
 	 */
-	ret = ds2780_read16(dev_info->w1_dev, &current_raw, DS2780_ACR_MSB_REG);
+	ret = ds2780_read16(dev_info, &current_raw, DS2780_ACR_MSB_REG);
 	if (ret < 0)
 		return ret;
 
@@ -299,7 +307,7 @@ static int ds2780_get_capacity(struct ds2780_device_info *dev_info,
 	int ret;
 	u8 raw;
 
-	ret = ds2780_read8(dev_info->w1_dev, &raw, DS2780_RARC_REG);
+	ret = ds2780_read8(dev_info, &raw, DS2780_RARC_REG);
 	if (ret < 0)
 		return ret;
 
@@ -345,7 +353,7 @@ static int ds2780_get_charge_now(struct ds2780_device_info *dev_info,
 	 * Bits 7 - 0 of the RAAC value are in bits 7 - 0 of the RAAC
 	 * LSB register
 	 */
-	ret = ds2780_read16(dev_info->w1_dev, &charge_raw, DS2780_RAAC_MSB_REG);
+	ret = ds2780_read16(dev_info, &charge_raw, DS2780_RAAC_MSB_REG);
 	if (ret < 0)
 		return ret;
 
@@ -356,7 +364,7 @@ static int ds2780_get_charge_now(struct ds2780_device_info *dev_info,
 static int ds2780_get_control_register(struct ds2780_device_info *dev_info,
 	u8 *control_reg)
 {
-	return ds2780_read8(dev_info->w1_dev, control_reg, DS2780_CONTROL_REG);
+	return ds2780_read8(dev_info, control_reg, DS2780_CONTROL_REG);
 }
 
 static int ds2780_set_control_register(struct ds2780_device_info *dev_info,
@@ -364,7 +372,7 @@ static int ds2780_set_control_register(struct ds2780_device_info *dev_info,
 {
 	int ret;
 
-	ret = ds2780_write(dev_info->w1_dev, &control_reg,
+	ret = ds2780_write(dev_info, &control_reg,
 				DS2780_CONTROL_REG, sizeof(u8));
 	if (ret < 0)
 		return ret;
@@ -503,7 +511,7 @@ static ssize_t ds2780_get_sense_resistor_value(struct device *dev,
 	struct power_supply *psy = to_power_supply(dev);
 	struct ds2780_device_info *dev_info = to_ds2780_device_info(psy);
 
-	ret = ds2780_read8(dev_info->w1_dev, &sense_resistor, DS2780_RSNSP_REG);
+	ret = ds2780_read8(dev_info, &sense_resistor, DS2780_RSNSP_REG);
 	if (ret < 0)
 		return ret;
 
@@ -584,7 +592,7 @@ static ssize_t ds2780_get_pio_pin(struct device *dev,
 	struct power_supply *psy = to_power_supply(dev);
 	struct ds2780_device_info *dev_info = to_ds2780_device_info(psy);
 
-	ret = ds2780_read8(dev_info->w1_dev, &sfr, DS2780_SFR_REG);
+	ret = ds2780_read8(dev_info, &sfr, DS2780_SFR_REG);
 	if (ret < 0)
 		return ret;
 
@@ -611,7 +619,7 @@ static ssize_t ds2780_set_pio_pin(struct device *dev,
 		return -EINVAL;
 	}
 
-	ret = ds2780_write(dev_info->w1_dev, &new_setting,
+	ret = ds2780_write(dev_info, &new_setting,
 				DS2780_SFR_REG, sizeof(u8));
 	if (ret < 0)
 		return ret;
@@ -632,7 +640,7 @@ static ssize_t ds2780_read_param_eeprom_bin(struct file *filp,
 		DS2780_EEPROM_BLOCK1_END -
 		DS2780_EEPROM_BLOCK1_START + 1 - off);
 
-	return ds2780_read_block(dev_info->w1_dev, buf,
+	return ds2780_read_block(dev_info, buf,
 				DS2780_EEPROM_BLOCK1_START + off, count);
 }
 
@@ -650,7 +658,7 @@ static ssize_t ds2780_write_param_eeprom_bin(struct file *filp,
 		DS2780_EEPROM_BLOCK1_END -
 		DS2780_EEPROM_BLOCK1_START + 1 - off);
 
-	ret = ds2780_write(dev_info->w1_dev, buf,
+	ret = ds2780_write(dev_info, buf,
 				DS2780_EEPROM_BLOCK1_START + off, count);
 	if (ret < 0)
 		return ret;
@@ -685,9 +693,8 @@ static ssize_t ds2780_read_user_eeprom_bin(struct file *filp,
 		DS2780_EEPROM_BLOCK0_END -
 		DS2780_EEPROM_BLOCK0_START + 1 - off);
 
-	return ds2780_read_block(dev_info->w1_dev, buf,
+	return ds2780_read_block(dev_info, buf,
 				DS2780_EEPROM_BLOCK0_START + off, count);
-
 }
 
 static ssize_t ds2780_write_user_eeprom_bin(struct file *filp,
@@ -704,7 +711,7 @@ static ssize_t ds2780_write_user_eeprom_bin(struct file *filp,
 		DS2780_EEPROM_BLOCK0_END -
 		DS2780_EEPROM_BLOCK0_START + 1 - off);
 
-	ret = ds2780_write(dev_info->w1_dev, buf,
+	ret = ds2780_write(dev_info, buf,
 				DS2780_EEPROM_BLOCK0_START + off, count);
 	if (ret < 0)
 		return ret;
-- 
cgit v0.10.2


From 9fe678fa2feb4aaac0b4220de63e1b7f8ccebae6 Mon Sep 17 00:00:00 2001
From: Clifton Barnes <cabarnes@indesign-llc.com>
Date: Wed, 2 Nov 2011 13:39:52 -0700
Subject: drivers/power/ds2780_battery.c: add a nolock function to w1 interface

Adds a nolock function to the w1 interface to avoid locking the
mutex if needed.

Signed-off-by: Clifton Barnes <cabarnes@indesign-llc.com>
Cc: Evgeniy Polyakov <zbr@ioremap.net>
Cc: <stable@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/w1/slaves/w1_ds2780.c b/drivers/w1/slaves/w1_ds2780.c
index a134b38..39f78c0 100644
--- a/drivers/w1/slaves/w1_ds2780.c
+++ b/drivers/w1/slaves/w1_ds2780.c
@@ -26,20 +26,14 @@
 #include "../w1_family.h"
 #include "w1_ds2780.h"
 
-int w1_ds2780_io(struct device *dev, char *buf, int addr, size_t count,
-			int io)
+static int w1_ds2780_do_io(struct device *dev, char *buf, int addr,
+			size_t count, int io)
 {
 	struct w1_slave *sl = container_of(dev, struct w1_slave, dev);
 
-	if (!dev)
-		return -ENODEV;
+	if (addr > DS2780_DATA_SIZE || addr < 0)
+		return 0;
 
-	mutex_lock(&sl->master->mutex);
-
-	if (addr > DS2780_DATA_SIZE || addr < 0) {
-		count = 0;
-		goto out;
-	}
 	count = min_t(int, count, DS2780_DATA_SIZE - addr);
 
 	if (w1_reset_select_slave(sl) == 0) {
@@ -47,7 +41,6 @@ int w1_ds2780_io(struct device *dev, char *buf, int addr, size_t count,
 			w1_write_8(sl->master, W1_DS2780_WRITE_DATA);
 			w1_write_8(sl->master, addr);
 			w1_write_block(sl->master, buf, count);
-			/* XXX w1_write_block returns void, not n_written */
 		} else {
 			w1_write_8(sl->master, W1_DS2780_READ_DATA);
 			w1_write_8(sl->master, addr);
@@ -55,13 +48,42 @@ int w1_ds2780_io(struct device *dev, char *buf, int addr, size_t count,
 		}
 	}
 
-out:
+	return count;
+}
+
+int w1_ds2780_io(struct device *dev, char *buf, int addr, size_t count,
+			int io)
+{
+	struct w1_slave *sl = container_of(dev, struct w1_slave, dev);
+	int ret;
+
+	if (!dev)
+		return -ENODEV;
+
+	mutex_lock(&sl->master->mutex);
+
+	ret = w1_ds2780_do_io(dev, buf, addr, count, io);
+
 	mutex_unlock(&sl->master->mutex);
 
-	return count;
+	return ret;
 }
 EXPORT_SYMBOL(w1_ds2780_io);
 
+int w1_ds2780_io_nolock(struct device *dev, char *buf, int addr, size_t count,
+			int io)
+{
+	int ret;
+
+	if (!dev)
+		return -ENODEV;
+
+	ret = w1_ds2780_do_io(dev, buf, addr, count, io);
+
+	return ret;
+}
+EXPORT_SYMBOL(w1_ds2780_io_nolock);
+
 int w1_ds2780_eeprom_cmd(struct device *dev, int addr, int cmd)
 {
 	struct w1_slave *sl = container_of(dev, struct w1_slave, dev);
diff --git a/drivers/w1/slaves/w1_ds2780.h b/drivers/w1/slaves/w1_ds2780.h
index a1fba79..7373793 100644
--- a/drivers/w1/slaves/w1_ds2780.h
+++ b/drivers/w1/slaves/w1_ds2780.h
@@ -124,6 +124,8 @@
 
 extern int w1_ds2780_io(struct device *dev, char *buf, int addr, size_t count,
 			int io);
+extern int w1_ds2780_io_nolock(struct device *dev, char *buf, int addr,
+			size_t count, int io);
 extern int w1_ds2780_eeprom_cmd(struct device *dev, int addr, int cmd);
 
 #endif /* !_W1_DS2780_H */
-- 
cgit v0.10.2


From 0e053fcbbbc4d945247cb32cad2767b483cb65f8 Mon Sep 17 00:00:00 2001
From: Clifton Barnes <cabarnes@indesign-llc.com>
Date: Wed, 2 Nov 2011 13:39:55 -0700
Subject: drivers/power/ds2780_battery.c: fix deadlock upon insertion and
 removal

Fixes the deadlock when inserting and removing the ds2780.

Signed-off-by: Clifton Barnes <cabarnes@indesign-llc.com>
Cc: Evgeniy Polyakov <zbr@ioremap.net>
Cc: <stable@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/power/ds2780_battery.c b/drivers/power/ds2780_battery.c
index 2571b41..91a783d 100644
--- a/drivers/power/ds2780_battery.c
+++ b/drivers/power/ds2780_battery.c
@@ -39,6 +39,7 @@ struct ds2780_device_info {
 	struct device *dev;
 	struct power_supply bat;
 	struct device *w1_dev;
+	struct task_struct *mutex_holder;
 };
 
 enum current_types {
@@ -63,6 +64,9 @@ static inline struct power_supply *to_power_supply(struct device *dev)
 static inline int ds2780_battery_io(struct ds2780_device_info *dev_info,
 	char *buf, int addr, size_t count, int io)
 {
+	if (dev_info->mutex_holder == current)
+		return w1_ds2780_io_nolock(dev_info->w1_dev, buf, addr, count, io);
+	else
 		return w1_ds2780_io(dev_info->w1_dev, buf, addr, count, io);
 }
 
@@ -775,6 +779,7 @@ static int __devinit ds2780_battery_probe(struct platform_device *pdev)
 	dev_info->bat.properties	= ds2780_battery_props;
 	dev_info->bat.num_properties	= ARRAY_SIZE(ds2780_battery_props);
 	dev_info->bat.get_property	= ds2780_battery_get_property;
+	dev_info->mutex_holder		= current;
 
 	ret = power_supply_register(&pdev->dev, &dev_info->bat);
 	if (ret) {
@@ -804,6 +809,8 @@ static int __devinit ds2780_battery_probe(struct platform_device *pdev)
 		goto fail_remove_bin_file;
 	}
 
+	dev_info->mutex_holder = NULL;
+
 	return 0;
 
 fail_remove_bin_file:
@@ -823,6 +830,8 @@ static int __devexit ds2780_battery_remove(struct platform_device *pdev)
 {
 	struct ds2780_device_info *dev_info = platform_get_drvdata(pdev);
 
+	dev_info->mutex_holder = current;
+
 	/* remove attributes */
 	sysfs_remove_group(&dev_info->bat.dev->kobj, &ds2780_attr_group);
 
-- 
cgit v0.10.2


From 68a436aec345c2bcd05dbdafae1f5f608ff8f61f Mon Sep 17 00:00:00 2001
From: Florian Faber <faber@faberman.de>
Date: Wed, 2 Nov 2011 13:39:59 -0700
Subject: drivers/w1/w1_int.c: multiple masters used same init_name

When using multiple masters, w1_int.c would use the .init_name from w1.c
for all entities, which will fail when creating a corresponding sysfs
entry.  This patch uses the unique name previously generated.

  WARNING: at fs/sysfs/dir.c:451 sysfs_add_one+0x48/0x64()
  sysfs: cannot create duplicate filename '/devices/w1 bus master'
  Modules linked in:
  Call trace:
   [<9001a604>] warn_slowpath_common+0x34/0x44
   [<9001a64c>] warn_slowpath_fmt+0x14/0x18
   [<90078020>] sysfs_add_one+0x48/0x64
   [<900784ec>] create_dir+0x40/0x68
   [<9007857a>] sysfs_create_dir+0x66/0x78
   [<900c1a8a>] kobject_add_internal+0x6e/0x104
   [<900c1bc0>] kobject_add_varg+0x20/0x2c
   [<900c1c1c>] kobject_add+0x30/0x3c
   [<900dbd66>] device_add+0x6a/0x378
   [<900dbb4a>] device_initialize+0x12/0x48
   [<900dc080>] device_register+0xc/0x10
   [<900f99be>] w1_add_master_device+0x162/0x274
   [<90008e7a>] w1_gpio_probe+0x66/0xb4
   [<9000030c>] kernel_init+0x0/0xe8
   [<900dde54>] platform_drv_probe+0xc/0xe
   [<9000030c>] kernel_init+0x0/0xe8
   [<900dd4f8>] driver_probe_device+0x6c/0xdc
   [<900dd5fc>] __driver_attach+0x34/0x48
   [<900dcce8>] bus_for_each_dev+0x2c/0x48
   [<900dd5c8>] __driver_attach+0x0/0x48
   [<900dd38c>] driver_attach+0x10/0x14
   [<900dd16a>] bus_add_driver+0x6a/0x18c
   [<900dd768>] driver_register+0x60/0xb8
   [<90011594>] __initcall_w1_therm_init6+0x0/0x4
   [<90008e00>] w1_gpio_init+0x0/0x14
   [<9000030c>] kernel_init+0x0/0xe8
   [<900ddf48>] platform_driver_register+0x30/0x38
   [<90011594>] __initcall_w1_therm_init6+0x0/0x4
   [<90008e00>] w1_gpio_init+0x0/0x14
   [<9000030c>] kernel_init+0x0/0xe8
   [<900ddf5e>] platform_driver_probe+0xe/0x3c
   [<90008e0c>] w1_gpio_init+0xc/0x14
   [<90011594>] __initcall_w1_therm_init6+0x0/0x4
   [<90008e00>] w1_gpio_init+0x0/0x14
   [<900126d4>] do_one_initcall+0x34/0x130
   [<90000372>] kernel_init+0x66/0xe8
   [<90011594>] __initcall_w1_therm_init6+0x0/0x4
   [<9001ca3e>] do_exit+0x0/0x3a6
   [<9000030c>] kernel_init+0x0/0xe8
   [<9001ca3e>] do_exit+0x0/0x3a6

  ---[ end trace 5a9233884fead918 ]---
  kobject_add_internal failed for w1 bus master with -EEXIST, don't try to register things with the same name in the same directory.

Signed-off-by: Florian Faber <faber@faberman.de>
Cc: Evgeniy Polyakov <zbr@ioremap.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/w1/w1_int.c b/drivers/w1/w1_int.c
index d220bce..f79e62e 100644
--- a/drivers/w1/w1_int.c
+++ b/drivers/w1/w1_int.c
@@ -78,6 +78,7 @@ static struct w1_master * w1_alloc_dev(u32 id, int slave_count, int slave_ttl,
 	memcpy(&dev->dev, device, sizeof(struct device));
 	dev_set_name(&dev->dev, "w1_bus_master%u", dev->id);
 	snprintf(dev->name, sizeof(dev->name), "w1_bus_master%u", dev->id);
+	dev->dev.init_name = dev->name;
 
 	dev->driver = driver;
 
-- 
cgit v0.10.2


From 3fd306c85adcde7209281cb663dd8ea247e97cc3 Mon Sep 17 00:00:00 2001
From: Jan Weitzel <j.weitzel@phytec.de>
Date: Wed, 2 Nov 2011 13:40:02 -0700
Subject: w1: disable irqs in critical section

Interrupting w1_delay() in w1_read_bit() results in missing the low level
on the w1 line and receiving "1" instead of "0".

Add local_irq_save()/local_irq_restore() around the critical section

Signed-off-by: Jan Weitzel <j.weitzel@phytec.de>
Acked-by: Evgeniy Polyakov <zbr@ioremap.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/w1/w1_io.c b/drivers/w1/w1_io.c
index 765b37b..3135b2c 100644
--- a/drivers/w1/w1_io.c
+++ b/drivers/w1/w1_io.c
@@ -158,13 +158,18 @@ EXPORT_SYMBOL_GPL(w1_write_8);
 static u8 w1_read_bit(struct w1_master *dev)
 {
 	int result;
+	unsigned long flags;
 
+	/* sample timing is critical here */
+	local_irq_save(flags);
 	dev->bus_master->write_bit(dev->bus_master->data, 0);
 	w1_delay(6);
 	dev->bus_master->write_bit(dev->bus_master->data, 1);
 	w1_delay(9);
 
 	result = dev->bus_master->read_bit(dev->bus_master->data);
+	local_irq_restore(flags);
+
 	w1_delay(55);
 
 	return result & 0x1;
-- 
cgit v0.10.2


From 6d994a7e42ab219ba3c10d5ffccf20990252881e Mon Sep 17 00:00:00 2001
From: Rakib Mullick <rakib.mullick@gmail.com>
Date: Wed, 2 Nov 2011 13:40:04 -0700
Subject: drivers/misc/vmw_balloon.c: determine page allocation flag can_sleep
 outside loop

In vmballoon_reserve_page(), flags has been passed from the callee
function (vmballoon_inflate here).  So, we can determine can_sleep outside
the loop.

Signed-off-by: Rakib Mullick <rakib.mullick@gmail.com>
Acked-by: Dmitry Torokhov <dtor@vmware.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c
index 053d36c..6983d80 100644
--- a/drivers/misc/vmw_balloon.c
+++ b/drivers/misc/vmw_balloon.c
@@ -412,6 +412,7 @@ static int vmballoon_reserve_page(struct vmballoon *b, bool can_sleep)
 	gfp_t flags;
 	unsigned int hv_status;
 	bool locked = false;
+	flags = can_sleep ? VMW_PAGE_ALLOC_CANSLEEP : VMW_PAGE_ALLOC_NOSLEEP;
 
 	do {
 		if (!can_sleep)
@@ -419,7 +420,6 @@ static int vmballoon_reserve_page(struct vmballoon *b, bool can_sleep)
 		else
 			STATS_INC(b->stats.sleep_alloc);
 
-		flags = can_sleep ? VMW_PAGE_ALLOC_CANSLEEP : VMW_PAGE_ALLOC_NOSLEEP;
 		page = alloc_page(flags);
 		if (!page) {
 			if (!can_sleep)
-- 
cgit v0.10.2


From 2ca02df6b098be2d33a99a65531dcd84a10b6e21 Mon Sep 17 00:00:00 2001
From: Rakib Mullick <rakib.mullick@gmail.com>
Date: Wed, 2 Nov 2011 13:40:07 -0700
Subject: drivers/misc/vmw_balloon.c: fix typo in code comment

Fix typo in code comment.

Signed-off-by: Rakib Mullick <rakib.mullick@gmail.com>
Acked-by: Dmitry Torokhov <dtor@vmware.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c
index 6983d80..cd41d40 100644
--- a/drivers/misc/vmw_balloon.c
+++ b/drivers/misc/vmw_balloon.c
@@ -151,7 +151,7 @@ MODULE_LICENSE("GPL");
 struct vmballoon_stats {
 	unsigned int timer;
 
-	/* allocation statustics */
+	/* allocation statistics */
 	unsigned int alloc;
 	unsigned int alloc_fail;
 	unsigned int sleep_alloc;
-- 
cgit v0.10.2


From 080d676de095a14ecba14c0b9a91acb5bbb634df Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Wed, 2 Nov 2011 13:40:10 -0700
Subject: aio: allocate kiocbs in batches

In testing aio on a fast storage device, I found that the context lock
takes up a fair amount of cpu time in the I/O submission path.  The reason
is that we take it for every I/O submitted (see __aio_get_req).  Since we
know how many I/Os are passed to io_submit, we can preallocate the kiocbs
in batches, reducing the number of times we take and release the lock.

In my testing, I was able to reduce the amount of time spent in
_raw_spin_lock_irq by .56% (average of 3 runs).  The command I used to
test this was:

   aio-stress -O -o 2 -o 3 -r 8 -d 128 -b 32 -i 32 -s 16384 <dev>

I also tested the patch with various numbers of events passed to
io_submit, and I ran the xfstests aio group of tests to ensure I didn't
break anything.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Cc: Daniel Ehrenberg <dehrenberg@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/fs/aio.c b/fs/aio.c
index 632b235..78c514c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -440,8 +440,6 @@ void exit_aio(struct mm_struct *mm)
 static struct kiocb *__aio_get_req(struct kioctx *ctx)
 {
 	struct kiocb *req = NULL;
-	struct aio_ring *ring;
-	int okay = 0;
 
 	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
 	if (unlikely(!req))
@@ -459,39 +457,114 @@ static struct kiocb *__aio_get_req(struct kioctx *ctx)
 	INIT_LIST_HEAD(&req->ki_run_list);
 	req->ki_eventfd = NULL;
 
-	/* Check if the completion queue has enough free space to
-	 * accept an event from this io.
-	 */
+	return req;
+}
+
+/*
+ * struct kiocb's are allocated in batches to reduce the number of
+ * times the ctx lock is acquired and released.
+ */
+#define KIOCB_BATCH_SIZE	32L
+struct kiocb_batch {
+	struct list_head head;
+	long count; /* number of requests left to allocate */
+};
+
+static void kiocb_batch_init(struct kiocb_batch *batch, long total)
+{
+	INIT_LIST_HEAD(&batch->head);
+	batch->count = total;
+}
+
+static void kiocb_batch_free(struct kiocb_batch *batch)
+{
+	struct kiocb *req, *n;
+
+	list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
+		list_del(&req->ki_batch);
+		kmem_cache_free(kiocb_cachep, req);
+	}
+}
+
+/*
+ * Allocate a batch of kiocbs.  This avoids taking and dropping the
+ * context lock a lot during setup.
+ */
+static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch)
+{
+	unsigned short allocated, to_alloc;
+	long avail;
+	bool called_fput = false;
+	struct kiocb *req, *n;
+	struct aio_ring *ring;
+
+	to_alloc = min(batch->count, KIOCB_BATCH_SIZE);
+	for (allocated = 0; allocated < to_alloc; allocated++) {
+		req = __aio_get_req(ctx);
+		if (!req)
+			/* allocation failed, go with what we've got */
+			break;
+		list_add(&req->ki_batch, &batch->head);
+	}
+
+	if (allocated == 0)
+		goto out;
+
+retry:
 	spin_lock_irq(&ctx->ctx_lock);
-	ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0);
-	if (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)) {
+	ring = kmap_atomic(ctx->ring_info.ring_pages[0]);
+
+	avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active;
+	BUG_ON(avail < 0);
+	if (avail == 0 && !called_fput) {
+		/*
+		 * Handle a potential starvation case.  It is possible that
+		 * we hold the last reference on a struct file, causing us
+		 * to delay the final fput to non-irq context.  In this case,
+		 * ctx->reqs_active is artificially high.  Calling the fput
+		 * routine here may free up a slot in the event completion
+		 * ring, allowing this allocation to succeed.
+		 */
+		kunmap_atomic(ring);
+		spin_unlock_irq(&ctx->ctx_lock);
+		aio_fput_routine(NULL);
+		called_fput = true;
+		goto retry;
+	}
+
+	if (avail < allocated) {
+		/* Trim back the number of requests. */
+		list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
+			list_del(&req->ki_batch);
+			kmem_cache_free(kiocb_cachep, req);
+			if (--allocated <= avail)
+				break;
+		}
+	}
+
+	batch->count -= allocated;
+	list_for_each_entry(req, &batch->head, ki_batch) {
 		list_add(&req->ki_list, &ctx->active_reqs);
 		ctx->reqs_active++;
-		okay = 1;
 	}
-	kunmap_atomic(ring, KM_USER0);
-	spin_unlock_irq(&ctx->ctx_lock);
 
-	if (!okay) {
-		kmem_cache_free(kiocb_cachep, req);
-		req = NULL;
-	}
+	kunmap_atomic(ring);
+	spin_unlock_irq(&ctx->ctx_lock);
 
-	return req;
+out:
+	return allocated;
 }
 
-static inline struct kiocb *aio_get_req(struct kioctx *ctx)
+static inline struct kiocb *aio_get_req(struct kioctx *ctx,
+					struct kiocb_batch *batch)
 {
 	struct kiocb *req;
-	/* Handle a potential starvation case -- should be exceedingly rare as 
-	 * requests will be stuck on fput_head only if the aio_fput_routine is 
-	 * delayed and the requests were the last user of the struct file.
-	 */
-	req = __aio_get_req(ctx);
-	if (unlikely(NULL == req)) {
-		aio_fput_routine(NULL);
-		req = __aio_get_req(ctx);
-	}
+
+	if (list_empty(&batch->head))
+		if (kiocb_batch_refill(ctx, batch) == 0)
+			return NULL;
+	req = list_first_entry(&batch->head, struct kiocb, ki_batch);
+	list_del(&req->ki_batch);
 	return req;
 }
 
@@ -1515,7 +1588,8 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
 }
 
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-			 struct iocb *iocb, bool compat)
+			 struct iocb *iocb, struct kiocb_batch *batch,
+			 bool compat)
 {
 	struct kiocb *req;
 	struct file *file;
@@ -1541,7 +1615,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	if (unlikely(!file))
 		return -EBADF;
 
-	req = aio_get_req(ctx);		/* returns with 2 references to req */
+	req = aio_get_req(ctx, batch);  /* returns with 2 references to req */
 	if (unlikely(!req)) {
 		fput(file);
 		return -EAGAIN;
@@ -1621,8 +1695,9 @@ long do_io_submit(aio_context_t ctx_id, long nr,
 {
 	struct kioctx *ctx;
 	long ret = 0;
-	int i;
+	int i = 0;
 	struct blk_plug plug;
+	struct kiocb_batch batch;
 
 	if (unlikely(nr < 0))
 		return -EINVAL;
@@ -1639,6 +1714,8 @@ long do_io_submit(aio_context_t ctx_id, long nr,
 		return -EINVAL;
 	}
 
+	kiocb_batch_init(&batch, nr);
+
 	blk_start_plug(&plug);
 
 	/*
@@ -1659,12 +1736,13 @@ long do_io_submit(aio_context_t ctx_id, long nr,
 			break;
 		}
 
-		ret = io_submit_one(ctx, user_iocb, &tmp, compat);
+		ret = io_submit_one(ctx, user_iocb, &tmp, &batch, compat);
 		if (ret)
 			break;
 	}
 	blk_finish_plug(&plug);
 
+	kiocb_batch_free(&batch);
 	put_ioctx(ctx);
 	return i ? i : ret;
 }
diff --git a/include/linux/aio.h b/include/linux/aio.h
index 2dcb72b..2314ad8 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -117,6 +117,7 @@ struct kiocb {
 
 	struct list_head	ki_list;	/* the aio core uses this
 						 * for cancellation */
+	struct list_head	ki_batch;	/* batch allocation */
 
 	/*
 	 * If the aio_resfd field of the userspace iocb is not zero,
-- 
cgit v0.10.2


From c1e2ee2dc436574880758b3836fc96935b774c32 Mon Sep 17 00:00:00 2001
From: Andrew Bresticker <abrestic@google.com>
Date: Wed, 2 Nov 2011 13:40:29 -0700
Subject: memcg: replace ss->id_lock with a rwlock

While back-porting Johannes Weiner's patch "mm: memcg-aware global
reclaim" for an internal effort, we noticed a significant performance
regression during page-reclaim heavy workloads due to high contention of
the ss->id_lock.  This lock protects idr map, and serializes calls to
idr_get_next() in css_get_next() (which is used during the memcg hierarchy
walk).

Since idr_get_next() is just doing a look up, we need only serialize it
with respect to idr_remove()/idr_get_new().  By making the ss->id_lock a
rwlock, contention is greatly reduced and performance improves.

Tested: cat a 256m file from a ramdisk in a 128m container 50 times on
each core (one file + container per core) in parallel on a NUMA machine.
Result is the time for the test to complete in 1 of the containers.
Both kernels included Johannes' memcg-aware global reclaim patches.

Before rwlock patch: 1710.778s
After rwlock patch: 152.227s

Signed-off-by: Andrew Bresticker <abrestic@google.com>
Cc: Paul Menage <menage@gmail.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index da7e4bc..1b7f9d5 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -516,7 +516,7 @@ struct cgroup_subsys {
 	struct list_head sibling;
 	/* used when use_id == true */
 	struct idr idr;
-	spinlock_t id_lock;
+	rwlock_t id_lock;
 
 	/* should be defined only by modular subsystems */
 	struct module *module;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8386b21..d9d5648 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4883,9 +4883,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
 
 	rcu_assign_pointer(id->css, NULL);
 	rcu_assign_pointer(css->id, NULL);
-	spin_lock(&ss->id_lock);
+	write_lock(&ss->id_lock);
 	idr_remove(&ss->idr, id->id);
-	spin_unlock(&ss->id_lock);
+	write_unlock(&ss->id_lock);
 	kfree_rcu(id, rcu_head);
 }
 EXPORT_SYMBOL_GPL(free_css_id);
@@ -4911,10 +4911,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
 		error = -ENOMEM;
 		goto err_out;
 	}
-	spin_lock(&ss->id_lock);
+	write_lock(&ss->id_lock);
 	/* Don't use 0. allocates an ID of 1-65535 */
 	error = idr_get_new_above(&ss->idr, newid, 1, &myid);
-	spin_unlock(&ss->id_lock);
+	write_unlock(&ss->id_lock);
 
 	/* Returns error when there are no free spaces for new ID.*/
 	if (error) {
@@ -4929,9 +4929,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
 	return newid;
 remove_idr:
 	error = -ENOSPC;
-	spin_lock(&ss->id_lock);
+	write_lock(&ss->id_lock);
 	idr_remove(&ss->idr, myid);
-	spin_unlock(&ss->id_lock);
+	write_unlock(&ss->id_lock);
 err_out:
 	kfree(newid);
 	return ERR_PTR(error);
@@ -4943,7 +4943,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
 {
 	struct css_id *newid;
 
-	spin_lock_init(&ss->id_lock);
+	rwlock_init(&ss->id_lock);
 	idr_init(&ss->idr);
 
 	newid = get_new_cssid(ss, 0);
@@ -5038,9 +5038,9 @@ css_get_next(struct cgroup_subsys *ss, int id,
 		 * scan next entry from bitmap(tree), tmpid is updated after
 		 * idr_get_next().
 		 */
-		spin_lock(&ss->id_lock);
+		read_lock(&ss->id_lock);
 		tmp = idr_get_next(&ss->idr, &tmpid);
-		spin_unlock(&ss->id_lock);
+		read_unlock(&ss->id_lock);
 
 		if (!tmp)
 			break;
-- 
cgit v0.10.2