From f4245bd4ebf903541ba758ad06c118626d8c6f18 Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Tue, 2 Nov 2010 14:07:17 -0400
Subject: ext4: fix lazyinit hang after removing request

When a request has been removed from the list and no other request
has been issued, we end up with the next wakeup scheduled at
MAX_JIFFY_OFFSET, which makes the lazyinit thread hang.  So check for
that.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 40131b7..8d1d942 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2740,7 +2740,8 @@ cont_thread:
 		if (freezing(current))
 			refrigerator();
 
-		if (time_after_eq(jiffies, next_wakeup)) {
+		if ((time_after_eq(jiffies, next_wakeup)) ||
+		    (MAX_JIFFY_OFFSET == next_wakeup)) {
 			cond_resched();
 			continue;
 		}
-- 
cgit v0.10.2


From b2c78cd09b6ef78c8f20190f0b3e6df1d3651b70 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 2 Nov 2010 14:19:30 -0400
Subject: ext4: "ret" may be used uninitialized in ext4_lazyinit_thread()

Newer versions of GCC report the following build warning:

   fs/ext4/super.c: In function 'ext4_lazyinit_thread':
   fs/ext4/super.c:2702: warning: 'ret' may be used uninitialized in this function

Fix it by removing the need for the ret variable in the first place.

Signed-off-by: "Lukas Czerner" <lczerner@redhat.com>
Reported-by: "Stefan Richter" <stefanr@s5r6.in-berlin.de>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8d1d942..4d7ef31 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2699,7 +2699,6 @@ static int ext4_lazyinit_thread(void *arg)
 	struct ext4_li_request *elr;
 	unsigned long next_wakeup;
 	DEFINE_WAIT(wait);
-	int ret;
 
 	BUG_ON(NULL == eli);
 
@@ -2723,13 +2722,12 @@ cont_thread:
 			elr = list_entry(pos, struct ext4_li_request,
 					 lr_request);
 
-			if (time_after_eq(jiffies, elr->lr_next_sched))
-				ret = ext4_run_li_request(elr);
-
-			if (ret) {
-				ret = 0;
-				ext4_remove_li_request(elr);
-				continue;
+			if (time_after_eq(jiffies, elr->lr_next_sched)) {
+				if (ext4_run_li_request(elr) != 0) {
+					/* error, remove the lazy_init job */
+					ext4_remove_li_request(elr);
+					continue;
+				}
 			}
 
 			if (time_before(elr->lr_next_sched, next_wakeup))
-- 
cgit v0.10.2


From ce7e010aef63dc6b37a2354f7c9f5f4aedb37978 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 3 Nov 2010 12:03:21 -0400
Subject: ext4: initialize the percpu counters before replaying the journal

We now initialize the percpu counters before replaying the journal,
but after the journal replay we recalculate the global counters, to
deal with the possibility of the per-blockgroup counts having been
updated by the journal replay.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4d7ef31..04352e9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3347,6 +3347,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
 
+	err = percpu_counter_init(&sbi->s_freeblocks_counter,
+			ext4_count_free_blocks(sb));
+	if (!err) {
+		err = percpu_counter_init(&sbi->s_freeinodes_counter,
+				ext4_count_free_inodes(sb));
+	}
+	if (!err) {
+		err = percpu_counter_init(&sbi->s_dirs_counter,
+				ext4_count_dirs(sb));
+	}
+	if (!err) {
+		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+	}
+	if (err) {
+		ext4_msg(sb, KERN_ERR, "insufficient memory");
+		goto failed_mount3;
+	}
+
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
 	sbi->s_max_writeback_mb_bump = 128;
 
@@ -3445,22 +3463,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
 
-no_journal:
-	err = percpu_counter_init(&sbi->s_freeblocks_counter,
-				  ext4_count_free_blocks(sb));
-	if (!err)
-		err = percpu_counter_init(&sbi->s_freeinodes_counter,
-					  ext4_count_free_inodes(sb));
-	if (!err)
-		err = percpu_counter_init(&sbi->s_dirs_counter,
-					  ext4_count_dirs(sb));
-	if (!err)
-		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
-	if (err) {
-		ext4_msg(sb, KERN_ERR, "insufficient memory");
-		goto failed_mount_wq;
-	}
+	/*
+	 * The journal may have updated the bg summary counts, so we
+	 * need to update the global counters.
+	 */
+	percpu_counter_set(&sbi->s_freeblocks_counter,
+			   ext4_count_free_blocks(sb));
+	percpu_counter_set(&sbi->s_freeinodes_counter,
+			   ext4_count_free_inodes(sb));
+	percpu_counter_set(&sbi->s_dirs_counter,
+			   ext4_count_dirs(sb));
+	percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
 
+no_journal:
 	EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
 	if (!EXT4_SB(sb)->dio_unwritten_wq) {
 		printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
@@ -3610,10 +3625,6 @@ failed_mount_wq:
 		jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
 	}
-	percpu_counter_destroy(&sbi->s_freeblocks_counter);
-	percpu_counter_destroy(&sbi->s_freeinodes_counter);
-	percpu_counter_destroy(&sbi->s_dirs_counter);
-	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 failed_mount3:
 	if (sbi->s_flex_groups) {
 		if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -3621,6 +3632,10 @@ failed_mount3:
 		else
 			kfree(sbi->s_flex_groups);
 	}
+	percpu_counter_destroy(&sbi->s_freeblocks_counter);
+	percpu_counter_destroy(&sbi->s_freeinodes_counter);
+	percpu_counter_destroy(&sbi->s_dirs_counter);
+	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 failed_mount2:
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
@@ -3948,13 +3963,11 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 	else
 		es->s_kbytes_written =
 			cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
-	if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeblocks_counter))
-		ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
-					&EXT4_SB(sb)->s_freeblocks_counter));
-	if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter))
-		es->s_free_inodes_count =
-			cpu_to_le32(percpu_counter_sum_positive(
-					&EXT4_SB(sb)->s_freeinodes_counter));
+	ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
+					   &EXT4_SB(sb)->s_freeblocks_counter));
+	es->s_free_inodes_count =
+		cpu_to_le32(percpu_counter_sum_positive(
+				&EXT4_SB(sb)->s_freeinodes_counter));
 	sb->s_dirt = 0;
 	BUFFER_TRACE(sbh, "marking dirty");
 	mark_buffer_dirty(sbh);
-- 
cgit v0.10.2


From f7ad6d2e9201a6e1c9ee6530a291452eb695feb8 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 8 Nov 2010 13:43:33 -0500
Subject: ext4: handle writeback of inodes which are being freed

The following BUG can occur when an inode is getting freed while it
still has dirty pages outstanding, and it gets deleted (in this case
because it was the target of a rename).  In ordered mode, we need to
make sure the data pages are written just in case we crash before the
rename (or unlink) is committed.  If the inode is being freed, then
when we try to igrab the inode we end up tripping the BUG_ON at
fs/ext4/page-io.c:146.

To solve this problem, we need to keep track of the number of io
callbacks which are pending, and avoid destroying the inode until they
have all been completed.  That way we don't have to bump the inode
count to keep the inode from being destroyed; an approach which
doesn't work because the count could have already been dropped down to
zero before the inode writeback has started (at which point we're not
allowed to bump the count back up to 1, since it's already started
getting freed).

Thanks to Dave Chinner for suggesting this approach, which is also
used by XFS.

  kernel BUG at /scratch_space/linux-2.6/fs/ext4/page-io.c:146!
  Call Trace:
   [<ffffffff811075b1>] ext4_bio_write_page+0x172/0x307
   [<ffffffff811033a7>] mpage_da_submit_io+0x2f9/0x37b
   [<ffffffff811068d7>] mpage_da_map_and_submit+0x2cc/0x2e2
   [<ffffffff811069b3>] mpage_add_bh_to_extent+0xc6/0xd5
   [<ffffffff81106c66>] write_cache_pages_da+0x2a4/0x3ac
   [<ffffffff81107044>] ext4_da_writepages+0x2d6/0x44d
   [<ffffffff81087910>] do_writepages+0x1c/0x25
   [<ffffffff810810a4>] __filemap_fdatawrite_range+0x4b/0x4d
   [<ffffffff810815f5>] filemap_fdatawrite_range+0xe/0x10
   [<ffffffff81122a2e>] jbd2_journal_begin_ordered_truncate+0x7b/0xa2
   [<ffffffff8110615d>] ext4_evict_inode+0x57/0x24c
   [<ffffffff810c14a3>] evict+0x22/0x92
   [<ffffffff810c1a3d>] iput+0x212/0x249
   [<ffffffff810bdf16>] dentry_iput+0xa1/0xb9
   [<ffffffff810bdf6b>] d_kill+0x3d/0x5d
   [<ffffffff810be613>] dput+0x13a/0x147
   [<ffffffff810b990d>] sys_renameat+0x1b5/0x258
   [<ffffffff81145f71>] ? _atomic_dec_and_lock+0x2d/0x4c
   [<ffffffff810b2950>] ? cp_new_stat+0xde/0xea
   [<ffffffff810b29c1>] ? sys_newlstat+0x2d/0x38
   [<ffffffff810b99c6>] sys_rename+0x16/0x18
   [<ffffffff81002a2b>] system_call_fastpath+0x16/0x1b

Reported-by: Nick Bowler <nbowler@elliptictech.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Tested-by: Nick Bowler <nbowler@elliptictech.com>

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8b5dd63..670d134 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -858,6 +858,7 @@ struct ext4_inode_info {
 	spinlock_t i_completed_io_lock;
 	/* current io_end structure for async DIO write*/
 	ext4_io_end_t *cur_aio_dio;
+	atomic_t i_ioend_count;	/* Number of outstanding io_end structs */
 
 	/*
 	 * Transactions that contain inode's metadata needed to complete
@@ -2060,6 +2061,7 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
 extern void ext4_exit_pageio(void);
+extern void ext4_ioend_wait(struct inode *);
 extern void ext4_free_io_end(ext4_io_end_t *io);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
 extern int ext4_end_io_nolock(ext4_io_end_t *io);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 46a7d6a..a24c8cc 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -32,8 +32,14 @@
 
 static struct kmem_cache *io_page_cachep, *io_end_cachep;
 
+#define WQ_HASH_SZ		37
+#define to_ioend_wq(v)	(&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
+static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
+
 int __init ext4_init_pageio(void)
 {
+	int i;
+
 	io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
 	if (io_page_cachep == NULL)
 		return -ENOMEM;
@@ -42,6 +48,8 @@ int __init ext4_init_pageio(void)
 		kmem_cache_destroy(io_page_cachep);
 		return -ENOMEM;
 	}
+	for (i = 0; i < WQ_HASH_SZ; i++)
+		init_waitqueue_head(&ioend_wq[i]);
 
 	return 0;
 }
@@ -52,9 +60,17 @@ void ext4_exit_pageio(void)
 	kmem_cache_destroy(io_page_cachep);
 }
 
+void ext4_ioend_wait(struct inode *inode)
+{
+	wait_queue_head_t *wq = to_ioend_wq(inode);
+
+	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
+}
+
 void ext4_free_io_end(ext4_io_end_t *io)
 {
 	int i;
+	wait_queue_head_t *wq;
 
 	BUG_ON(!io);
 	if (io->page)
@@ -69,7 +85,10 @@ void ext4_free_io_end(ext4_io_end_t *io)
 		}
 	}
 	io->num_io_pages = 0;
-	iput(io->inode);
+	wq = to_ioend_wq(io->inode);
+	if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
+	    waitqueue_active(wq))
+		wake_up_all(wq);
 	kmem_cache_free(io_end_cachep, io);
 }
 
@@ -142,8 +161,8 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
 	io = kmem_cache_alloc(io_end_cachep, flags);
 	if (io) {
 		memset(io, 0, sizeof(*io));
-		io->inode = igrab(inode);
-		BUG_ON(!io->inode);
+		atomic_inc(&EXT4_I(inode)->i_ioend_count);
+		io->inode = inode;
 		INIT_WORK(&io->work, ext4_end_io_work);
 		INIT_LIST_HEAD(&io->list);
 	}
@@ -171,35 +190,15 @@ static void ext4_end_bio(struct bio *bio, int error)
 	struct workqueue_struct *wq;
 	struct inode *inode;
 	unsigned long flags;
-	ext4_fsblk_t err_block;
 	int i;
 
 	BUG_ON(!io_end);
-	inode = io_end->inode;
 	bio->bi_private = NULL;
 	bio->bi_end_io = NULL;
 	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
 		error = 0;
-	err_block = bio->bi_sector >> (inode->i_blkbits - 9);
 	bio_put(bio);
 
-	if (!(inode->i_sb->s_flags & MS_ACTIVE)) {
-		pr_err("sb umounted, discard end_io request for inode %lu\n",
-			io_end->inode->i_ino);
-		ext4_free_io_end(io_end);
-		return;
-	}
-
-	if (error) {
-		io_end->flag |= EXT4_IO_END_ERROR;
-		ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
-			     "(offset %llu size %ld starting block %llu)",
-			     inode->i_ino,
-			     (unsigned long long) io_end->offset,
-			     (long) io_end->size,
-			     (unsigned long long) err_block);
-	}
-
 	for (i = 0; i < io_end->num_io_pages; i++) {
 		struct page *page = io_end->pages[i]->p_page;
 		struct buffer_head *bh, *head;
@@ -254,8 +253,19 @@ static void ext4_end_bio(struct bio *bio, int error)
 		if (!partial_write)
 			SetPageUptodate(page);
 	}
-
 	io_end->num_io_pages = 0;
+	inode = io_end->inode;
+
+	if (error) {
+		io_end->flag |= EXT4_IO_END_ERROR;
+		ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
+			     "(offset %llu size %ld starting block %llu)",
+			     inode->i_ino,
+			     (unsigned long long) io_end->offset,
+			     (long) io_end->size,
+			     (unsigned long long)
+			     bio->bi_sector >> (inode->i_blkbits - 9));
+	}
 
 	/* Add the io_end to per-inode completed io list*/
 	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
@@ -305,7 +315,6 @@ static int io_submit_init(struct ext4_io_submit *io,
 	bio->bi_private = io->io_end = io_end;
 	bio->bi_end_io = ext4_end_bio;
 
-	io_end->inode = inode;
 	io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
 
 	io->io_bio = bio;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 04352e9..45653af 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -828,12 +828,14 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ei->cur_aio_dio = NULL;
 	ei->i_sync_tid = 0;
 	ei->i_datasync_tid = 0;
+	atomic_set(&ei->i_ioend_count, 0);
 
 	return &ei->vfs_inode;
 }
 
 static void ext4_destroy_inode(struct inode *inode)
 {
+	ext4_ioend_wait(inode);
 	if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
 		ext4_msg(inode->i_sb, KERN_ERR,
 			 "Inode %lu (%p): orphan list check failed!",
-- 
cgit v0.10.2


From 83668e7141c7a0aa4035bde94344b81f9cf966ab Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 8 Nov 2010 13:45:33 -0500
Subject: ext4: fix potential race when freeing ext4_io_page structures

Use an atomic_t and make sure we don't free the structure while we
might still be submitting I/O for that page.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 670d134..6a5edea 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -177,7 +177,7 @@ struct mpage_da_data {
 
 struct ext4_io_page {
 	struct page	*p_page;
-	int		p_count;
+	atomic_t	p_count;
 };
 
 #define MAX_IO_PAGES 128
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index a24c8cc..7f5451c 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -67,6 +67,15 @@ void ext4_ioend_wait(struct inode *inode)
 	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
 }
 
+static void put_io_page(struct ext4_io_page *io_page)
+{
+	if (atomic_dec_and_test(&io_page->p_count)) {
+		end_page_writeback(io_page->p_page);
+		put_page(io_page->p_page);
+		kmem_cache_free(io_page_cachep, io_page);
+	}
+}
+
 void ext4_free_io_end(ext4_io_end_t *io)
 {
 	int i;
@@ -75,15 +84,8 @@ void ext4_free_io_end(ext4_io_end_t *io)
 	BUG_ON(!io);
 	if (io->page)
 		put_page(io->page);
-	for (i = 0; i < io->num_io_pages; i++) {
-		if (--io->pages[i]->p_count == 0) {
-			struct page *page = io->pages[i]->p_page;
-
-			end_page_writeback(page);
-			put_page(page);
-			kmem_cache_free(io_page_cachep, io->pages[i]);
-		}
-	}
+	for (i = 0; i < io->num_io_pages; i++)
+		put_io_page(io->pages[i]);
 	io->num_io_pages = 0;
 	wq = to_ioend_wq(io->inode);
 	if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
@@ -235,13 +237,7 @@ static void ext4_end_bio(struct bio *bio, int error)
 			} while (bh != head);
 		}
 
-		if (--io_end->pages[i]->p_count == 0) {
-			struct page *page = io_end->pages[i]->p_page;
-
-			end_page_writeback(page);
-			put_page(page);
-			kmem_cache_free(io_page_cachep, io_end->pages[i]);
-		}
+		put_io_page(io_end->pages[i]);
 
 		/*
 		 * If this is a partial write which happened to make
@@ -369,7 +365,7 @@ submit_and_retry:
 	if ((io_end->num_io_pages == 0) ||
 	    (io_end->pages[io_end->num_io_pages-1] != io_page)) {
 		io_end->pages[io_end->num_io_pages++] = io_page;
-		io_page->p_count++;
+		atomic_inc(&io_page->p_count);
 	}
 	return 0;
 }
@@ -398,7 +394,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 		return -ENOMEM;
 	}
 	io_page->p_page = page;
-	io_page->p_count = 0;
+	atomic_set(&io_page->p_count, 1);
 	get_page(page);
 
 	for (bh = head = page_buffers(page), block_start = 0;
@@ -430,10 +426,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 	 * PageWriteback bit from the page to prevent the system from
 	 * wedging later on.
 	 */
-	if (io_page->p_count == 0) {
-		put_page(page);
-		end_page_writeback(page);
-		kmem_cache_free(io_page_cachep, io_page);
-	}
+	put_io_page(io_page);
 	return ret;
 }
-- 
cgit v0.10.2


From 87009d86dc045d228e21242467a67a5f99347553 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Mon, 8 Nov 2010 13:47:33 -0500
Subject: ext4: do not try to grab the s_umount semaphore in ext4_quota_off

Taking s_umount is not needed in order to sync the filesystem here (the
caller already holds it), and not taking it fixes a lockdep complaint.

Signed-off-by: Dmitry Monakhov <dmonakhov@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 45653af..ee91e29d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4570,12 +4570,10 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 
 static int ext4_quota_off(struct super_block *sb, int type)
 {
-	/* Force all delayed allocation blocks to be allocated */
-	if (test_opt(sb, DELALLOC)) {
-		down_read(&sb->s_umount);
+	/* Force all delayed allocation blocks to be allocated.
+	 * Caller already holds s_umount sem */
+	if (test_opt(sb, DELALLOC))
 		sync_filesystem(sb);
-		up_read(&sb->s_umount);
-	}
 
 	return dquot_quota_off(sb, type);
 }
-- 
cgit v0.10.2


From b56ff9d397cecdaad6c98c9d57cc6fea475e1f50 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 8 Nov 2010 13:49:33 -0500
Subject: ext4: Don't call sb_issue_discard() in ext4_free_blocks()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 5c521830cf (ext4: Support discard requests when running in
no-journal mode) attempts to add sb_issue_discard() for data blocks
(in data=writeback mode) and in no-journal mode.  Unfortunately, this
no longer works, because in commit dd3932eddf (block: remove
BLKDEV_IFL_WAIT), sb_issue_discard() only presents a synchronous
interface, and there are times when we call ext4_free_blocks() while we
are holding a spinlock, or are otherwise in an atomic context.

For now, I've removed the call to sb_issue_discard() to prevent a
deadlock or (if spinlock debugging is enabled) failures like this:

BUG: scheduling while atomic: rc.sysinit/1376/0x00000002
Pid: 1376, comm: rc.sysinit Not tainted 2.6.36-ARCH #1
Call Trace:
[<ffffffff810397ce>] __schedule_bug+0x5e/0x70
[<ffffffff81403110>] schedule+0x950/0xa70
[<ffffffff81060bad>] ? insert_work+0x7d/0x90
[<ffffffff81060fbd>] ? queue_work_on+0x1d/0x30
[<ffffffff81061127>] ? queue_work+0x37/0x60
[<ffffffff8140377d>] schedule_timeout+0x21d/0x360
[<ffffffff812031c3>] ? generic_make_request+0x2c3/0x540
[<ffffffff81402680>] wait_for_common+0xc0/0x150
[<ffffffff81041490>] ? default_wake_function+0x0/0x10
[<ffffffff812034bc>] ? submit_bio+0x7c/0x100
[<ffffffff810680a0>] ? wake_bit_function+0x0/0x40
[<ffffffff814027b8>] wait_for_completion+0x18/0x20
[<ffffffff8120a969>] blkdev_issue_discard+0x1b9/0x210
[<ffffffff811ba03e>] ext4_free_blocks+0x68e/0xb60
[<ffffffff811b1650>] ? __ext4_handle_dirty_metadata+0x110/0x120
[<ffffffff811b098c>] ext4_ext_truncate+0x8cc/0xa70
[<ffffffff810d713e>] ? pagevec_lookup+0x1e/0x30
[<ffffffff81191618>] ext4_truncate+0x178/0x5d0
[<ffffffff810eacbb>] ? unmap_mapping_range+0xab/0x280
[<ffffffff810d8976>] vmtruncate+0x56/0x70
[<ffffffff811925cb>] ext4_setattr+0x14b/0x460
[<ffffffff811319e4>] notify_change+0x194/0x380
[<ffffffff81117f80>] do_truncate+0x60/0x90
[<ffffffff811e08fa>] ? security_inode_permission+0x1a/0x20
[<ffffffff811eaec1>] ? tomoyo_path_truncate+0x11/0x20
[<ffffffff81127539>] do_last+0x5d9/0x770
[<ffffffff811278bd>] do_filp_open+0x1ed/0x680
[<ffffffff8140644f>] ? page_fault+0x1f/0x30
[<ffffffff81132bfc>] ? alloc_fd+0xec/0x140
[<ffffffff81118db1>] do_sys_open+0x61/0x120
[<ffffffff81118e8b>] sys_open+0x1b/0x20
[<ffffffff81002e6b>] system_call_fastpath+0x16/0x1b

https://bugzilla.kernel.org/show_bug.cgi?id=22302

Reported-by: Mathias Burén <mathias.buren@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: jiayingz@google.com

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c58eba34..5b4d4e3 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4640,8 +4640,6 @@ do_more:
 		 * with group lock held. generate_buddy look at
 		 * them with group lock_held
 		 */
-		if (test_opt(sb, DISCARD))
-			ext4_issue_discard(sb, block_group, bit, count);
 		ext4_lock_group(sb, block_group);
 		mb_clear_bits(bitmap_bh->b_data, bit, count);
 		mb_free_blocks(inode, &e4b, bit, count);
-- 
cgit v0.10.2


From 7ff9c073dd4d7200399076554f7ab9b876f196f6 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 8 Nov 2010 13:51:33 -0500
Subject: ext4: Add new ext4 inode tracepoints

Add the ext4_evict_inode, ext4_drop_inode, ext4_mark_inode_dirty, and
ext4_begin_ordered_truncate tracepoints.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1916164..846e1e9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -53,6 +53,7 @@
 static inline int ext4_begin_ordered_truncate(struct inode *inode,
 					      loff_t new_size)
 {
+	trace_ext4_begin_ordered_truncate(inode, new_size);
 	return jbd2_journal_begin_ordered_truncate(
 					EXT4_SB(inode->i_sb)->s_journal,
 					&EXT4_I(inode)->jinode,
@@ -178,6 +179,7 @@ void ext4_evict_inode(struct inode *inode)
 	handle_t *handle;
 	int err;
 
+	trace_ext4_evict_inode(inode);
 	if (inode->i_nlink) {
 		truncate_inode_pages(&inode->i_data, 0);
 		goto no_delete;
@@ -5649,6 +5651,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
 	int err, ret;
 
 	might_sleep();
+	trace_ext4_mark_inode_dirty(inode, _RET_IP_);
 	err = ext4_reserve_inode_write(handle, inode, &iloc);
 	if (ext4_handle_valid(handle) &&
 	    EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ee91e29d..61182fe 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -833,6 +833,14 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	return &ei->vfs_inode;
 }
 
+static int ext4_drop_inode(struct inode *inode)
+{
+	int drop = generic_drop_inode(inode);
+
+	trace_ext4_drop_inode(inode, drop);
+	return drop;
+}
+
 static void ext4_destroy_inode(struct inode *inode)
 {
 	ext4_ioend_wait(inode);
@@ -1175,6 +1183,7 @@ static const struct super_operations ext4_sops = {
 	.destroy_inode	= ext4_destroy_inode,
 	.write_inode	= ext4_write_inode,
 	.dirty_inode	= ext4_dirty_inode,
+	.drop_inode	= ext4_drop_inode,
 	.evict_inode	= ext4_evict_inode,
 	.put_super	= ext4_put_super,
 	.sync_fs	= ext4_sync_fs,
@@ -1196,6 +1205,7 @@ static const struct super_operations ext4_nojournal_sops = {
 	.destroy_inode	= ext4_destroy_inode,
 	.write_inode	= ext4_write_inode,
 	.dirty_inode	= ext4_dirty_inode,
+	.drop_inode	= ext4_drop_inode,
 	.evict_inode	= ext4_evict_inode,
 	.write_super	= ext4_write_super,
 	.put_super	= ext4_put_super,
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 289010d..e5e345f 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -98,6 +98,103 @@ TRACE_EVENT(ext4_allocate_inode,
 		  (unsigned long) __entry->dir, __entry->mode)
 );
 
+TRACE_EVENT(ext4_evict_inode,
+	TP_PROTO(struct inode *inode),
+
+	TP_ARGS(inode),
+
+	TP_STRUCT__entry(
+		__field(	int,   dev_major                )
+		__field(	int,   dev_minor                )
+		__field(	ino_t,	ino			)
+		__field(	int,	nlink			)
+	),
+
+	TP_fast_assign(
+		__entry->dev_major = MAJOR(inode->i_sb->s_dev);
+		__entry->dev_minor = MINOR(inode->i_sb->s_dev);
+		__entry->ino	= inode->i_ino;
+		__entry->nlink	= inode->i_nlink;
+	),
+
+	TP_printk("dev %d,%d ino %lu nlink %d",
+		  __entry->dev_major, __entry->dev_minor,
+		  (unsigned long) __entry->ino, __entry->nlink)
+);
+
+TRACE_EVENT(ext4_drop_inode,
+	TP_PROTO(struct inode *inode, int drop),
+
+	TP_ARGS(inode, drop),
+
+	TP_STRUCT__entry(
+		__field(	int,	dev_major		)
+		__field(	int,	dev_minor		)
+		__field(	ino_t,	ino			)
+		__field(	int,	drop			)
+	),
+
+	TP_fast_assign(
+		__entry->dev_major = MAJOR(inode->i_sb->s_dev);
+		__entry->dev_minor = MINOR(inode->i_sb->s_dev);
+		__entry->ino	= inode->i_ino;
+		__entry->drop	= drop;
+	),
+
+	TP_printk("dev %d,%d ino %lu drop %d",
+		  __entry->dev_major, __entry->dev_minor,
+		  (unsigned long) __entry->ino, __entry->drop)
+);
+
+TRACE_EVENT(ext4_mark_inode_dirty,
+	TP_PROTO(struct inode *inode, unsigned long IP),
+
+	TP_ARGS(inode, IP),
+
+	TP_STRUCT__entry(
+		__field(	int,	dev_major		)
+		__field(	int,	dev_minor		)
+		__field(	ino_t,	ino			)
+		__field(unsigned long,	ip			)
+	),
+
+	TP_fast_assign(
+		__entry->dev_major = MAJOR(inode->i_sb->s_dev);
+		__entry->dev_minor = MINOR(inode->i_sb->s_dev);
+		__entry->ino	= inode->i_ino;
+		__entry->ip	= IP;
+	),
+
+	TP_printk("dev %d,%d ino %lu caller %pF",
+		  __entry->dev_major, __entry->dev_minor,
+		  (unsigned long) __entry->ino, (void *)__entry->ip)
+);
+
+TRACE_EVENT(ext4_begin_ordered_truncate,
+	TP_PROTO(struct inode *inode, loff_t new_size),
+
+	TP_ARGS(inode, new_size),
+
+	TP_STRUCT__entry(
+		__field(	int,	dev_major		)
+		__field(	int,	dev_minor		)
+		__field(	ino_t,	ino			)
+		__field(	loff_t,	new_size		)
+	),
+
+	TP_fast_assign(
+		__entry->dev_major	= MAJOR(inode->i_sb->s_dev);
+		__entry->dev_minor	= MINOR(inode->i_sb->s_dev);
+		__entry->ino		= inode->i_ino;
+		__entry->new_size	= new_size;
+	),
+
+	TP_printk("dev %d,%d ino %lu new_size %lld",
+		  __entry->dev_major, __entry->dev_minor,
+		  (unsigned long) __entry->ino,
+		  (long long) __entry->new_size)
+);
+
 DECLARE_EVENT_CLASS(ext4__write_begin,
 
 	TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
-- 
cgit v0.10.2