From ce7231e92dac381f6e4f9cfdfdf9e0ea055223ad Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Tue, 13 May 2008 13:45:14 -0700
Subject: [PATCH 1/2] ocfs2: Add CONFIG_OCFS2_FS_STATS config option

This patch adds the config option CONFIG_OCFS2_FS_STATS to allow building
the fs with instrumentation enabled. An upcoming patch will provide
support for instrumenting cluster locking, which is a significant source
of overhead in a cluster file system. Disabling this config option lets
users avoid the CPU and memory overhead involved in gathering such
statistics.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>

diff --git a/fs/Kconfig b/fs/Kconfig
index 2694648..9c07a4f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -470,6 +470,14 @@ config OCFS2_FS_USERSPACE_CLUSTER
 	  It is safe to say Y, as the clustering method is run-time
 	  selectable.
 
+config OCFS2_FS_STATS
+	bool "OCFS2 statistics"
+	depends on OCFS2_FS
+	default y
+	help
+	  This option allows some fs statistics to be captured. Enabling
+	  this option may increase the memory consumption.
+
 config OCFS2_DEBUG_MASKLOG
 	bool "OCFS2 logging support"
 	depends on OCFS2_FS
-- 
cgit v0.10.2


From 8ddb7b004dfa1832a750e199df8bff4b75b73565 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Tue, 13 May 2008 13:45:15 -0700
Subject: [PATCH 2/2] ocfs2: Instrument fs cluster locks

This patch adds code to track the number of times the fs takes
various cluster locks, as well as the time spent waiting for them.
The information is made available to users via debugfs.

This patch was originally written by Jan Kara <jack@suse.cz>.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 80e20d9..80537b7 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -31,6 +31,7 @@
 #include <linux/pagemap.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
+#include <linux/time.h>
 
 #define MLOG_MASK_PREFIX ML_DLM_GLUE
 #include <cluster/masklog.h>
@@ -59,6 +60,9 @@ struct ocfs2_mask_waiter {
 	struct completion	mw_complete;
 	unsigned long		mw_mask;
 	unsigned long		mw_goal;
+#ifdef CONFIG_OCFS2_FS_STATS
+	unsigned long long 	mw_lock_start;
+#endif
 };
 
 static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
@@ -366,6 +370,75 @@ static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
 	spin_unlock(&ocfs2_dlm_tracking_lock);
 }
 
+#ifdef CONFIG_OCFS2_FS_STATS
+static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
+{
+	res->l_lock_num_prmode = 0;
+	res->l_lock_num_prmode_failed = 0;
+	res->l_lock_total_prmode = 0;
+	res->l_lock_max_prmode = 0;
+	res->l_lock_num_exmode = 0;
+	res->l_lock_num_exmode_failed = 0;
+	res->l_lock_total_exmode = 0;
+	res->l_lock_max_exmode = 0;
+	res->l_lock_refresh = 0;
+}
+
+static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level,
+				    struct ocfs2_mask_waiter *mw, int ret)
+{
+	unsigned long long *num, *sum;
+	unsigned int *max, *failed;
+	struct timespec ts = current_kernel_time();
+	unsigned long long time = timespec_to_ns(&ts) - mw->mw_lock_start;
+
+	if (level == LKM_PRMODE) {
+		num = &res->l_lock_num_prmode;
+		sum = &res->l_lock_total_prmode;
+		max = &res->l_lock_max_prmode;
+		failed = &res->l_lock_num_prmode_failed;
+	} else if (level == LKM_EXMODE) {
+		num = &res->l_lock_num_exmode;
+		sum = &res->l_lock_total_exmode;
+		max = &res->l_lock_max_exmode;
+		failed = &res->l_lock_num_exmode_failed;
+	} else
+		return;
+
+	(*num)++;
+	(*sum) += time;
+	if (time > *max)
+		*max = time;
+	if (ret)
+		(*failed)++;
+}
+
+static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
+{
+	lockres->l_lock_refresh++;
+}
+
+static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
+{
+	struct timespec ts = current_kernel_time();
+	mw->mw_lock_start = timespec_to_ns(&ts);
+}
+#else
+static inline void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
+{
+}
+static inline void ocfs2_update_lock_stats(struct ocfs2_lock_res *res,
+			   int level, struct ocfs2_mask_waiter *mw, int ret)
+{
+}
+static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
+{
+}
+static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
+{
+}
+#endif
+
 static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
 				       struct ocfs2_lock_res *res,
 				       enum ocfs2_lock_type type,
@@ -385,6 +458,8 @@ static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
 	res->l_flags         = OCFS2_LOCK_INITIALIZED;
 
 	ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);
+
+	ocfs2_init_lock_stats(res);
 }
 
 void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
@@ -1048,6 +1123,7 @@ static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw)
 {
 	INIT_LIST_HEAD(&mw->mw_item);
 	init_completion(&mw->mw_complete);
+	ocfs2_init_start_time(mw);
 }
 
 static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
@@ -1254,6 +1330,7 @@ out:
 			goto again;
 		mlog_errno(ret);
 	}
+	ocfs2_update_lock_stats(lockres, level, &mw, ret);
 
 	mlog_exit(ret);
 	return ret;
@@ -1983,6 +2060,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
 				le32_to_cpu(fe->i_flags));
 
 		ocfs2_refresh_inode(inode, fe);
+		ocfs2_track_lock_refresh(lockres);
 	}
 
 	status = 0;
@@ -2267,6 +2345,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
 
 		if (status < 0)
 			mlog_errno(status);
+		ocfs2_track_lock_refresh(lockres);
 	}
 bail:
 	mlog_exit(status);
@@ -2461,7 +2540,7 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
 }
 
 /* So that debugfs.ocfs2 can determine which format is being used */
-#define OCFS2_DLM_DEBUG_STR_VERSION 1
+#define OCFS2_DLM_DEBUG_STR_VERSION 2
 static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
 {
 	int i;
@@ -2502,6 +2581,47 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
 	for(i = 0; i < DLM_LVB_LEN; i++)
 		seq_printf(m, "0x%x\t", lvb[i]);
 
+#ifdef CONFIG_OCFS2_FS_STATS
+# define lock_num_prmode(_l)		(_l)->l_lock_num_prmode
+# define lock_num_exmode(_l)		(_l)->l_lock_num_exmode
+# define lock_num_prmode_failed(_l)	(_l)->l_lock_num_prmode_failed
+# define lock_num_exmode_failed(_l)	(_l)->l_lock_num_exmode_failed
+# define lock_total_prmode(_l)		(_l)->l_lock_total_prmode
+# define lock_total_exmode(_l)		(_l)->l_lock_total_exmode
+# define lock_max_prmode(_l)		(_l)->l_lock_max_prmode
+# define lock_max_exmode(_l)		(_l)->l_lock_max_exmode
+# define lock_refresh(_l)		(_l)->l_lock_refresh
+#else
+# define lock_num_prmode(_l)		(0)
+# define lock_num_exmode(_l)		(0)
+# define lock_num_prmode_failed(_l)	(0)
+# define lock_num_exmode_failed(_l)	(0)
+# define lock_total_prmode(_l)		(0)
+# define lock_total_exmode(_l)		(0)
+# define lock_max_prmode(_l)		(0)
+# define lock_max_exmode(_l)		(0)
+# define lock_refresh(_l)		(0)
+#endif
+	/* The following seq_print was added in version 2 of this output */
+	seq_printf(m, "%llu\t"
+		   "%llu\t"
+		   "%u\t"
+		   "%u\t"
+		   "%llu\t"
+		   "%llu\t"
+		   "%u\t"
+		   "%u\t"
+		   "%u\t",
+		   lock_num_prmode(lockres),
+		   lock_num_exmode(lockres),
+		   lock_num_prmode_failed(lockres),
+		   lock_num_exmode_failed(lockres),
+		   lock_total_prmode(lockres),
+		   lock_total_exmode(lockres),
+		   lock_max_prmode(lockres),
+		   lock_max_exmode(lockres),
+		   lock_refresh(lockres));
+
 	/* End the line */
 	seq_printf(m, "\n");
 	return 0;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 3169237..1cb814b 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -132,6 +132,18 @@ struct ocfs2_lock_res {
 	wait_queue_head_t        l_event;
 
 	struct list_head         l_debug_list;
+
+#ifdef CONFIG_OCFS2_FS_STATS
+	unsigned long long	 l_lock_num_prmode; 	   /* PR acquires */
+	unsigned long long 	 l_lock_num_exmode; 	   /* EX acquires */
+	unsigned int		 l_lock_num_prmode_failed; /* Failed PR gets */
+	unsigned int		 l_lock_num_exmode_failed; /* Failed EX gets */
+	unsigned long long	 l_lock_total_prmode; 	   /* Tot wait for PR */
+	unsigned long long	 l_lock_total_exmode; 	   /* Tot wait for EX */
+	unsigned int		 l_lock_max_prmode; 	   /* Max wait for PR */
+	unsigned int		 l_lock_max_exmode; 	   /* Max wait for EX */
+	unsigned int		 l_lock_refresh;	   /* Disk refreshes */
+#endif
 };
 
 struct ocfs2_dlm_debug {
-- 
cgit v0.10.2


From dd25e55ea133b14678cfaa9e205b082b24b26dbc Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 28 May 2008 14:41:00 -0700
Subject: ocfs2: fix printk format warnings with OCFS2_FS_STATS=n

Fix printk format warnings when OCFS2_FS_STATS=n:

linux-next-20080528/fs/ocfs2/dlmglue.c: In function 'ocfs2_dlm_seq_show':
linux-next-20080528/fs/ocfs2/dlmglue.c:2623: warning: format '%llu' expects type 'long long unsigned int', but argument 3 has type 'int'
linux-next-20080528/fs/ocfs2/dlmglue.c:2623: warning: format '%llu' expects type 'long long unsigned int', but argument 4 has type 'int'
linux-next-20080528/fs/ocfs2/dlmglue.c:2623: warning: format '%llu' expects type 'long long unsigned int', but argument 7 has type 'int'
linux-next-20080528/fs/ocfs2/dlmglue.c:2623: warning: format '%llu' expects type 'long long unsigned int', but argument 8 has type 'int'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 80537b7..eae3d64 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2592,12 +2592,12 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
 # define lock_max_exmode(_l)		(_l)->l_lock_max_exmode
 # define lock_refresh(_l)		(_l)->l_lock_refresh
 #else
-# define lock_num_prmode(_l)		(0)
-# define lock_num_exmode(_l)		(0)
+# define lock_num_prmode(_l)		(0ULL)
+# define lock_num_exmode(_l)		(0ULL)
 # define lock_num_prmode_failed(_l)	(0)
 # define lock_num_exmode_failed(_l)	(0)
-# define lock_total_prmode(_l)		(0)
-# define lock_total_exmode(_l)		(0)
+# define lock_total_prmode(_l)		(0ULL)
+# define lock_total_exmode(_l)		(0ULL)
 # define lock_max_prmode(_l)		(0)
 # define lock_max_exmode(_l)		(0)
 # define lock_refresh(_l)		(0)
-- 
cgit v0.10.2


From 7600c72b75bab374ad39b2a4799a0728579a8e2f Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Mon, 9 Jun 2008 16:34:23 -0700
Subject: ocfs2: use simple_read_from_buffer()

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Acked-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>

diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index c021280..24e0b19 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -549,26 +549,17 @@ static ssize_t ocfs2_control_read(struct file *file,
 				  size_t count,
 				  loff_t *ppos)
 {
-	char *proto_string = OCFS2_CONTROL_PROTO;
-	size_t to_write = 0;
-
-	if (*ppos >= OCFS2_CONTROL_PROTO_LEN)
-		return 0;
-
-	to_write = OCFS2_CONTROL_PROTO_LEN - *ppos;
-	if (to_write > count)
-		to_write = count;
-	if (copy_to_user(buf, proto_string + *ppos, to_write))
-		return -EFAULT;
+	ssize_t ret;
 
-	*ppos += to_write;
+	ret = simple_read_from_buffer(buf, count, ppos,
+			OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN);
 
 	/* Have we read the whole protocol list? */
-	if (*ppos >= OCFS2_CONTROL_PROTO_LEN)
+	if (ret > 0 && *ppos >= OCFS2_CONTROL_PROTO_LEN)
 		ocfs2_control_set_handshake_state(file,
 						  OCFS2_CONTROL_HANDSHAKE_READ);
 
-	return to_write;
+	return ret;
 }
 
 static int ocfs2_control_release(struct inode *inode, struct file *file)
-- 
cgit v0.10.2


From 56753bd3b9220f6f2477eb1cf97f40c24e0a4c91 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Mon, 9 Jun 2008 11:24:41 -0700
Subject: ocfs2: Silence an error message in ocfs2_file_aio_read()

This patch silences an EINVAL error message in ocfs2_file_aio_read()
that is always due to a user error.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 57e0d30..e8514e8 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2202,7 +2202,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
 
 	ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
 	if (ret == -EINVAL)
-		mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
+		mlog(0, "generic_file_aio_read returned -EINVAL\n");
 
 	/* buffered aio wouldn't have proper lock coverage today */
 	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
-- 
cgit v0.10.2


From 01af482037d32c215aab208a0b110ffe6fd782c0 Mon Sep 17 00:00:00 2001
From: Wengang Wang <wen.gang.wang@oracle.com>
Date: Tue, 10 Jun 2008 14:24:48 +0800
Subject: ocfs2: Handle error during journal load

This patch ensures the mount fails if the fs is unable to load the journal.

Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
Acked-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>

diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index df63ba2..ccecfe5 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1703,7 +1703,11 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
 	local = ocfs2_mount_local(osb);
 
 	/* will play back anything left in the journal. */
-	ocfs2_journal_load(osb->journal, local);
+	status = ocfs2_journal_load(osb->journal, local);
+	if (status < 0) {
+		mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status);
+		goto finally;
+	}
 
 	if (dirty) {
 		/* recover my local alloc if we didn't unmount cleanly. */
-- 
cgit v0.10.2


From 461c6a30eca6f25add1dadb9fd8a1d8e89a6e627 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Mon, 19 May 2008 16:23:37 -0700
Subject: ocfs2/net: Silence build warnings on sparc64

suseconds_t is type long on most arches except sparc64 where it is type int.
This patch silences the following warnings that are generated when building
on it.

netdebug.c: In function 'nst_seq_show':
netdebug.c:152: warning: format '%lu' expects type 'long unsigned int', but argument 13 has type 'suseconds_t'
netdebug.c:152: warning: format '%lu' expects type 'long unsigned int', but argument 15 has type 'suseconds_t'
netdebug.c:152: warning: format '%lu' expects type 'long unsigned int', but argument 17 has type 'suseconds_t'
netdebug.c: In function 'sc_seq_show':
netdebug.c:332: warning: format '%lu' expects type 'long unsigned int', but argument 19 has type 'suseconds_t'
netdebug.c:332: warning: format '%lu' expects type 'long unsigned int', but argument 21 has type 'suseconds_t'
netdebug.c:332: warning: format '%lu' expects type 'long unsigned int', but argument 23 has type 'suseconds_t'
netdebug.c:332: warning: format '%lu' expects type 'long unsigned int', but argument 25 has type 'suseconds_t'
netdebug.c:332: warning: format '%lu' expects type 'long unsigned int', but argument 27 has type 'suseconds_t'
netdebug.c:332: warning: format '%lu' expects type 'long unsigned int', but argument 29 has type 'suseconds_t'

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>

diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 7bf3c0e..d8bfa0e 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -146,8 +146,10 @@ static int nst_seq_show(struct seq_file *seq, void *v)
 			   nst->st_task->comm, nst->st_node,
 			   nst->st_sc, nst->st_id, nst->st_msg_type,
 			   nst->st_msg_key,
-			   nst->st_sock_time.tv_sec, nst->st_sock_time.tv_usec,
-			   nst->st_send_time.tv_sec, nst->st_send_time.tv_usec,
+			   nst->st_sock_time.tv_sec,
+			   (unsigned long)nst->st_sock_time.tv_usec,
+			   nst->st_send_time.tv_sec,
+			   (unsigned long)nst->st_send_time.tv_usec,
 			   nst->st_status_time.tv_sec,
 			   nst->st_status_time.tv_usec);
 	}
@@ -274,7 +276,7 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	return sc; /* unused, just needs to be null when done */
 }
 
-#define TV_SEC_USEC(TV) TV.tv_sec, TV.tv_usec
+#define TV_SEC_USEC(TV) TV.tv_sec, (unsigned long)TV.tv_usec
 
 static int sc_seq_show(struct seq_file *seq, void *v)
 {
-- 
cgit v0.10.2


From e407e39783a7206d20b3e9961aedf272de966e31 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 12 Jun 2008 22:35:39 -0700
Subject: ocfs2: Fix CONFIG_OCFS2_DEBUG_FS #ifdefs

A couple places use OCFS2_DEBUG_FS where they really mean
CONFIG_OCFS2_DEBUG_FS.

Reported-by: Robert P. J. Day <rpjday@crashcourse.ca>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 9698338..a8c19cb 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -329,7 +329,7 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
 
 	mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
 
-#ifdef OCFS2_DEBUG_FS
+#ifdef CONFIG_OCFS2_DEBUG_FS
 	status = 1;
 #else
 	status = journal_extend(handle, nblocks);
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index be774bd..28e492e 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -498,7 +498,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 
 	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
 
-#ifdef OCFS2_DEBUG_FS
+#ifdef CONFIG_OCFS2_DEBUG_FS
 	if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
 	    ocfs2_local_alloc_count_bits(alloc)) {
 		ocfs2_error(osb->sb, "local alloc inode %llu says it has "
-- 
cgit v0.10.2


From fe9f387740ac7cb3b7c2fffa76807e997e6c6292 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 12 Jun 2008 22:39:18 -0700
Subject: ocfs2: Don't snprintf() without a format.

Some system files are per-slot.  Their names include the slot number.
ocfs2_sprintf_system_inode_name() uses the system inode definitions to
fill in the slot number with snprintf().

For global system files, there is no slot number, and the name was
passed as the format string with no arguments.  -Wformat-nonliteral and
-Wformat-security don't like this.  Instead, use a static "%s" format
and the name as the argument.

Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 52c4266..3f19451 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -901,7 +901,7 @@ static inline int ocfs2_sprintf_system_inode_name(char *buf, int len,
          * list has a copy per slot.
          */
 	if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)
-		chars = snprintf(buf, len,
+		chars = snprintf(buf, len, "%s",
 				 ocfs2_system_inodes[type].si_name);
 	else
 		chars = snprintf(buf, len,
-- 
cgit v0.10.2


From 6f61076406251626be39651d114fac412b1e0c39 Mon Sep 17 00:00:00 2001
From: Louis Rilling <Louis.Rilling@kerlabs.com>
Date: Mon, 16 Jun 2008 19:00:58 +0200
Subject: configfs: Introduce configfs_dirent_lock

This patch introduces configfs_dirent_lock spinlock to protect configfs_dirent
traversals against linkage mutations (add/del/move). This will allow
configfs_detach_prep() to avoid locking i_mutexes.

Locking rules for configfs_dirent linkage mutations are the same as before,
plus the requirement of taking configfs_dirent_lock. For configfs_dirent
walking, one can either take the appropriate i_mutex as before, or take
configfs_dirent_lock.

The spinlock could actually be a mutex, but the critical sections are either
O(1) or should not be too long (default groups walking in last patch).

ChangeLog:
  - Clarify the comment on configfs_dirent_lock usage
  - Move sd->s_element init before linking the new dirent
  - In lseek(), do not release configfs_dirent_lock before the dirent is
    relinked.

Signed-off-by: Louis Rilling <Louis.Rilling@kerlabs.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index cca9860..5a33b58 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -26,6 +26,7 @@
 
 #include <linux/slab.h>
 #include <linux/list.h>
+#include <linux/spinlock.h>
 
 struct configfs_dirent {
 	atomic_t		s_count;
@@ -49,6 +50,8 @@ struct configfs_dirent {
 #define CONFIGFS_USET_DROPPING	0x0100
 #define CONFIGFS_NOT_PINNED	(CONFIGFS_ITEM_ATTR)
 
+extern spinlock_t configfs_dirent_lock;
+
 extern struct vfsmount * configfs_mount;
 extern struct kmem_cache *configfs_dir_cachep;
 
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index a48dc7d..2619f48 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -35,6 +35,14 @@
 #include "configfs_internal.h"
 
 DECLARE_RWSEM(configfs_rename_sem);
+/*
+ * Protects mutations of configfs_dirent linkage together with proper i_mutex
+ * Mutators of configfs_dirent linkage must *both* have the proper inode locked
+ * and configfs_dirent_lock locked, in that order.
+ * This allows one to safely traverse configfs_dirent trees without having to
+ * lock inodes.
+ */
+DEFINE_SPINLOCK(configfs_dirent_lock);
 
 static void configfs_d_iput(struct dentry * dentry,
 			    struct inode * inode)
@@ -79,8 +87,10 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * pare
 	atomic_set(&sd->s_count, 1);
 	INIT_LIST_HEAD(&sd->s_links);
 	INIT_LIST_HEAD(&sd->s_children);
-	list_add(&sd->s_sibling, &parent_sd->s_children);
 	sd->s_element = element;
+	spin_lock(&configfs_dirent_lock);
+	list_add(&sd->s_sibling, &parent_sd->s_children);
+	spin_unlock(&configfs_dirent_lock);
 
 	return sd;
 }
@@ -173,7 +183,9 @@ static int create_dir(struct config_item * k, struct dentry * p,
 		} else {
 			struct configfs_dirent *sd = d->d_fsdata;
 			if (sd) {
+				spin_lock(&configfs_dirent_lock);
 				list_del_init(&sd->s_sibling);
+				spin_unlock(&configfs_dirent_lock);
 				configfs_put(sd);
 			}
 		}
@@ -224,7 +236,9 @@ int configfs_create_link(struct configfs_symlink *sl,
 		else {
 			struct configfs_dirent *sd = dentry->d_fsdata;
 			if (sd) {
+				spin_lock(&configfs_dirent_lock);
 				list_del_init(&sd->s_sibling);
+				spin_unlock(&configfs_dirent_lock);
 				configfs_put(sd);
 			}
 		}
@@ -238,7 +252,9 @@ static void remove_dir(struct dentry * d)
 	struct configfs_dirent * sd;
 
 	sd = d->d_fsdata;
+	spin_lock(&configfs_dirent_lock);
 	list_del_init(&sd->s_sibling);
+	spin_unlock(&configfs_dirent_lock);
 	configfs_put(sd);
 	if (d->d_inode)
 		simple_rmdir(parent->d_inode,d);
@@ -410,7 +426,9 @@ static void detach_attrs(struct config_item * item)
 	list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
 		if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED))
 			continue;
+		spin_lock(&configfs_dirent_lock);
 		list_del_init(&sd->s_sibling);
+		spin_unlock(&configfs_dirent_lock);
 		configfs_drop_dentry(sd, dentry);
 		configfs_put(sd);
 	}
@@ -1268,7 +1286,9 @@ static int configfs_dir_close(struct inode *inode, struct file *file)
 	struct configfs_dirent * cursor = file->private_data;
 
 	mutex_lock(&dentry->d_inode->i_mutex);
+	spin_lock(&configfs_dirent_lock);
 	list_del_init(&cursor->s_sibling);
+	spin_unlock(&configfs_dirent_lock);
 	mutex_unlock(&dentry->d_inode->i_mutex);
 
 	release_configfs_dirent(cursor);
@@ -1308,7 +1328,9 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir
 			/* fallthrough */
 		default:
 			if (filp->f_pos == 2) {
+				spin_lock(&configfs_dirent_lock);
 				list_move(q, &parent_sd->s_children);
+				spin_unlock(&configfs_dirent_lock);
 			}
 			for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
 				struct configfs_dirent *next;
@@ -1331,7 +1353,9 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir
 						 dt_type(next)) < 0)
 					return 0;
 
+				spin_lock(&configfs_dirent_lock);
 				list_move(q, p);
+				spin_unlock(&configfs_dirent_lock);
 				p = q;
 				filp->f_pos++;
 			}
@@ -1362,6 +1386,7 @@ static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin)
 			struct list_head *p;
 			loff_t n = file->f_pos - 2;
 
+			spin_lock(&configfs_dirent_lock);
 			list_del(&cursor->s_sibling);
 			p = sd->s_children.next;
 			while (n && p != &sd->s_children) {
@@ -1373,6 +1398,7 @@ static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin)
 				p = p->next;
 			}
 			list_add_tail(&cursor->s_sibling, p);
+			spin_unlock(&configfs_dirent_lock);
 		}
 	}
 	mutex_unlock(&dentry->d_inode->i_mutex);
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index b9a1d81..4803ccc 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -247,7 +247,9 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
 		if (!sd->s_element)
 			continue;
 		if (!strcmp(configfs_get_name(sd), name)) {
+			spin_lock(&configfs_dirent_lock);
 			list_del_init(&sd->s_sibling);
+			spin_unlock(&configfs_dirent_lock);
 			configfs_drop_dentry(sd, dir);
 			configfs_put(sd);
 			break;
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 2a731ef..676c84c 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -169,7 +169,9 @@ int configfs_unlink(struct inode *dir, struct dentry *dentry)
 	parent_item = configfs_get_config_item(dentry->d_parent);
 	type = parent_item->ci_type;
 
+	spin_lock(&configfs_dirent_lock);
 	list_del_init(&sd->s_sibling);
+	spin_unlock(&configfs_dirent_lock);
 	configfs_drop_dentry(sd, dentry->d_parent);
 	dput(dentry);
 	configfs_put(sd);
-- 
cgit v0.10.2


From 5301a77da2da1e4c22573e0e8d394a653b8ad9f9 Mon Sep 17 00:00:00 2001
From: Louis Rilling <Louis.Rilling@kerlabs.com>
Date: Mon, 16 Jun 2008 19:00:59 +0200
Subject: configfs: Protect configfs_dirent s_links list mutations

Symlinks to a config_item are listed under its configfs_dirent s_links, but the
list mutations are not protected by any common lock.

This patch uses the configfs_dirent_lock spinlock to add the necessary
protection.

Note: we should also protect the list_empty() test in configfs_detach_prep(),
but:
1/ the lock should not be released immediately, because nothing would prevent
the list from being filled after a successful list_empty() test, which makes
the problem tricky;
2/ this will be solved by the rmdir() vs rename() deadlock bugfix.

Signed-off-by: Louis Rilling <Louis.Rilling@kerlabs.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 2619f48..a08e5c2 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -37,10 +37,11 @@
 DECLARE_RWSEM(configfs_rename_sem);
 /*
  * Protects mutations of configfs_dirent linkage together with proper i_mutex
+ * Also protects mutations of symlinks linkage to target configfs_dirent
  * Mutators of configfs_dirent linkage must *both* have the proper inode locked
  * and configfs_dirent_lock locked, in that order.
- * This allows one to safely traverse configfs_dirent trees without having to
- * lock inodes.
+ * This allows one to safely traverse configfs_dirent trees and symlinks without
+ * having to lock inodes.
  */
 DEFINE_SPINLOCK(configfs_dirent_lock);
 
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 676c84c..faeb441 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -77,12 +77,15 @@ static int create_link(struct config_item *parent_item,
 	sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL);
 	if (sl) {
 		sl->sl_target = config_item_get(item);
-		/* FIXME: needs a lock, I'd bet */
+		spin_lock(&configfs_dirent_lock);
 		list_add(&sl->sl_list, &target_sd->s_links);
+		spin_unlock(&configfs_dirent_lock);
 		ret = configfs_create_link(sl, parent_item->ci_dentry,
 					   dentry);
 		if (ret) {
+			spin_lock(&configfs_dirent_lock);
 			list_del_init(&sl->sl_list);
+			spin_unlock(&configfs_dirent_lock);
 			config_item_put(item);
 			kfree(sl);
 		}
@@ -186,8 +189,9 @@ int configfs_unlink(struct inode *dir, struct dentry *dentry)
 		type->ct_item_ops->drop_link(parent_item,
 					       sl->sl_target);
 
-	/* FIXME: Needs lock */
+	spin_lock(&configfs_dirent_lock);
 	list_del_init(&sl->sl_list);
+	spin_unlock(&configfs_dirent_lock);
 
 	/* Put reference from create_link() */
 	config_item_put(sl->sl_target);
-- 
cgit v0.10.2


From 107ed40bd070df5e4a0a012042c45c40963dc574 Mon Sep 17 00:00:00 2001
From: Louis Rilling <Louis.Rilling@kerlabs.com>
Date: Mon, 16 Jun 2008 19:01:00 +0200
Subject: configfs: Make configfs_new_dirent() return error code instead of
 NULL

This patch makes configfs_new_dirent() return a negative error code (as an
ERR_PTR) instead of NULL, which will be useful in the next patch to
differentiate ENOMEM from ENOENT.

Signed-off-by: Louis Rilling <Louis.Rilling@kerlabs.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index a08e5c2..918a332 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -30,6 +30,7 @@
 #include <linux/mount.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/err.h>
 
 #include <linux/configfs.h>
 #include "configfs_internal.h"
@@ -83,7 +84,7 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * pare
 
 	sd = kmem_cache_zalloc(configfs_dir_cachep, GFP_KERNEL);
 	if (!sd)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	atomic_set(&sd->s_count, 1);
 	INIT_LIST_HEAD(&sd->s_links);
@@ -129,8 +130,8 @@ int configfs_make_dirent(struct configfs_dirent * parent_sd,
 	struct configfs_dirent * sd;
 
 	sd = configfs_new_dirent(parent_sd, element);
-	if (!sd)
-		return -ENOMEM;
+	if (IS_ERR(sd))
+		return PTR_ERR(sd);
 
 	sd->s_mode = mode;
 	sd->s_type = type;
@@ -1277,7 +1278,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
 	file->private_data = configfs_new_dirent(parent_sd, NULL);
 	mutex_unlock(&dentry->d_inode->i_mutex);
 
-	return file->private_data ? 0 : -ENOMEM;
+	return IS_ERR(file->private_data) ? PTR_ERR(file->private_data) : 0;
 
 }
 
-- 
cgit v0.10.2


From b3e76af87441fc36eef3516d73ab2314e7b2d911 Mon Sep 17 00:00:00 2001
From: Louis Rilling <Louis.Rilling@kerlabs.com>
Date: Mon, 16 Jun 2008 19:01:01 +0200
Subject: configfs: Fix deadlock with racing rmdir() and rename()

This patch fixes the deadlock between racing sys_rename() and configfs_rmdir().

The idea is to avoid locking i_mutexes of default groups in
configfs_detach_prep(), and rely instead on the new configfs_dirent_lock to
protect against configfs_dirent's linkage mutations. To ensure that an mkdir()
racing with rmdir() will not create new items in a to-be-removed default group,
we make configfs_new_dirent() check for the CONFIGFS_USET_DROPPING flag right
before linking the new dirent, and return error if the flag is set. This makes
racing mkdir()/symlink()/dir_open() fail in places where errors could already
happen, resp. in (attach_item()|attach_group())/create_link()/new_dirent().

configfs_depend() remains safe since it locks all the path from configfs root,
and is thus mutually exclusive with rmdir().

An advantage of this is that detach_groups() now unconditionally takes the
default groups' i_mutex, which makes it more consistent with populate_groups().

Signed-off-by: Louis Rilling <Louis.Rilling@kerlabs.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 918a332..d5b5985 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -43,6 +43,10 @@ DECLARE_RWSEM(configfs_rename_sem);
  * and configfs_dirent_lock locked, in that order.
  * This allows one to safely traverse configfs_dirent trees and symlinks without
  * having to lock inodes.
+ *
+ * Protects setting of CONFIGFS_USET_DROPPING: checking the flag
+ * unlocked is not reliable unless in detach_groups() called from
+ * rmdir()/unregister() and from configfs_attach_group()
  */
 DEFINE_SPINLOCK(configfs_dirent_lock);
 
@@ -91,6 +95,11 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * pare
 	INIT_LIST_HEAD(&sd->s_children);
 	sd->s_element = element;
 	spin_lock(&configfs_dirent_lock);
+	if (parent_sd->s_type & CONFIGFS_USET_DROPPING) {
+		spin_unlock(&configfs_dirent_lock);
+		kmem_cache_free(configfs_dir_cachep, sd);
+		return ERR_PTR(-ENOENT);
+	}
 	list_add(&sd->s_sibling, &parent_sd->s_children);
 	spin_unlock(&configfs_dirent_lock);
 
@@ -349,11 +358,11 @@ static struct dentry * configfs_lookup(struct inode *dir,
 
 /*
  * Only subdirectories count here.  Files (CONFIGFS_NOT_PINNED) are
- * attributes and are removed by rmdir().  We recurse, taking i_mutex
- * on all children that are candidates for default detach.  If the
- * result is clean, then configfs_detach_group() will handle dropping
- * i_mutex.  If there is an error, the caller will clean up the i_mutex
- * holders via configfs_detach_rollback().
+ * attributes and are removed by rmdir().  We recurse, setting
+ * CONFIGFS_USET_DROPPING on all children that are candidates for
+ * default detach.
+ * If there is an error, the caller will reset the flags via
+ * configfs_detach_rollback().
  */
 static int configfs_detach_prep(struct dentry *dentry)
 {
@@ -370,8 +379,7 @@ static int configfs_detach_prep(struct dentry *dentry)
 		if (sd->s_type & CONFIGFS_NOT_PINNED)
 			continue;
 		if (sd->s_type & CONFIGFS_USET_DEFAULT) {
-			mutex_lock(&sd->s_dentry->d_inode->i_mutex);
-			/* Mark that we've taken i_mutex */
+			/* Mark that we're trying to drop the group */
 			sd->s_type |= CONFIGFS_USET_DROPPING;
 
 			/*
@@ -392,7 +400,7 @@ out:
 }
 
 /*
- * Walk the tree, dropping i_mutex wherever CONFIGFS_USET_DROPPING is
+ * Walk the tree, resetting CONFIGFS_USET_DROPPING wherever it was
  * set.
  */
 static void configfs_detach_rollback(struct dentry *dentry)
@@ -403,11 +411,7 @@ static void configfs_detach_rollback(struct dentry *dentry)
 	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
 		if (sd->s_type & CONFIGFS_USET_DEFAULT) {
 			configfs_detach_rollback(sd->s_dentry);
-
-			if (sd->s_type & CONFIGFS_USET_DROPPING) {
-				sd->s_type &= ~CONFIGFS_USET_DROPPING;
-				mutex_unlock(&sd->s_dentry->d_inode->i_mutex);
-			}
+			sd->s_type &= ~CONFIGFS_USET_DROPPING;
 		}
 	}
 }
@@ -486,16 +490,12 @@ static void detach_groups(struct config_group *group)
 
 		child = sd->s_dentry;
 
+		mutex_lock(&child->d_inode->i_mutex);
+
 		configfs_detach_group(sd->s_element);
 		child->d_inode->i_flags |= S_DEAD;
 
-		/*
-		 * From rmdir/unregister, a configfs_detach_prep() pass
-		 * has taken our i_mutex for us.  Drop it.
-		 * From mkdir/register cleanup, there is no sem held.
-		 */
-		if (sd->s_type & CONFIGFS_USET_DROPPING)
-			mutex_unlock(&child->d_inode->i_mutex);
+		mutex_unlock(&child->d_inode->i_mutex);
 
 		d_delete(child);
 		dput(child);
@@ -1181,12 +1181,15 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
 		return -EINVAL;
 	}
 
+	spin_lock(&configfs_dirent_lock);
 	ret = configfs_detach_prep(dentry);
 	if (ret) {
 		configfs_detach_rollback(dentry);
+		spin_unlock(&configfs_dirent_lock);
 		config_item_put(parent_item);
 		return ret;
 	}
+	spin_unlock(&configfs_dirent_lock);
 
 	/* Get a working ref for the duration of this function */
 	item = configfs_get_config_item(dentry);
@@ -1476,9 +1479,11 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
 	mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex,
 			  I_MUTEX_PARENT);
 	mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
+	spin_lock(&configfs_dirent_lock);
 	if (configfs_detach_prep(dentry)) {
 		printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n");
 	}
+	spin_unlock(&configfs_dirent_lock);
 	configfs_detach_group(&group->cg_item);
 	dentry->d_inode->i_flags |= S_DEAD;
 	mutex_unlock(&dentry->d_inode->i_mutex);
-- 
cgit v0.10.2


From 6d8344baee99402de58b5fa5dfea197242955c15 Mon Sep 17 00:00:00 2001
From: Louis Rilling <Louis.Rilling@kerlabs.com>
Date: Mon, 16 Jun 2008 19:01:02 +0200
Subject: configfs: Fix failing mkdir() making racing rmdir() fail

When fixing the rename() vs rmdir() deadlock, we stopped locking default groups'
inodes in configfs_detach_prep(), letting racing mkdir() in default groups
proceed concurrently. This allows races like the one below to happen, in which
a failing mkdir() makes rmdir() fail, even though the group to remove has no
user-created directory under it in the end.

	process A: 			process B:
	/* PWD=A/B */
	mkdir("C")
	  make_item("C")
	  attach_group("C")
					rmdir("A")
					  detach_prep("A")
					    detach_prep("B")
					      error because of "C"
					  return -ENOTEMPTY
	    attach_group("C/D")
	      error (eg -ENOMEM)
	  return -ENOMEM

This patch prevents such scenarios by making rmdir() wait as long as
detach_prep() fails because a racing mkdir() is in the middle of attach_group().
To achieve this, mkdir() sets a flag CONFIGFS_USET_IN_MKDIR in parent's
configfs_dirent before calling attach_group(), and clears the flag once
attach_group() is done. detach_prep() fails with -EAGAIN whenever the flag is
hit and returns the guilty inode's mutex so that rmdir() can wait on it.

Signed-off-by: Louis Rilling <Louis.Rilling@kerlabs.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index 5a33b58..da015c1 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -48,6 +48,7 @@ struct configfs_dirent {
 #define CONFIGFS_USET_DIR	0x0040
 #define CONFIGFS_USET_DEFAULT	0x0080
 #define CONFIGFS_USET_DROPPING	0x0100
+#define CONFIGFS_USET_IN_MKDIR	0x0200
 #define CONFIGFS_NOT_PINNED	(CONFIGFS_ITEM_ATTR)
 
 extern spinlock_t configfs_dirent_lock;
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index d5b5985..614e382 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -364,7 +364,7 @@ static struct dentry * configfs_lookup(struct inode *dir,
  * If there is an error, the caller will reset the flags via
  * configfs_detach_rollback().
  */
-static int configfs_detach_prep(struct dentry *dentry)
+static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex)
 {
 	struct configfs_dirent *parent_sd = dentry->d_fsdata;
 	struct configfs_dirent *sd;
@@ -379,6 +379,12 @@ static int configfs_detach_prep(struct dentry *dentry)
 		if (sd->s_type & CONFIGFS_NOT_PINNED)
 			continue;
 		if (sd->s_type & CONFIGFS_USET_DEFAULT) {
+			/* Abort if racing with mkdir() */
+			if (sd->s_type & CONFIGFS_USET_IN_MKDIR) {
+				if (wait_mutex)
+					*wait_mutex = &sd->s_dentry->d_inode->i_mutex;
+				return -EAGAIN;
+			}
 			/* Mark that we're trying to drop the group */
 			sd->s_type |= CONFIGFS_USET_DROPPING;
 
@@ -386,7 +392,7 @@ static int configfs_detach_prep(struct dentry *dentry)
 			 * Yup, recursive.  If there's a problem, blame
 			 * deep nesting of default_groups
 			 */
-			ret = configfs_detach_prep(sd->s_dentry);
+			ret = configfs_detach_prep(sd->s_dentry, wait_mutex);
 			if (!ret)
 				continue;
 		} else
@@ -1113,11 +1119,26 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	 */
 	module_got = 1;
 
+	/*
+	 * Make racing rmdir() fail if it did not tag parent with
+	 * CONFIGFS_USET_DROPPING
+	 * Note: if CONFIGFS_USET_DROPPING is already set, attach_group() will
+	 * fail and let rmdir() terminate correctly
+	 */
+	spin_lock(&configfs_dirent_lock);
+	/* This will make configfs_detach_prep() fail */
+	sd->s_type |= CONFIGFS_USET_IN_MKDIR;
+	spin_unlock(&configfs_dirent_lock);
+
 	if (group)
 		ret = configfs_attach_group(parent_item, item, dentry);
 	else
 		ret = configfs_attach_item(parent_item, item, dentry);
 
+	spin_lock(&configfs_dirent_lock);
+	sd->s_type &= ~CONFIGFS_USET_IN_MKDIR;
+	spin_unlock(&configfs_dirent_lock);
+
 out_unlink:
 	if (ret) {
 		/* Tear down everything we built up */
@@ -1182,13 +1203,25 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
 	}
 
 	spin_lock(&configfs_dirent_lock);
-	ret = configfs_detach_prep(dentry);
-	if (ret) {
-		configfs_detach_rollback(dentry);
-		spin_unlock(&configfs_dirent_lock);
-		config_item_put(parent_item);
-		return ret;
-	}
+	do {
+		struct mutex *wait_mutex;
+
+		ret = configfs_detach_prep(dentry, &wait_mutex);
+		if (ret) {
+			configfs_detach_rollback(dentry);
+			spin_unlock(&configfs_dirent_lock);
+			if (ret != -EAGAIN) {
+				config_item_put(parent_item);
+				return ret;
+			}
+
+			/* Wait until the racing operation terminates */
+			mutex_lock(wait_mutex);
+			mutex_unlock(wait_mutex);
+
+			spin_lock(&configfs_dirent_lock);
+		}
+	} while (ret == -EAGAIN);
 	spin_unlock(&configfs_dirent_lock);
 
 	/* Get a working ref for the duration of this function */
@@ -1480,7 +1513,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
 			  I_MUTEX_PARENT);
 	mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
 	spin_lock(&configfs_dirent_lock);
-	if (configfs_detach_prep(dentry)) {
+	if (configfs_detach_prep(dentry, NULL)) {
 		printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n");
 	}
 	spin_unlock(&configfs_dirent_lock);
-- 
cgit v0.10.2


From 11c3b79218390a139f2d474ee1e983a672d5839a Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 12 Jun 2008 14:00:18 -0700
Subject: configfs: Allow ->make_item() and ->make_group() to return detailed
 errors.

The configfs operations ->make_item() and ->make_group() currently
return a new item/group.  A return of NULL signifies an error.  Because
of this, -ENOMEM is the only return code bubbled up the stack.

Multiple folks have requested the ability to return specific error codes
when these operations fail.  This patch adds that ability by changing the
->make_item/group() ops to return an int.

Also updated are the in-kernel users of configfs.

Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt
index 44c97e6..15838d7 100644
--- a/Documentation/filesystems/configfs/configfs.txt
+++ b/Documentation/filesystems/configfs/configfs.txt
@@ -233,10 +233,12 @@ accomplished via the group operations specified on the group's
 config_item_type.
 
 	struct configfs_group_operations {
-		struct config_item *(*make_item)(struct config_group *group,
-						 const char *name);
-		struct config_group *(*make_group)(struct config_group *group,
-						   const char *name);
+		int (*make_item)(struct config_group *group,
+				 const char *name,
+				 struct config_item **new_item);
+		int (*make_group)(struct config_group *group,
+				  const char *name,
+				  struct config_group **new_group);
 		int (*commit_item)(struct config_item *item);
 		void (*disconnect_notify)(struct config_group *group,
 					  struct config_item *item);
diff --git a/Documentation/filesystems/configfs/configfs_example.c b/Documentation/filesystems/configfs/configfs_example.c
index 25151fd..0b422ac 100644
--- a/Documentation/filesystems/configfs/configfs_example.c
+++ b/Documentation/filesystems/configfs/configfs_example.c
@@ -273,13 +273,13 @@ static inline struct simple_children *to_simple_children(struct config_item *ite
 	return item ? container_of(to_config_group(item), struct simple_children, group) : NULL;
 }
 
-static struct config_item *simple_children_make_item(struct config_group *group, const char *name)
+static int simple_children_make_item(struct config_group *group, const char *name, struct config_item **new_item)
 {
 	struct simple_child *simple_child;
 
 	simple_child = kzalloc(sizeof(struct simple_child), GFP_KERNEL);
 	if (!simple_child)
-		return NULL;
+		return -ENOMEM;
 
 
 	config_item_init_type_name(&simple_child->item, name,
@@ -287,7 +287,8 @@ static struct config_item *simple_children_make_item(struct config_group *group,
 
 	simple_child->storeme = 0;
 
-	return &simple_child->item;
+	*new_item = &simple_child->item;
+	return 0;
 }
 
 static struct configfs_attribute simple_children_attr_description = {
@@ -359,20 +360,21 @@ static struct configfs_subsystem simple_children_subsys = {
  * children of its own.
  */
 
-static struct config_group *group_children_make_group(struct config_group *group, const char *name)
+static int group_children_make_group(struct config_group *group, const char *name, struct config_group **new_group)
 {
 	struct simple_children *simple_children;
 
 	simple_children = kzalloc(sizeof(struct simple_children),
 				  GFP_KERNEL);
 	if (!simple_children)
-		return NULL;
+		return -ENOMEM;
 
 
 	config_group_init_type_name(&simple_children->group, name,
 				    &simple_children_type);
 
-	return &simple_children->group;
+	*new_group = &simple_children->group;
+	return 0;
 }
 
 static struct configfs_attribute group_children_attr_description = {
diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c
index 665341e..387a133 100644
--- a/drivers/net/netconsole.c
+++ b/drivers/net/netconsole.c
@@ -585,8 +585,9 @@ static struct config_item_type netconsole_target_type = {
  * Group operations and type for netconsole_subsys.
  */
 
-static struct config_item *make_netconsole_target(struct config_group *group,
-						  const char *name)
+static int make_netconsole_target(struct config_group *group,
+				  const char *name,
+				  struct config_item **new_item)
 {
 	unsigned long flags;
 	struct netconsole_target *nt;
@@ -598,7 +599,7 @@ static struct config_item *make_netconsole_target(struct config_group *group,
 	nt = kzalloc(sizeof(*nt), GFP_KERNEL);
 	if (!nt) {
 		printk(KERN_ERR "netconsole: failed to allocate memory\n");
-		return NULL;
+		return -ENOMEM;
 	}
 
 	nt->np.name = "netconsole";
@@ -615,7 +616,8 @@ static struct config_item *make_netconsole_target(struct config_group *group,
 	list_add(&nt->list, &target_list);
 	spin_unlock_irqrestore(&target_list_lock, flags);
 
-	return &nt->item;
+	*new_item = &nt->item;
+	return 0;
 }
 
 static void drop_netconsole_target(struct config_group *group,
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 614e382..0e64312 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1073,25 +1073,24 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	group = NULL;
 	item = NULL;
 	if (type->ct_group_ops->make_group) {
-		group = type->ct_group_ops->make_group(to_config_group(parent_item), name);
-		if (group) {
+		ret = type->ct_group_ops->make_group(to_config_group(parent_item), name, &group);
+		if (!ret) {
 			link_group(to_config_group(parent_item), group);
 			item = &group->cg_item;
 		}
 	} else {
-		item = type->ct_group_ops->make_item(to_config_group(parent_item), name);
-		if (item)
+		ret = type->ct_group_ops->make_item(to_config_group(parent_item), name, &item);
+		if (!ret)
 			link_obj(parent_item, item);
 	}
 	mutex_unlock(&subsys->su_mutex);
 
 	kfree(name);
-	if (!item) {
+	if (ret) {
 		/*
-		 * If item == NULL, then link_obj() was never called.
+		 * If ret != 0, then link_obj() was never called.
 		 * There are no extra references to clean up.
 		 */
-		ret = -ENOMEM;
 		goto out_put;
 	}
 
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index eac23bd..492d8ca 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -41,16 +41,20 @@ struct comm;
 struct nodes;
 struct node;
 
-static struct config_group *make_cluster(struct config_group *, const char *);
+static int make_cluster(struct config_group *, const char *,
+			struct config_group **);
 static void drop_cluster(struct config_group *, struct config_item *);
 static void release_cluster(struct config_item *);
-static struct config_group *make_space(struct config_group *, const char *);
+static int make_space(struct config_group *, const char *,
+		      struct config_group **);
 static void drop_space(struct config_group *, struct config_item *);
 static void release_space(struct config_item *);
-static struct config_item *make_comm(struct config_group *, const char *);
+static int make_comm(struct config_group *, const char *,
+		     struct config_item **);
 static void drop_comm(struct config_group *, struct config_item *);
 static void release_comm(struct config_item *);
-static struct config_item *make_node(struct config_group *, const char *);
+static int make_node(struct config_group *, const char *,
+		     struct config_item **);
 static void drop_node(struct config_group *, struct config_item *);
 static void release_node(struct config_item *);
 
@@ -392,8 +396,8 @@ static struct node *to_node(struct config_item *i)
 	return i ? container_of(i, struct node, item) : NULL;
 }
 
-static struct config_group *make_cluster(struct config_group *g,
-					 const char *name)
+static int make_cluster(struct config_group *g, const char *name,
+			struct config_group **new_g)
 {
 	struct cluster *cl = NULL;
 	struct spaces *sps = NULL;
@@ -431,14 +435,15 @@ static struct config_group *make_cluster(struct config_group *g,
 
 	space_list = &sps->ss_group;
 	comm_list = &cms->cs_group;
-	return &cl->group;
+	*new_g = &cl->group;
+	return 0;
 
  fail:
 	kfree(cl);
 	kfree(gps);
 	kfree(sps);
 	kfree(cms);
-	return NULL;
+	return -ENOMEM;
 }
 
 static void drop_cluster(struct config_group *g, struct config_item *i)
@@ -466,7 +471,8 @@ static void release_cluster(struct config_item *i)
 	kfree(cl);
 }
 
-static struct config_group *make_space(struct config_group *g, const char *name)
+static int make_space(struct config_group *g, const char *name,
+		      struct config_group **new_g)
 {
 	struct space *sp = NULL;
 	struct nodes *nds = NULL;
@@ -489,13 +495,14 @@ static struct config_group *make_space(struct config_group *g, const char *name)
 	INIT_LIST_HEAD(&sp->members);
 	mutex_init(&sp->members_lock);
 	sp->members_count = 0;
-	return &sp->group;
+	*new_g = &sp->group;
+	return 0;
 
  fail:
 	kfree(sp);
 	kfree(gps);
 	kfree(nds);
-	return NULL;
+	return -ENOMEM;
 }
 
 static void drop_space(struct config_group *g, struct config_item *i)
@@ -522,19 +529,21 @@ static void release_space(struct config_item *i)
 	kfree(sp);
 }
 
-static struct config_item *make_comm(struct config_group *g, const char *name)
+static int make_comm(struct config_group *g, const char *name,
+		     struct config_item **new_i)
 {
 	struct comm *cm;
 
 	cm = kzalloc(sizeof(struct comm), GFP_KERNEL);
 	if (!cm)
-		return NULL;
+		return -ENOMEM;
 
 	config_item_init_type_name(&cm->item, name, &comm_type);
 	cm->nodeid = -1;
 	cm->local = 0;
 	cm->addr_count = 0;
-	return &cm->item;
+	*new_i = &cm->item;
+	return 0;
 }
 
 static void drop_comm(struct config_group *g, struct config_item *i)
@@ -554,14 +563,15 @@ static void release_comm(struct config_item *i)
 	kfree(cm);
 }
 
-static struct config_item *make_node(struct config_group *g, const char *name)
+static int make_node(struct config_group *g, const char *name,
+		     struct config_item **new_i)
 {
 	struct space *sp = to_space(g->cg_item.ci_parent);
 	struct node *nd;
 
 	nd = kzalloc(sizeof(struct node), GFP_KERNEL);
 	if (!nd)
-		return NULL;
+		return -ENOMEM;
 
 	config_item_init_type_name(&nd->item, name, &node_type);
 	nd->nodeid = -1;
@@ -573,7 +583,8 @@ static struct config_item *make_node(struct config_group *g, const char *name)
 	sp->members_count++;
 	mutex_unlock(&sp->members_lock);
 
-	return &nd->item;
+	*new_i = &nd->item;
+	return 0;
 }
 
 static void drop_node(struct config_group *g, struct config_item *i)
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index f02ccb3..443d108 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1489,25 +1489,28 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group
 		: NULL;
 }
 
-static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
-							  const char *name)
+static int o2hb_heartbeat_group_make_item(struct config_group *group,
+					  const char *name,
+					  struct config_item **new_item)
 {
 	struct o2hb_region *reg = NULL;
-	struct config_item *ret = NULL;
+	int ret = 0;
 
 	reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL);
-	if (reg == NULL)
-		goto out; /* ENOMEM */
+	if (reg == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
 
-	ret = &reg->hr_item;
+	*new_item = &reg->hr_item;
 
 	spin_lock(&o2hb_live_lock);
 	list_add_tail(&reg->hr_all_item, &o2hb_all_regions);
 	spin_unlock(&o2hb_live_lock);
 out:
-	if (ret == NULL)
+	if (ret)
 		kfree(reg);
 
 	return ret;
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index cfdb08b..b364b70 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -644,27 +644,32 @@ out:
 	return ret;
 }
 
-static struct config_item *o2nm_node_group_make_item(struct config_group *group,
-						     const char *name)
+static int o2nm_node_group_make_item(struct config_group *group,
+				     const char *name,
+				     struct config_item **new_item)
 {
 	struct o2nm_node *node = NULL;
-	struct config_item *ret = NULL;
+	int ret = 0;
 
-	if (strlen(name) > O2NM_MAX_NAME_LEN)
-		goto out; /* ENAMETOOLONG */
+	if (strlen(name) > O2NM_MAX_NAME_LEN) {
+		ret = -ENAMETOOLONG;
+		goto out;
+	}
 
 	node = kzalloc(sizeof(struct o2nm_node), GFP_KERNEL);
-	if (node == NULL)
-		goto out; /* ENOMEM */
+	if (node == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */
 	config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
 	spin_lock_init(&node->nd_lock);
 
-	ret = &node->nd_item;
+	*new_item = &node->nd_item;
 
 out:
-	if (ret == NULL)
+	if (ret)
 		kfree(node);
 
 	return ret;
@@ -751,25 +756,31 @@ static struct o2nm_cluster_group *to_o2nm_cluster_group(struct config_group *gro
 }
 #endif
 
-static struct config_group *o2nm_cluster_group_make_group(struct config_group *group,
-							  const char *name)
+static int o2nm_cluster_group_make_group(struct config_group *group,
+					 const char *name,
+					 struct config_group **new_group)
 {
 	struct o2nm_cluster *cluster = NULL;
 	struct o2nm_node_group *ns = NULL;
-	struct config_group *o2hb_group = NULL, *ret = NULL;
+	struct config_group *o2hb_group = NULL;
 	void *defs = NULL;
+	int ret = 0;
 
 	/* this runs under the parent dir's i_mutex; there can be only
 	 * one caller in here at a time */
-	if (o2nm_single_cluster)
-		goto out; /* ENOSPC */
+	if (o2nm_single_cluster) {
+		ret = -ENOSPC;
+		goto out;
+	}
 
 	cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL);
 	ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL);
 	defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
 	o2hb_group = o2hb_alloc_hb_set();
-	if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL)
+	if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL) {
+		ret = -ENOMEM;
 		goto out;
+	}
 
 	config_group_init_type_name(&cluster->cl_group, name,
 				    &o2nm_cluster_type);
@@ -786,11 +797,11 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g
 	cluster->cl_idle_timeout_ms    = O2NET_IDLE_TIMEOUT_MS_DEFAULT;
 	cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT;
 
-	ret = &cluster->cl_group;
+	*new_group = &cluster->cl_group;
 	o2nm_single_cluster = cluster;
 
 out:
-	if (ret == NULL) {
+	if (ret) {
 		kfree(cluster);
 		kfree(ns);
 		o2hb_free_hb_set(o2hb_group);
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
index 3ae65b1..0488f93 100644
--- a/include/linux/configfs.h
+++ b/include/linux/configfs.h
@@ -165,8 +165,8 @@ struct configfs_item_operations {
 };
 
 struct configfs_group_operations {
-	struct config_item *(*make_item)(struct config_group *group, const char *name);
-	struct config_group *(*make_group)(struct config_group *group, const char *name);
+	int (*make_item)(struct config_group *group, const char *name, struct config_item **new_item);
+	int (*make_group)(struct config_group *group, const char *name, struct config_group **new_group);
 	int (*commit_item)(struct config_item *item);
 	void (*disconnect_notify)(struct config_group *group, struct config_item *item);
 	void (*drop_item)(struct config_group *group, struct config_item *item);
-- 
cgit v0.10.2


From e75206517504461778c283b942440ef312e437d5 Mon Sep 17 00:00:00 2001
From: Louis Rilling <Louis.Rilling@kerlabs.com>
Date: Thu, 12 Jun 2008 17:26:47 +0200
Subject: configfs: call drop_link() to cleanup after create_link() failure

When allow_link() succeeds but create_link() fails, the subsystem is not
informed of the failure.

This patch fixes this by calling drop_link() on create_link() failures.

Signed-off-by: Louis Rilling <Louis.Rilling@kerlabs.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index faeb441..0004d18 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -140,8 +140,12 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
 		goto out_put;
 
 	ret = type->ct_item_ops->allow_link(parent_item, target_item);
-	if (!ret)
+	if (!ret) {
 		ret = create_link(parent_item, target_item, dentry);
+		if (ret && type->ct_item_ops->drop_link)
+			type->ct_item_ops->drop_link(parent_item,
+						     target_item);
+	}
 
 	config_item_put(target_item);
 	path_put(&nd.path);
-- 
cgit v0.10.2


From c0420ad2ca514551ca086510b0e7d17a05c70492 Mon Sep 17 00:00:00 2001
From: Coly Li <coyli@suse.de>
Date: Mon, 30 Jun 2008 18:45:45 +0800
Subject: [PATCH] ocfs2: fix oops in mmap_truncate testing

This patch fixes a mmap_truncate bug which was found by ocfs2 test suite.

In an ocfs2 cluster with more than one node, run the mmap_truncate program,
which races mmap writes and truncates from multiple processes. While the test
is running, a stat from another node forces writeout, causing an oops in
ocfs2_get_block() because it sees a buffer to write which isn't allocated.

This patch fixes the bug by clearing the dirty and uptodate bits in the buffer,
leaving the buffer unmapped, and returning.

The fix was suggested by Mark Fasheh, and I coded up the patch.

Signed-off-by: Coly Li <coyli@suse.de>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 17964c0..1db0801 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -174,10 +174,17 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 	 * need to use BH_New is when we're extending i_size on a file
 	 * system which doesn't support holes, in which case BH_New
 	 * allows block_prepare_write() to zero.
+	 *
+	 * If we see this on a sparse file system, then a truncate has
+	 * raced us and removed the cluster. In this case, we clear
+	 * the buffer's dirty and uptodate bits and let the buffer code
+	 * ignore it as a hole.
 	 */
-	mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb),
-			"ino %lu, iblock %llu\n", inode->i_ino,
-			(unsigned long long)iblock);
+	if (create && p_blkno == 0 && ocfs2_sparse_alloc(osb)) {
+		clear_buffer_dirty(bh_result);
+		clear_buffer_uptodate(bh_result);
+		goto bail;
+	}
 
 	/* Treat the unwritten extent as a hole for zeroing purposes. */
 	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
-- 
cgit v0.10.2