From ecfdb33d1fbc7e6e095ba24dac2930208494e734 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 19 Apr 2014 14:44:49 -0400
Subject: acct: encode_comp_t(0) is 0, fortunately...

There was an amusing bogosity in ac_rw calculation - it tried to
do encode_comp_t(encode_comp_t(0) / 1024).  Seeing that comp_t is
a 3-bit exponent + 13-bit mantissa... it's a good thing that 0 is
represented by all-bits-clear.

The history of that one is interesting - it was introduced in
2.1.68pre1, when acct.c had been reworked and moved to separate
file.  Two months later (2.1.86) somebody has noticed that the
sucker won't compile - there was no task_struct::io_usage.
At which point the ac_io calculation had changed from
encode_comp_t(current->io_usage) to encode_comp_t(0) and the
bug in the next line (absolutely real back then, had it ever
managed to compile) become a harmless bogosity.  Looks like
nobody has ever noticed until now.

Anyway, let's bury that idiocy now that it got noticed.  17 years
is long enough...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/kernel/acct.c b/kernel/acct.c
index a1844f1..807ebc5 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -531,9 +531,6 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 	ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
 	ac.ac_exitcode = pacct->ac_exitcode;
 	spin_unlock_irq(&current->sighand->siglock);
-	ac.ac_io = encode_comp_t(0 /* current->io_usage */);	/* %% */
-	ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
-	ac.ac_swaps = encode_comp_t(0);
 
 	/*
 	 * Get freeze protection. If the fs is frozen, just skip the write
-- 
cgit v0.10.2


From ed44724b79d8e03a40665436019cf22baba80d30 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 19 Apr 2014 14:37:20 -0400
Subject: acct: switch to __kernel_write()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/internal.h b/fs/internal.h
index 4657424..9a2edba 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -131,7 +131,6 @@ extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
 /*
  * read_write.c
  */
-extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *);
 extern int rw_verify_area(int, struct file *, const loff_t *, size_t);
 
 /*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e11d60c..4b7d57c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2335,6 +2335,7 @@ extern int do_pipe_flags(int *, int);
 
 extern int kernel_read(struct file *, loff_t, char *, unsigned long);
 extern ssize_t kernel_write(struct file *, const char *, size_t, loff_t);
+extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *);
 extern struct file * open_exec(const char *);
  
 /* fs/dcache.c -- generic fs support functions */
diff --git a/kernel/acct.c b/kernel/acct.c
index 807ebc5..8082d98 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -456,12 +456,16 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 {
 	struct pacct_struct *pacct = &current->signal->pacct;
 	acct_t ac;
-	mm_segment_t fs;
 	unsigned long flim;
 	u64 elapsed, run_time;
 	struct tty_struct *tty;
 	const struct cred *orig_cred;
 
+	/*
+	 * Accounting records are not subject to resource limits.
+	 */
+	flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
 	/* Perform file operations on behalf of whoever enabled accounting */
 	orig_cred = override_creds(file->f_cred);
 
@@ -536,25 +540,14 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 	 * Get freeze protection. If the fs is frozen, just skip the write
 	 * as we could deadlock the system otherwise.
 	 */
-	if (!file_start_write_trylock(file))
-		goto out;
-	/*
-	 * Kernel segment override to datasegment and write it
-	 * to the accounting file.
-	 */
-	fs = get_fs();
-	set_fs(KERNEL_DS);
-	/*
-	 * Accounting records are not subject to resource limits.
-	 */
-	flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
-	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
-	file->f_op->write(file, (char *)&ac,
-			       sizeof(acct_t), &file->f_pos);
-	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
-	set_fs(fs);
-	file_end_write(file);
+	if (file_start_write_trylock(file)) {
+		/* it's been opened O_APPEND, so position is irrelevant */
+		loff_t pos = 0;
+		__kernel_write(file, (char *)&ac, sizeof(acct_t), &pos);
+		file_end_write(file);
+	}
 out:
+	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
 	revert_creds(orig_cred);
 }
 
-- 
cgit v0.10.2


From cdd37e23092c3c6fbbb2e611f8c3d18e676bf28f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 26 Apr 2014 23:45:53 -0400
Subject: separate namespace-independent parts of filling acct_t

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/kernel/acct.c b/kernel/acct.c
index 8082d98..efa891b 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -448,42 +448,20 @@ static u32 encode_float(u64 value)
  *  do_exit() or when switching to a different output file.
  */
 
-/*
- *  do_acct_process does all actual work. Caller holds the reference to file.
- */
-static void do_acct_process(struct bsd_acct_struct *acct,
-		struct pid_namespace *ns, struct file *file)
+static void fill_ac(acct_t *ac)
 {
 	struct pacct_struct *pacct = &current->signal->pacct;
-	acct_t ac;
-	unsigned long flim;
 	u64 elapsed, run_time;
 	struct tty_struct *tty;
-	const struct cred *orig_cred;
-
-	/*
-	 * Accounting records are not subject to resource limits.
-	 */
-	flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
-	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
-	/* Perform file operations on behalf of whoever enabled accounting */
-	orig_cred = override_creds(file->f_cred);
-
-	/*
-	 * First check to see if there is enough free_space to continue
-	 * the process accounting system.
-	 */
-	if (!check_free_space(acct, file))
-		goto out;
 
 	/*
 	 * Fill the accounting struct with the needed info as recorded
 	 * by the different kernel functions.
 	 */
-	memset(&ac, 0, sizeof(acct_t));
+	memset(ac, 0, sizeof(acct_t));
 
-	ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
-	strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
+	ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER;
+	strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
 
 	/* calculate run_time in nsec*/
 	run_time = ktime_get_ns();
@@ -491,27 +469,66 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 	/* convert nsec -> AHZ */
 	elapsed = nsec_to_AHZ(run_time);
 #if ACCT_VERSION==3
-	ac.ac_etime = encode_float(elapsed);
+	ac->ac_etime = encode_float(elapsed);
 #else
-	ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
+	ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
 	                       (unsigned long) elapsed : (unsigned long) -1l);
 #endif
 #if ACCT_VERSION==1 || ACCT_VERSION==2
 	{
 		/* new enlarged etime field */
 		comp2_t etime = encode_comp2_t(elapsed);
-		ac.ac_etime_hi = etime >> 16;
-		ac.ac_etime_lo = (u16) etime;
+		ac->ac_etime_hi = etime >> 16;
+		ac->ac_etime_lo = (u16) etime;
 	}
 #endif
 	do_div(elapsed, AHZ);
-	ac.ac_btime = get_seconds() - elapsed;
+	ac->ac_btime = get_seconds() - elapsed;
+#if ACCT_VERSION==2
+	ac->ac_ahz = AHZ;
+#endif
+
+	spin_lock_irq(&current->sighand->siglock);
+	tty = current->signal->tty;	/* Safe as we hold the siglock */
+	ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
+	ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
+	ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
+	ac->ac_flag = pacct->ac_flag;
+	ac->ac_mem = encode_comp_t(pacct->ac_mem);
+	ac->ac_minflt = encode_comp_t(pacct->ac_minflt);
+	ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
+	ac->ac_exitcode = pacct->ac_exitcode;
+	spin_unlock_irq(&current->sighand->siglock);
+}
+/*
+ *  do_acct_process does all actual work. Caller holds the reference to file.
+ */
+static void do_acct_process(struct bsd_acct_struct *acct,
+		struct pid_namespace *ns, struct file *file)
+{
+	acct_t ac;
+	unsigned long flim;
+	const struct cred *orig_cred;
+
+	/*
+	 * Accounting records are not subject to resource limits.
+	 */
+	flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+	/* Perform file operations on behalf of whoever enabled accounting */
+	orig_cred = override_creds(file->f_cred);
+
+	/*
+	 * First check to see if there is enough free_space to continue
+	 * the process accounting system.
+	 */
+	if (!check_free_space(acct, file))
+		goto out;
+
+	fill_ac(&ac);
 	/* we really need to bite the bullet and change layout */
 	ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
 	ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
-#if ACCT_VERSION==2
-	ac.ac_ahz = AHZ;
-#endif
 #if ACCT_VERSION==1 || ACCT_VERSION==2
 	/* backward-compatible 16 bit fields */
 	ac.ac_uid16 = ac.ac_uid;
@@ -523,19 +540,6 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 	ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
 	rcu_read_unlock();
 #endif
-
-	spin_lock_irq(&current->sighand->siglock);
-	tty = current->signal->tty;	/* Safe as we hold the siglock */
-	ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
-	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
-	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
-	ac.ac_flag = pacct->ac_flag;
-	ac.ac_mem = encode_comp_t(pacct->ac_mem);
-	ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
-	ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
-	ac.ac_exitcode = pacct->ac_exitcode;
-	spin_unlock_irq(&current->sighand->siglock);
-
 	/*
 	 * Get freeze protection. If the fs is frozen, just skip the write
 	 * as we could deadlock the system otherwise.
-- 
cgit v0.10.2


From e25ff11ff16aba000dfe9e568d867e5142c31f16 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 7 May 2014 05:12:09 -0400
Subject: split the slow path in acct_process() off

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/kernel/acct.c b/kernel/acct.c
index efa891b..5118860 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -599,34 +599,35 @@ void acct_collect(long exitcode, int group_dead)
 	spin_unlock_irq(&current->sighand->siglock);
 }
 
-static void acct_process_in_ns(struct pid_namespace *ns)
+static void slow_acct_process(struct pid_namespace *ns)
 {
-	struct file *file = NULL;
-	struct bsd_acct_struct *acct;
+	for ( ; ns; ns = ns->parent) {
+		struct file *file = NULL;
+		struct bsd_acct_struct *acct;
 
-	acct = ns->bacct;
-	/*
-	 * accelerate the common fastpath:
-	 */
-	if (!acct || !acct->file)
-		return;
+		acct = ns->bacct;
+		/*
+		 * accelerate the common fastpath:
+		 */
+		if (!acct || !acct->file)
+			continue;
 
-	spin_lock(&acct_lock);
-	file = acct->file;
-	if (unlikely(!file)) {
+		spin_lock(&acct_lock);
+		file = acct->file;
+		if (unlikely(!file)) {
+			spin_unlock(&acct_lock);
+			continue;
+		}
+		get_file(file);
 		spin_unlock(&acct_lock);
-		return;
-	}
-	get_file(file);
-	spin_unlock(&acct_lock);
 
-	do_acct_process(acct, ns, file);
-	fput(file);
+		do_acct_process(acct, ns, file);
+		fput(file);
+	}
 }
 
 /**
- * acct_process - now just a wrapper around acct_process_in_ns,
- * which in turn is a wrapper around do_acct_process.
+ * acct_process
  *
  * handles process accounting for an exiting task
  */
@@ -639,6 +640,11 @@ void acct_process(void)
 	 * alive and holds its namespace, which in turn holds
 	 * its parent.
 	 */
-	for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent)
-		acct_process_in_ns(ns);
+	for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) {
+		struct bsd_acct_struct *acct = ns->bacct;
+		if (acct && acct->file)
+			break;
+	}
+	if (unlikely(ns))
+		slow_acct_process(ns);
 }
-- 
cgit v0.10.2


From 795a2f22a8eaf749e20a11271a8821bf04ac6d90 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 7 May 2014 05:23:41 -0400
Subject: acct() should honour the limits from the very beginning

We need to check free space on the first write to freshly opened log.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/kernel/acct.c b/kernel/acct.c
index 5118860..8777372 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -180,8 +180,8 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
 	if (file) {
 		acct->file = file;
 		acct->ns = ns;
-		acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
-		acct->active = 1;
+		acct->needcheck = jiffies;
+		acct->active = 0;
 		list_add(&acct->list, &acct_list);
 	}
 	if (old_acct) {
-- 
cgit v0.10.2


From 9df7fa16ee956bf0cdf4a711eac827be92d584bc Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 15 May 2014 06:49:45 -0400
Subject: acct: serialize acct_on()

brute-force - on a global mutex that isn't nested into anything.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/kernel/acct.c b/kernel/acct.c
index 8777372..08963a2 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -241,6 +241,8 @@ static int acct_on(struct filename *pathname)
 	return 0;
 }
 
+static DEFINE_MUTEX(acct_on_mutex);
+
 /**
  * sys_acct - enable/disable process accounting
  * @name: file name for accounting records or NULL to shutdown accounting
@@ -263,7 +265,9 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
 		struct filename *tmp = getname(name);
 		if (IS_ERR(tmp))
 			return PTR_ERR(tmp);
+		mutex_lock(&acct_on_mutex);
 		error = acct_on(tmp);
+		mutex_unlock(&acct_on_mutex);
 		putname(tmp);
 	} else {
 		struct bsd_acct_struct *acct;
-- 
cgit v0.10.2


From b8f00e6be46f4c9a112e05fd692712873c4c4048 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 7 Aug 2014 07:51:03 -0400
Subject: acct: new lifetime rules

Do not reuse bsd_acct_struct after closing the damn thing.
Structure lifetime is controlled by refcount now.  We also
have a mutex in there, held over closing and writing (the
file is O_APPEND, so we are not losing any concurrency).

As the result, we do not need to bother with get_file()/fput()
on log write anymore.  Moreover, do_acct_process() only needs
acct itself; file and pidns are picked from it.

Killed instances are distinguished by having NULL ->ns.
Refcount is protected by acct_lock; anybody taking the
mutex needs to grab a reference first.

The things will get a lot simpler in the next commits - this
is just the minimal chunk switching to the new lifetime rules.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/kernel/acct.c b/kernel/acct.c
index 08963a2..f9ef9db 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -75,15 +75,11 @@ int acct_parm[3] = {4, 2, 30};
 /*
  * External references and all of the globals.
  */
-static void do_acct_process(struct bsd_acct_struct *acct,
-		struct pid_namespace *ns, struct file *);
+static void do_acct_process(struct bsd_acct_struct *acct);
 
-/*
- * This structure is used so that all the data protected by lock
- * can be placed in the same cache line as the lock.  This primes
- * the cache line to have the data after getting the lock.
- */
 struct bsd_acct_struct {
+	long			count;
+	struct mutex		lock;
 	int			active;
 	unsigned long		needcheck;
 	struct file		*file;
@@ -157,39 +153,59 @@ out:
 	return res;
 }
 
-/*
- * Close the old accounting file (if currently open) and then replace
- * it with file (if non-NULL).
- *
- * NOTE: acct_lock MUST be held on entry and exit.
- */
-static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
-		struct pid_namespace *ns)
+static void acct_put(struct bsd_acct_struct *p)
 {
-	struct file *old_acct = NULL;
-	struct pid_namespace *old_ns = NULL;
-
-	if (acct->file) {
-		old_acct = acct->file;
-		old_ns = acct->ns;
-		acct->active = 0;
-		acct->file = NULL;
-		acct->ns = NULL;
-		list_del(&acct->list);
-	}
-	if (file) {
-		acct->file = file;
-		acct->ns = ns;
-		acct->needcheck = jiffies;
-		acct->active = 0;
-		list_add(&acct->list, &acct_list);
+	spin_lock(&acct_lock);
+	if (!--p->count)
+		kfree(p);
+	spin_unlock(&acct_lock);
+}
+
+static struct bsd_acct_struct *acct_get(struct bsd_acct_struct **p)
+{
+	struct bsd_acct_struct *res;
+	spin_lock(&acct_lock);
+again:
+	res = *p;
+	if (res)
+		res->count++;
+	spin_unlock(&acct_lock);
+	if (res) {
+		mutex_lock(&res->lock);
+		if (!res->ns) {
+			mutex_unlock(&res->lock);
+			spin_lock(&acct_lock);
+			if (!--res->count)
+				kfree(res);
+			goto again;
+		}
 	}
-	if (old_acct) {
-		mnt_unpin(old_acct->f_path.mnt);
+	return res;
+}
+
+static void acct_kill(struct bsd_acct_struct *acct,
+		      struct bsd_acct_struct *new)
+{
+	if (acct) {
+		struct file *file = acct->file;
+		struct pid_namespace *ns = acct->ns;
+		spin_lock(&acct_lock);
+		list_del(&acct->list);
+		mnt_unpin(file->f_path.mnt);
 		spin_unlock(&acct_lock);
-		do_acct_process(acct, old_ns, old_acct);
-		filp_close(old_acct, NULL);
+		do_acct_process(acct);
+		filp_close(file, NULL);
 		spin_lock(&acct_lock);
+		ns->bacct = new;
+		if (new) {
+			mnt_pin(new->file->f_path.mnt);
+			list_add(&new->list, &acct_list);
+		}
+		acct->ns = NULL;
+		mutex_unlock(&acct->lock);
+		if (!(acct->count -= 2))
+			kfree(acct);
+		spin_unlock(&acct_lock);
 	}
 }
 
@@ -197,47 +213,50 @@ static int acct_on(struct filename *pathname)
 {
 	struct file *file;
 	struct vfsmount *mnt;
-	struct pid_namespace *ns;
-	struct bsd_acct_struct *acct = NULL;
+	struct pid_namespace *ns = task_active_pid_ns(current);
+	struct bsd_acct_struct *acct, *old;
+
+	acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
+	if (!acct)
+		return -ENOMEM;
 
 	/* Difference from BSD - they don't do O_APPEND */
 	file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
-	if (IS_ERR(file))
+	if (IS_ERR(file)) {
+		kfree(acct);
 		return PTR_ERR(file);
+	}
 
 	if (!S_ISREG(file_inode(file)->i_mode)) {
+		kfree(acct);
 		filp_close(file, NULL);
 		return -EACCES;
 	}
 
 	if (!file->f_op->write) {
+		kfree(acct);
 		filp_close(file, NULL);
 		return -EIO;
 	}
 
-	ns = task_active_pid_ns(current);
-	if (ns->bacct == NULL) {
-		acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
-		if (acct == NULL) {
-			filp_close(file, NULL);
-			return -ENOMEM;
-		}
-	}
+	acct->count = 1;
+	acct->file = file;
+	acct->needcheck = jiffies;
+	acct->ns = ns;
+	mutex_init(&acct->lock);
+	mnt = file->f_path.mnt;
 
-	spin_lock(&acct_lock);
-	if (ns->bacct == NULL) {
+	old = acct_get(&ns->bacct);
+	if (old) {
+		acct_kill(old, acct);
+	} else {
+		spin_lock(&acct_lock);
 		ns->bacct = acct;
-		acct = NULL;
+		mnt_pin(mnt);
+		list_add(&acct->list, &acct_list);
+		spin_unlock(&acct_lock);
 	}
-
-	mnt = file->f_path.mnt;
-	mnt_pin(mnt);
-	acct_file_reopen(ns->bacct, file, ns);
-	spin_unlock(&acct_lock);
-
 	mntput(mnt); /* it's pinned, now give up active reference */
-	kfree(acct);
-
 	return 0;
 }
 
@@ -270,15 +289,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
 		mutex_unlock(&acct_on_mutex);
 		putname(tmp);
 	} else {
-		struct bsd_acct_struct *acct;
-
-		acct = task_active_pid_ns(current)->bacct;
-		if (acct == NULL)
-			return 0;
-
-		spin_lock(&acct_lock);
-		acct_file_reopen(acct, NULL, NULL);
-		spin_unlock(&acct_lock);
+		acct_kill(acct_get(&task_active_pid_ns(current)->bacct), NULL);
 	}
 
 	return error;
@@ -298,8 +309,19 @@ void acct_auto_close_mnt(struct vfsmount *m)
 	spin_lock(&acct_lock);
 restart:
 	list_for_each_entry(acct, &acct_list, list)
-		if (acct->file && acct->file->f_path.mnt == m) {
-			acct_file_reopen(acct, NULL, NULL);
+		if (acct->file->f_path.mnt == m) {
+			acct->count++;
+			spin_unlock(&acct_lock);
+			mutex_lock(&acct->lock);
+			if (!acct->ns) {
+				mutex_unlock(&acct->lock);
+				spin_lock(&acct_lock);
+				if (!--acct->count)
+					kfree(acct);
+				goto restart;
+			}
+			acct_kill(acct, NULL);
+			spin_lock(&acct_lock);
 			goto restart;
 		}
 	spin_unlock(&acct_lock);
@@ -319,8 +341,19 @@ void acct_auto_close(struct super_block *sb)
 	spin_lock(&acct_lock);
 restart:
 	list_for_each_entry(acct, &acct_list, list)
-		if (acct->file && acct->file->f_path.dentry->d_sb == sb) {
-			acct_file_reopen(acct, NULL, NULL);
+		if (acct->file->f_path.dentry->d_sb == sb) {
+			acct->count++;
+			spin_unlock(&acct_lock);
+			mutex_lock(&acct->lock);
+			if (!acct->ns) {
+				mutex_unlock(&acct->lock);
+				spin_lock(&acct_lock);
+				if (!--acct->count)
+					kfree(acct);
+				goto restart;
+			}
+			acct_kill(acct, NULL);
+			spin_lock(&acct_lock);
 			goto restart;
 		}
 	spin_unlock(&acct_lock);
@@ -328,17 +361,7 @@ restart:
 
 void acct_exit_ns(struct pid_namespace *ns)
 {
-	struct bsd_acct_struct *acct = ns->bacct;
-
-	if (acct == NULL)
-		return;
-
-	spin_lock(&acct_lock);
-	if (acct->file != NULL)
-		acct_file_reopen(acct, NULL, NULL);
-	spin_unlock(&acct_lock);
-
-	kfree(acct);
+	acct_kill(acct_get(&ns->bacct), NULL);
 }
 
 /*
@@ -507,12 +530,13 @@ static void fill_ac(acct_t *ac)
 /*
  *  do_acct_process does all actual work. Caller holds the reference to file.
  */
-static void do_acct_process(struct bsd_acct_struct *acct,
-		struct pid_namespace *ns, struct file *file)
+static void do_acct_process(struct bsd_acct_struct *acct)
 {
 	acct_t ac;
 	unsigned long flim;
 	const struct cred *orig_cred;
+	struct pid_namespace *ns = acct->ns;
+	struct file *file = acct->file;
 
 	/*
 	 * Accounting records are not subject to resource limits.
@@ -606,27 +630,12 @@ void acct_collect(long exitcode, int group_dead)
 static void slow_acct_process(struct pid_namespace *ns)
 {
 	for ( ; ns; ns = ns->parent) {
-		struct file *file = NULL;
-		struct bsd_acct_struct *acct;
-
-		acct = ns->bacct;
-		/*
-		 * accelerate the common fastpath:
-		 */
-		if (!acct || !acct->file)
-			continue;
-
-		spin_lock(&acct_lock);
-		file = acct->file;
-		if (unlikely(!file)) {
-			spin_unlock(&acct_lock);
-			continue;
+		struct bsd_acct_struct *acct = acct_get(&ns->bacct);
+		if (acct) {
+			do_acct_process(acct);
+			mutex_unlock(&acct->lock);
+			acct_put(acct);
 		}
-		get_file(file);
-		spin_unlock(&acct_lock);
-
-		do_acct_process(acct, ns, file);
-		fput(file);
 	}
 }
 
@@ -645,8 +654,7 @@ void acct_process(void)
 	 * its parent.
 	 */
 	for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) {
-		struct bsd_acct_struct *acct = ns->bacct;
-		if (acct && acct->file)
+		if (ns->bacct)
 			break;
 	}
 	if (unlikely(ns))
-- 
cgit v0.10.2


From 54a4d58a6459a93fc6ee898354b3d2ffb80dd03a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 19 Apr 2014 14:24:18 -0400
Subject: acct: simplify check_free_space()

a) file can't be NULL
b) file can't be changed under us
c) all writes are serialized by acct->lock; no need to mess with
spinlock there.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/kernel/acct.c b/kernel/acct.c
index f9ef9db..019f012 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -93,64 +93,36 @@ static LIST_HEAD(acct_list);
 /*
  * Check the amount of free space and suspend/resume accordingly.
  */
-static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
+static int check_free_space(struct bsd_acct_struct *acct)
 {
 	struct kstatfs sbuf;
-	int res;
-	int act;
-	u64 resume;
-	u64 suspend;
 
-	spin_lock(&acct_lock);
-	res = acct->active;
-	if (!file || time_is_before_jiffies(acct->needcheck))
+	if (time_is_before_jiffies(acct->needcheck))
 		goto out;
-	spin_unlock(&acct_lock);
 
 	/* May block */
-	if (vfs_statfs(&file->f_path, &sbuf))
-		return res;
-	suspend = sbuf.f_blocks * SUSPEND;
-	resume = sbuf.f_blocks * RESUME;
-
-	do_div(suspend, 100);
-	do_div(resume, 100);
-
-	if (sbuf.f_bavail <= suspend)
-		act = -1;
-	else if (sbuf.f_bavail >= resume)
-		act = 1;
-	else
-		act = 0;
-
-	/*
-	 * If some joker switched acct->file under us we'ld better be
-	 * silent and _not_ touch anything.
-	 */
-	spin_lock(&acct_lock);
-	if (file != acct->file) {
-		if (act)
-			res = act > 0;
+	if (vfs_statfs(&acct->file->f_path, &sbuf))
 		goto out;
-	}
 
 	if (acct->active) {
-		if (act < 0) {
+		u64 suspend = sbuf.f_blocks * SUSPEND;
+		do_div(suspend, 100);
+		if (sbuf.f_bavail <= suspend) {
 			acct->active = 0;
 			printk(KERN_INFO "Process accounting paused\n");
 		}
 	} else {
-		if (act > 0) {
+		u64 resume = sbuf.f_blocks * RESUME;
+		do_div(resume, 100);
+		if (sbuf.f_bavail >= resume) {
 			acct->active = 1;
 			printk(KERN_INFO "Process accounting resumed\n");
 		}
 	}
 
 	acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
-	res = acct->active;
 out:
-	spin_unlock(&acct_lock);
-	return res;
+	return acct->active;
 }
 
 static void acct_put(struct bsd_acct_struct *p)
@@ -550,7 +522,7 @@ static void do_acct_process(struct bsd_acct_struct *acct)
 	 * First check to see if there is enough free_space to continue
 	 * the process accounting system.
 	 */
-	if (!check_free_space(acct, file))
+	if (!check_free_space(acct))
 		goto out;
 
 	fill_ac(&ac);
-- 
cgit v0.10.2


From 215752fce31c80f3b3a1530bc7cddb3ba6a69b3a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 7 Aug 2014 06:23:41 -0400
Subject: acct: get rid of acct_list

Put these suckers on per-vfsmount and per-superblock lists instead.
Note: right now it's still acct_lock for everything, but that's
going to change.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/mount.h b/fs/mount.h
index d55297f..0a2d145 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -56,6 +56,7 @@ struct mount {
 	int mnt_group_id;		/* peer group identifier */
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	int mnt_pinned;
+	struct hlist_head mnt_pins;
 	struct path mnt_ex_mountpoint;
 };
 
diff --git a/fs/namespace.c b/fs/namespace.c
index 182bc41..22e530a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -956,7 +956,7 @@ put_again:
 		mnt->mnt_pinned = 0;
 		rcu_read_unlock();
 		unlock_mount_hash();
-		acct_auto_close_mnt(&mnt->mnt);
+		acct_auto_close_mnt(&mnt->mnt_pins);
 		goto put_again;
 	}
 	if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
diff --git a/fs/super.c b/fs/super.c
index d20d5b1..52ed93e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -703,7 +703,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 #endif
 
 	if (flags & MS_RDONLY)
-		acct_auto_close(sb);
+		acct_auto_close(&sb->s_pins);
 	shrink_dcache_sb(sb);
 
 	remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
diff --git a/include/linux/acct.h b/include/linux/acct.h
index 4a5b7cb..65a4f88 100644
--- a/include/linux/acct.h
+++ b/include/linux/acct.h
@@ -24,14 +24,14 @@ struct super_block;
 struct pacct_struct;
 struct pid_namespace;
 extern int acct_parm[]; /* for sysctl */
-extern void acct_auto_close_mnt(struct vfsmount *m);
-extern void acct_auto_close(struct super_block *sb);
+extern void acct_auto_close(struct hlist_head *);
+extern void acct_auto_close_mnt(struct hlist_head *);
 extern void acct_collect(long exitcode, int group_dead);
 extern void acct_process(void);
 extern void acct_exit_ns(struct pid_namespace *);
 #else
-#define acct_auto_close_mnt(x)	do { } while (0)
 #define acct_auto_close(x)	do { } while (0)
+#define acct_auto_close_mnt(x)	do { } while (0)
 #define acct_collect(x,y)	do { } while (0)
 #define acct_process()		do { } while (0)
 #define acct_exit_ns(ns)	do { } while (0)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4b7d57c..17f7087 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1250,6 +1250,7 @@ struct super_block {
 
 	/* AIO completions deferred from interrupt context */
 	struct workqueue_struct *s_dio_done_wq;
+	struct hlist_head s_pins;
 
 	/*
 	 * Keep the lru lists last in the structure so they always sit on their
diff --git a/kernel/acct.c b/kernel/acct.c
index 019f012..21fbb3c 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -59,6 +59,7 @@
 #include <asm/div64.h>
 #include <linux/blkdev.h> /* sector_div */
 #include <linux/pid_namespace.h>
+#include <../fs/mount.h>	/* will go away when we refactor */
 
 /*
  * These constants control the amount of freespace that suspend and
@@ -79,16 +80,16 @@ static void do_acct_process(struct bsd_acct_struct *acct);
 
 struct bsd_acct_struct {
 	long			count;
+	struct hlist_node	s_list;
+	struct hlist_node	m_list;
 	struct mutex		lock;
 	int			active;
 	unsigned long		needcheck;
 	struct file		*file;
 	struct pid_namespace	*ns;
-	struct list_head	list;
 };
 
 static DEFINE_SPINLOCK(acct_lock);
-static LIST_HEAD(acct_list);
 
 /*
  * Check the amount of free space and suspend/resume accordingly.
@@ -133,25 +134,33 @@ static void acct_put(struct bsd_acct_struct *p)
 	spin_unlock(&acct_lock);
 }
 
-static struct bsd_acct_struct *acct_get(struct bsd_acct_struct **p)
+static struct bsd_acct_struct *__acct_get(struct bsd_acct_struct *res)
+{
+	res->count++;
+	spin_unlock(&acct_lock);
+	mutex_lock(&res->lock);
+	if (!res->ns) {
+		mutex_unlock(&res->lock);
+		spin_lock(&acct_lock);
+		if (!--res->count)
+			kfree(res);
+		return NULL;
+	}
+	return res;
+}
+
+static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
 {
 	struct bsd_acct_struct *res;
 	spin_lock(&acct_lock);
 again:
-	res = *p;
-	if (res)
-		res->count++;
-	spin_unlock(&acct_lock);
-	if (res) {
-		mutex_lock(&res->lock);
-		if (!res->ns) {
-			mutex_unlock(&res->lock);
-			spin_lock(&acct_lock);
-			if (!--res->count)
-				kfree(res);
-			goto again;
-		}
+	if (!ns->bacct) {
+		spin_unlock(&acct_lock);
+		return NULL;
 	}
+	res = __acct_get(ns->bacct);
+	if (!res)
+		goto again;
 	return res;
 }
 
@@ -162,7 +171,8 @@ static void acct_kill(struct bsd_acct_struct *acct,
 		struct file *file = acct->file;
 		struct pid_namespace *ns = acct->ns;
 		spin_lock(&acct_lock);
-		list_del(&acct->list);
+		hlist_del(&acct->m_list);
+		hlist_del(&acct->s_list);
 		mnt_unpin(file->f_path.mnt);
 		spin_unlock(&acct_lock);
 		do_acct_process(acct);
@@ -170,8 +180,10 @@ static void acct_kill(struct bsd_acct_struct *acct,
 		spin_lock(&acct_lock);
 		ns->bacct = new;
 		if (new) {
-			mnt_pin(new->file->f_path.mnt);
-			list_add(&new->list, &acct_list);
+			struct vfsmount *m = new->file->f_path.mnt;
+			mnt_pin(m);
+			hlist_add_head(&new->s_list, &m->mnt_sb->s_pins);
+			hlist_add_head(&new->m_list, &real_mount(m)->mnt_pins);
 		}
 		acct->ns = NULL;
 		mutex_unlock(&acct->lock);
@@ -218,14 +230,15 @@ static int acct_on(struct filename *pathname)
 	mutex_init(&acct->lock);
 	mnt = file->f_path.mnt;
 
-	old = acct_get(&ns->bacct);
+	old = acct_get(ns);
 	if (old) {
 		acct_kill(old, acct);
 	} else {
 		spin_lock(&acct_lock);
 		ns->bacct = acct;
 		mnt_pin(mnt);
-		list_add(&acct->list, &acct_list);
+		hlist_add_head(&acct->s_list, &mnt->mnt_sb->s_pins);
+		hlist_add_head(&acct->m_list, &real_mount(mnt)->mnt_pins);
 		spin_unlock(&acct_lock);
 	}
 	mntput(mnt); /* it's pinned, now give up active reference */
@@ -261,79 +274,41 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
 		mutex_unlock(&acct_on_mutex);
 		putname(tmp);
 	} else {
-		acct_kill(acct_get(&task_active_pid_ns(current)->bacct), NULL);
+		acct_kill(acct_get(task_active_pid_ns(current)), NULL);
 	}
 
 	return error;
 }
 
-/**
- * acct_auto_close - turn off a filesystem's accounting if it is on
- * @m: vfsmount being shut down
- *
- * If the accounting is turned on for a file in the subtree pointed to
- * to by m, turn accounting off.  Done when m is about to die.
- */
-void acct_auto_close_mnt(struct vfsmount *m)
+void acct_auto_close_mnt(struct hlist_head *list)
 {
-	struct bsd_acct_struct *acct;
-
-	spin_lock(&acct_lock);
-restart:
-	list_for_each_entry(acct, &acct_list, list)
-		if (acct->file->f_path.mnt == m) {
-			acct->count++;
-			spin_unlock(&acct_lock);
-			mutex_lock(&acct->lock);
-			if (!acct->ns) {
-				mutex_unlock(&acct->lock);
-				spin_lock(&acct_lock);
-				if (!--acct->count)
-					kfree(acct);
-				goto restart;
-			}
-			acct_kill(acct, NULL);
-			spin_lock(&acct_lock);
-			goto restart;
-		}
+	while (1) {
+		spin_lock(&acct_lock);
+		if (!list->first)
+			break;
+		acct_kill(__acct_get(hlist_entry(list->first,
+						 struct bsd_acct_struct,
+						 m_list)), NULL);
+	}
 	spin_unlock(&acct_lock);
 }
 
-/**
- * acct_auto_close - turn off a filesystem's accounting if it is on
- * @sb: super block for the filesystem
- *
- * If the accounting is turned on for a file in the filesystem pointed
- * to by sb, turn accounting off.
- */
-void acct_auto_close(struct super_block *sb)
+void acct_auto_close(struct hlist_head *list)
 {
-	struct bsd_acct_struct *acct;
-
-	spin_lock(&acct_lock);
-restart:
-	list_for_each_entry(acct, &acct_list, list)
-		if (acct->file->f_path.dentry->d_sb == sb) {
-			acct->count++;
-			spin_unlock(&acct_lock);
-			mutex_lock(&acct->lock);
-			if (!acct->ns) {
-				mutex_unlock(&acct->lock);
-				spin_lock(&acct_lock);
-				if (!--acct->count)
-					kfree(acct);
-				goto restart;
-			}
-			acct_kill(acct, NULL);
-			spin_lock(&acct_lock);
-			goto restart;
-		}
+	while (1) {
+		spin_lock(&acct_lock);
+		if (!list->first)
+			break;
+		acct_kill(__acct_get(hlist_entry(list->first,
+						 struct bsd_acct_struct,
+						 s_list)), NULL);
+	}
 	spin_unlock(&acct_lock);
 }
 
 void acct_exit_ns(struct pid_namespace *ns)
 {
-	acct_kill(acct_get(&ns->bacct), NULL);
+	acct_kill(acct_get(ns), NULL);
 }
 
 /*
@@ -602,7 +577,7 @@ void acct_collect(long exitcode, int group_dead)
 static void slow_acct_process(struct pid_namespace *ns)
 {
 	for ( ; ns; ns = ns->parent) {
-		struct bsd_acct_struct *acct = acct_get(&ns->bacct);
+		struct bsd_acct_struct *acct = acct_get(ns);
 		if (acct) {
 			do_acct_process(acct);
 			mutex_unlock(&acct->lock);
-- 
cgit v0.10.2


From 2798d4ce61601808b965253d60624bbf201b51b0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 7 Aug 2014 07:04:28 -0400
Subject: acct: get rid of acct_lock for acct->count

* make acct->count atomic and acct freeing - rcu-delayed.
* instead of grabbing acct_lock around the places where we take a reference,
do that under rcu_read_lock() with atomic_long_inc_not_zero().
* have the new acct locked before making ns->bacct point to it

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/kernel/acct.c b/kernel/acct.c
index 21fbb3c..6fd375f 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -79,9 +79,14 @@ int acct_parm[3] = {4, 2, 30};
 static void do_acct_process(struct bsd_acct_struct *acct);
 
 struct bsd_acct_struct {
-	long			count;
-	struct hlist_node	s_list;
-	struct hlist_node	m_list;
+	atomic_long_t		count;
+	union {
+		struct {
+			struct hlist_node	s_list;
+			struct hlist_node	m_list;
+		};
+		struct rcu_head rcu;
+	};
 	struct mutex		lock;
 	int			active;
 	unsigned long		needcheck;
@@ -89,6 +94,11 @@ struct bsd_acct_struct {
 	struct pid_namespace	*ns;
 };
 
+static void acct_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct bsd_acct_struct, rcu));
+}
+
 static DEFINE_SPINLOCK(acct_lock);
 
 /*
@@ -128,22 +138,22 @@ out:
 
 static void acct_put(struct bsd_acct_struct *p)
 {
-	spin_lock(&acct_lock);
-	if (!--p->count)
-		kfree(p);
-	spin_unlock(&acct_lock);
+	if (atomic_long_dec_and_test(&p->count))
+		call_rcu(&p->rcu, acct_free_rcu);
 }
 
 static struct bsd_acct_struct *__acct_get(struct bsd_acct_struct *res)
 {
-	res->count++;
-	spin_unlock(&acct_lock);
+	if (!atomic_long_inc_not_zero(&res->count)) {
+		rcu_read_unlock();
+		cpu_relax();
+		return NULL;
+	}
+	rcu_read_unlock();
 	mutex_lock(&res->lock);
 	if (!res->ns) {
 		mutex_unlock(&res->lock);
-		spin_lock(&acct_lock);
-		if (!--res->count)
-			kfree(res);
+		acct_put(res);
 		return NULL;
 	}
 	return res;
@@ -152,13 +162,15 @@ static struct bsd_acct_struct *__acct_get(struct bsd_acct_struct *res)
 static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
 {
 	struct bsd_acct_struct *res;
-	spin_lock(&acct_lock);
 again:
-	if (!ns->bacct) {
-		spin_unlock(&acct_lock);
+	smp_rmb();
+	rcu_read_lock();
+	res = ACCESS_ONCE(ns->bacct);
+	if (!res) {
+		rcu_read_unlock();
 		return NULL;
 	}
-	res = __acct_get(ns->bacct);
+	res = __acct_get(res);
 	if (!res)
 		goto again;
 	return res;
@@ -170,26 +182,27 @@ static void acct_kill(struct bsd_acct_struct *acct,
 	if (acct) {
 		struct file *file = acct->file;
 		struct pid_namespace *ns = acct->ns;
+		do_acct_process(acct);
+		mnt_unpin(file->f_path.mnt);
+		filp_close(file, NULL);
 		spin_lock(&acct_lock);
 		hlist_del(&acct->m_list);
 		hlist_del(&acct->s_list);
-		mnt_unpin(file->f_path.mnt);
 		spin_unlock(&acct_lock);
-		do_acct_process(acct);
-		filp_close(file, NULL);
-		spin_lock(&acct_lock);
 		ns->bacct = new;
 		if (new) {
 			struct vfsmount *m = new->file->f_path.mnt;
 			mnt_pin(m);
+			spin_lock(&acct_lock);
 			hlist_add_head(&new->s_list, &m->mnt_sb->s_pins);
 			hlist_add_head(&new->m_list, &real_mount(m)->mnt_pins);
+			spin_unlock(&acct_lock);
+			mutex_unlock(&new->lock);
 		}
 		acct->ns = NULL;
+		atomic_long_dec(&acct->count);
 		mutex_unlock(&acct->lock);
-		if (!(acct->count -= 2))
-			kfree(acct);
-		spin_unlock(&acct_lock);
+		acct_put(acct);
 	}
 }
 
@@ -223,7 +236,7 @@ static int acct_on(struct filename *pathname)
 		return -EIO;
 	}
 
-	acct->count = 1;
+	atomic_long_set(&acct->count, 1);
 	acct->file = file;
 	acct->needcheck = jiffies;
 	acct->ns = ns;
@@ -231,15 +244,17 @@ static int acct_on(struct filename *pathname)
 	mnt = file->f_path.mnt;
 
 	old = acct_get(ns);
+	mutex_lock_nested(&acct->lock, 1);	/* nobody has seen it yet */
 	if (old) {
 		acct_kill(old, acct);
 	} else {
-		spin_lock(&acct_lock);
 		ns->bacct = acct;
+		spin_lock(&acct_lock);
 		mnt_pin(mnt);
 		hlist_add_head(&acct->s_list, &mnt->mnt_sb->s_pins);
 		hlist_add_head(&acct->m_list, &real_mount(mnt)->mnt_pins);
 		spin_unlock(&acct_lock);
+		mutex_unlock(&acct->lock);
 	}
 	mntput(mnt); /* it's pinned, now give up active reference */
 	return 0;
@@ -282,28 +297,32 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
 
 void acct_auto_close_mnt(struct hlist_head *list)
 {
+	rcu_read_lock();
 	while (1) {
-		spin_lock(&acct_lock);
-		if (!list->first)
+		struct hlist_node *p = ACCESS_ONCE(list->first);
+		if (!p)
 			break;
-		acct_kill(__acct_get(hlist_entry(list->first,
+		acct_kill(__acct_get(hlist_entry(p,
 						 struct bsd_acct_struct,
 						 m_list)), NULL);
+		rcu_read_lock();
 	}
-	spin_unlock(&acct_lock);
+	rcu_read_unlock();
 }
 
 void acct_auto_close(struct hlist_head *list)
 {
+	rcu_read_lock();
 	while (1) {
-		spin_lock(&acct_lock);
-		if (!list->first)
+		struct hlist_node *p = ACCESS_ONCE(list->first);
+		if (!p)
 			break;
-		acct_kill(__acct_get(hlist_entry(list->first,
+		acct_kill(__acct_get(hlist_entry(p,
 						 struct bsd_acct_struct,
 						 s_list)), NULL);
+		rcu_read_lock();
 	}
-	spin_unlock(&acct_lock);
+	rcu_read_unlock();
 }
 
 void acct_exit_ns(struct pid_namespace *ns)
-- 
cgit v0.10.2


From 0aec09d049d7e994eba54bad4376dd8f58eab797 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 7 Aug 2014 07:32:06 -0400
Subject: drop ->s_umount around acct_auto_close()

just repeat the frozen check after regaining it, and check that sb
is still alive.  If several threads hit acct_auto_close() at the
same time, acct_auto_close() will survive that just fine.  And we
really don't want to play with writes and closing the file with
->s_umount held exclusive - it's a deadlock country.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/super.c b/fs/super.c
index 52ed93e..a369f89 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -702,12 +702,22 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 		return -EACCES;
 #endif
 
-	if (flags & MS_RDONLY)
-		acct_auto_close(&sb->s_pins);
-	shrink_dcache_sb(sb);
-
 	remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
 
+	if (remount_ro) {
+		if (sb->s_pins.first) {
+			up_write(&sb->s_umount);
+			acct_auto_close(&sb->s_pins);
+			down_write(&sb->s_umount);
+			if (!sb->s_root)
+				return 0;
+			if (sb->s_writers.frozen != SB_UNFROZEN)
+				return -EBUSY;
+			remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
+		}
+	}
+	shrink_dcache_sb(sb);
+
 	/* If we are remounting RDONLY and current sb is read/write,
 	   make sure there are no rw files opened */
 	if (remount_ro) {
-- 
cgit v0.10.2


From 17c0a5aaffa63da6b5c73a31e36616bdcd12d143 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 7 Aug 2014 07:35:19 -0400
Subject: make acct_kill() wait for file closing.

Do actual closing of file via schedule_work().  And use
__fput_sync() there.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/kernel/acct.c b/kernel/acct.c
index 6fd375f..d9ebc96 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -92,6 +92,8 @@ struct bsd_acct_struct {
 	unsigned long		needcheck;
 	struct file		*file;
 	struct pid_namespace	*ns;
+	struct work_struct	work;
+	struct completion	done;
 };
 
 static void acct_free_rcu(struct rcu_head *head)
@@ -176,15 +178,27 @@ again:
 	return res;
 }
 
+static void close_work(struct work_struct *work)
+{
+	struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
+	struct file *file = acct->file;
+	mnt_unpin(file->f_path.mnt);
+	if (file->f_op->flush)
+		file->f_op->flush(file, NULL);
+	__fput_sync(file);
+	complete(&acct->done);
+}
+
 static void acct_kill(struct bsd_acct_struct *acct,
 		      struct bsd_acct_struct *new)
 {
 	if (acct) {
-		struct file *file = acct->file;
 		struct pid_namespace *ns = acct->ns;
 		do_acct_process(acct);
-		mnt_unpin(file->f_path.mnt);
-		filp_close(file, NULL);
+		INIT_WORK(&acct->work, close_work);
+		init_completion(&acct->done);
+		schedule_work(&acct->work);
+		wait_for_completion(&acct->done);
 		spin_lock(&acct_lock);
 		hlist_del(&acct->m_list);
 		hlist_del(&acct->s_list);
-- 
cgit v0.10.2


From 215748e67d893169de9e62c3416e9e035e9e9c5f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 7 Aug 2014 07:51:29 -0400
Subject: acct: move mnt_pin() upwards.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/kernel/acct.c b/kernel/acct.c
index d9ebc96..2d9e04d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -206,7 +206,6 @@ static void acct_kill(struct bsd_acct_struct *acct,
 		ns->bacct = new;
 		if (new) {
 			struct vfsmount *m = new->file->f_path.mnt;
-			mnt_pin(m);
 			spin_lock(&acct_lock);
 			hlist_add_head(&new->s_list, &m->mnt_sb->s_pins);
 			hlist_add_head(&new->m_list, &real_mount(m)->mnt_pins);
@@ -256,6 +255,7 @@ static int acct_on(struct filename *pathname)
 	acct->ns = ns;
 	mutex_init(&acct->lock);
 	mnt = file->f_path.mnt;
+	mnt_pin(mnt);
 
 	old = acct_get(ns);
 	mutex_lock_nested(&acct->lock, 1);	/* nobody has seen it yet */
@@ -264,7 +264,6 @@ static int acct_on(struct filename *pathname)
 	} else {
 		ns->bacct = acct;
 		spin_lock(&acct_lock);
-		mnt_pin(mnt);
 		hlist_add_head(&acct->s_list, &mnt->mnt_sb->s_pins);
 		hlist_add_head(&acct->m_list, &real_mount(mnt)->mnt_pins);
 		spin_unlock(&acct_lock);
-- 
cgit v0.10.2


From 1629d0eb3ead0e0c49e4402049ec7b5b31b81cd7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 7 Aug 2014 08:00:52 -0400
Subject: start carving bsd_acct_struct up

pull generic parts into struct fs_pin.  Eventually we want those
to replace mnt_pin()/mnt_unpin() mess; that stuff will move to
fs/*.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/kernel/acct.c b/kernel/acct.c
index 2d9e04d..afeaaa6 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -78,7 +78,7 @@ int acct_parm[3] = {4, 2, 30};
  */
 static void do_acct_process(struct bsd_acct_struct *acct);
 
-struct bsd_acct_struct {
+struct fs_pin {
 	atomic_long_t		count;
 	union {
 		struct {
@@ -87,6 +87,10 @@ struct bsd_acct_struct {
 		};
 		struct rcu_head rcu;
 	};
+};
+
+struct bsd_acct_struct {
+	struct fs_pin		pin;
 	struct mutex		lock;
 	int			active;
 	unsigned long		needcheck;
@@ -96,9 +100,9 @@ struct bsd_acct_struct {
 	struct completion	done;
 };
 
-static void acct_free_rcu(struct rcu_head *head)
+static void pin_free_rcu(struct rcu_head *head)
 {
-	kfree(container_of(head, struct bsd_acct_struct, rcu));
+	kfree(container_of(head, struct fs_pin, rcu));
 }
 
 static DEFINE_SPINLOCK(acct_lock);
@@ -138,15 +142,15 @@ out:
 	return acct->active;
 }
 
-static void acct_put(struct bsd_acct_struct *p)
+static void pin_put(struct fs_pin *p)
 {
 	if (atomic_long_dec_and_test(&p->count))
-		call_rcu(&p->rcu, acct_free_rcu);
+		call_rcu(&p->rcu, pin_free_rcu);
 }
 
 static struct bsd_acct_struct *__acct_get(struct bsd_acct_struct *res)
 {
-	if (!atomic_long_inc_not_zero(&res->count)) {
+	if (!atomic_long_inc_not_zero(&res->pin.count)) {
 		rcu_read_unlock();
 		cpu_relax();
 		return NULL;
@@ -155,7 +159,7 @@ static struct bsd_acct_struct *__acct_get(struct bsd_acct_struct *res)
 	mutex_lock(&res->lock);
 	if (!res->ns) {
 		mutex_unlock(&res->lock);
-		acct_put(res);
+		pin_put(&res->pin);
 		return NULL;
 	}
 	return res;
@@ -200,22 +204,22 @@ static void acct_kill(struct bsd_acct_struct *acct,
 		schedule_work(&acct->work);
 		wait_for_completion(&acct->done);
 		spin_lock(&acct_lock);
-		hlist_del(&acct->m_list);
-		hlist_del(&acct->s_list);
+		hlist_del(&acct->pin.m_list);
+		hlist_del(&acct->pin.s_list);
 		spin_unlock(&acct_lock);
 		ns->bacct = new;
 		if (new) {
 			struct vfsmount *m = new->file->f_path.mnt;
 			spin_lock(&acct_lock);
-			hlist_add_head(&new->s_list, &m->mnt_sb->s_pins);
-			hlist_add_head(&new->m_list, &real_mount(m)->mnt_pins);
+			hlist_add_head(&new->pin.s_list, &m->mnt_sb->s_pins);
+			hlist_add_head(&new->pin.m_list, &real_mount(m)->mnt_pins);
 			spin_unlock(&acct_lock);
 			mutex_unlock(&new->lock);
 		}
 		acct->ns = NULL;
-		atomic_long_dec(&acct->count);
+		atomic_long_dec(&acct->pin.count);
 		mutex_unlock(&acct->lock);
-		acct_put(acct);
+		pin_put(&acct->pin);
 	}
 }
 
@@ -249,7 +253,7 @@ static int acct_on(struct filename *pathname)
 		return -EIO;
 	}
 
-	atomic_long_set(&acct->count, 1);
+	atomic_long_set(&acct->pin.count, 1);
 	acct->file = file;
 	acct->needcheck = jiffies;
 	acct->ns = ns;
@@ -264,8 +268,8 @@ static int acct_on(struct filename *pathname)
 	} else {
 		ns->bacct = acct;
 		spin_lock(&acct_lock);
-		hlist_add_head(&acct->s_list, &mnt->mnt_sb->s_pins);
-		hlist_add_head(&acct->m_list, &real_mount(mnt)->mnt_pins);
+		hlist_add_head(&acct->pin.s_list, &mnt->mnt_sb->s_pins);
+		hlist_add_head(&acct->pin.m_list, &real_mount(mnt)->mnt_pins);
 		spin_unlock(&acct_lock);
 		mutex_unlock(&acct->lock);
 	}
@@ -317,7 +321,7 @@ void acct_auto_close_mnt(struct hlist_head *list)
 			break;
 		acct_kill(__acct_get(hlist_entry(p,
 						 struct bsd_acct_struct,
-						 m_list)), NULL);
+						 pin.m_list)), NULL);
 		rcu_read_lock();
 	}
 	rcu_read_unlock();
@@ -332,7 +336,7 @@ void acct_auto_close(struct hlist_head *list)
 			break;
 		acct_kill(__acct_get(hlist_entry(p,
 						 struct bsd_acct_struct,
-						 s_list)), NULL);
+						 pin.s_list)), NULL);
 		rcu_read_lock();
 	}
 	rcu_read_unlock();
@@ -613,7 +617,7 @@ static void slow_acct_process(struct pid_namespace *ns)
 		if (acct) {
 			do_acct_process(acct);
 			mutex_unlock(&acct->lock);
-			acct_put(acct);
+			pin_put(&acct->pin);
 		}
 	}
 }
-- 
cgit v0.10.2


From efb170c22867cdc6f770de441bdefecec6712199 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 7 Aug 2014 08:39:04 -0400
Subject: take fs_pin stuff to fs/*

Add a new field to fs_pin - kill(pin).  That's what umount and r/o remount
will be calling for all pins attached to vfsmount and superblock resp.
Called after bumping the refcount, so it won't go away under us.  Dropping
the refcount is responsibility of the instance.  All generic stuff moved to
fs/fs_pin.c; the next step will rip all the knowledge of kernel/acct.c from
fs/super.c and fs/namespace.c.  After that - death to mnt_pin(); it was
intended to be usable as generic mechanism for code that wants to attach
objects to vfsmount, so that they would not make the sucker busy and
would get killed on umount.  Never got it right; it remained acct.c-specific
all along.  Now it's very close to being killable.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/Makefile b/fs/Makefile
index 4030cbf..90c8852 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o splice.o sync.o utimes.o \
-		stack.o fs_struct.o statfs.o
+		stack.o fs_struct.o statfs.o fs_pin.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o block_dev.o direct-io.o mpage.o
diff --git a/fs/fs_pin.c b/fs/fs_pin.c
new file mode 100644
index 0000000..f3ce0b8
--- /dev/null
+++ b/fs/fs_pin.c
@@ -0,0 +1,77 @@
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/fs_pin.h>
+#include "mount.h"
+
+static void pin_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct fs_pin, rcu));
+}
+
+static DEFINE_SPINLOCK(pin_lock);
+
+void pin_put(struct fs_pin *p)
+{
+	if (atomic_long_dec_and_test(&p->count))
+		call_rcu(&p->rcu, pin_free_rcu);
+}
+
+void pin_remove(struct fs_pin *pin)
+{
+	spin_lock(&pin_lock);
+	hlist_del(&pin->m_list);
+	hlist_del(&pin->s_list);
+	spin_unlock(&pin_lock);
+}
+
+void pin_insert(struct fs_pin *pin, struct vfsmount *m)
+{
+	spin_lock(&pin_lock);
+	hlist_add_head(&pin->s_list, &m->mnt_sb->s_pins);
+	hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins);
+	spin_unlock(&pin_lock);
+}
+
+void acct_auto_close_mnt(struct hlist_head *list)
+{
+	while (1) {
+		struct hlist_node *p;
+		struct fs_pin *pin;
+		rcu_read_lock();
+		p = ACCESS_ONCE(list->first);
+		if (!p) {
+			rcu_read_unlock();
+			break;
+		}
+		pin = hlist_entry(p, struct fs_pin, m_list);
+		if (!atomic_long_inc_not_zero(&pin->count)) {
+			rcu_read_unlock();
+			cpu_relax();
+			continue;
+		}
+		rcu_read_unlock();
+		pin->kill(pin);
+	}
+}
+
+void acct_auto_close(struct hlist_head *list)
+{
+	while (1) {
+		struct hlist_node *p;
+		struct fs_pin *pin;
+		rcu_read_lock();
+		p = ACCESS_ONCE(list->first);
+		if (!p) {
+			rcu_read_unlock();
+			break;
+		}
+		pin = hlist_entry(p, struct fs_pin, s_list);
+		if (!atomic_long_inc_not_zero(&pin->count)) {
+			rcu_read_unlock();
+			cpu_relax();
+			continue;
+		}
+		rcu_read_unlock();
+		pin->kill(pin);
+	}
+}
diff --git a/include/linux/acct.h b/include/linux/acct.h
index 65a4f88..1378379 100644
--- a/include/linux/acct.h
+++ b/include/linux/acct.h
@@ -24,18 +24,16 @@ struct super_block;
 struct pacct_struct;
 struct pid_namespace;
 extern int acct_parm[]; /* for sysctl */
-extern void acct_auto_close(struct hlist_head *);
-extern void acct_auto_close_mnt(struct hlist_head *);
 extern void acct_collect(long exitcode, int group_dead);
 extern void acct_process(void);
 extern void acct_exit_ns(struct pid_namespace *);
 #else
-#define acct_auto_close(x)	do { } while (0)
-#define acct_auto_close_mnt(x)	do { } while (0)
 #define acct_collect(x,y)	do { } while (0)
 #define acct_process()		do { } while (0)
 #define acct_exit_ns(ns)	do { } while (0)
 #endif
+extern void acct_auto_close(struct hlist_head *);
+extern void acct_auto_close_mnt(struct hlist_head *);
 
 /*
  * ACCT_VERSION numbers as yet defined:
diff --git a/include/linux/fs_pin.h b/include/linux/fs_pin.h
new file mode 100644
index 0000000..f66525e
--- /dev/null
+++ b/include/linux/fs_pin.h
@@ -0,0 +1,17 @@
+#include <linux/fs.h>
+
+struct fs_pin {
+	atomic_long_t		count;
+	union {
+		struct {
+			struct hlist_node	s_list;
+			struct hlist_node	m_list;
+		};
+		struct rcu_head rcu;
+	};
+	void (*kill)(struct fs_pin *);
+};
+
+void pin_put(struct fs_pin *);
+void pin_remove(struct fs_pin *);
+void pin_insert(struct fs_pin *, struct vfsmount *);
diff --git a/kernel/acct.c b/kernel/acct.c
index afeaaa6..a7993a6 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -59,7 +59,7 @@
 #include <asm/div64.h>
 #include <linux/blkdev.h> /* sector_div */
 #include <linux/pid_namespace.h>
-#include <../fs/mount.h>	/* will go away when we refactor */
+#include <linux/fs_pin.h>
 
 /*
  * These constants control the amount of freespace that suspend and
@@ -78,17 +78,6 @@ int acct_parm[3] = {4, 2, 30};
  */
 static void do_acct_process(struct bsd_acct_struct *acct);
 
-struct fs_pin {
-	atomic_long_t		count;
-	union {
-		struct {
-			struct hlist_node	s_list;
-			struct hlist_node	m_list;
-		};
-		struct rcu_head rcu;
-	};
-};
-
 struct bsd_acct_struct {
 	struct fs_pin		pin;
 	struct mutex		lock;
@@ -100,13 +89,6 @@ struct bsd_acct_struct {
 	struct completion	done;
 };
 
-static void pin_free_rcu(struct rcu_head *head)
-{
-	kfree(container_of(head, struct fs_pin, rcu));
-}
-
-static DEFINE_SPINLOCK(acct_lock);
-
 /*
  * Check the amount of free space and suspend/resume accordingly.
  */
@@ -142,29 +124,6 @@ out:
 	return acct->active;
 }
 
-static void pin_put(struct fs_pin *p)
-{
-	if (atomic_long_dec_and_test(&p->count))
-		call_rcu(&p->rcu, pin_free_rcu);
-}
-
-static struct bsd_acct_struct *__acct_get(struct bsd_acct_struct *res)
-{
-	if (!atomic_long_inc_not_zero(&res->pin.count)) {
-		rcu_read_unlock();
-		cpu_relax();
-		return NULL;
-	}
-	rcu_read_unlock();
-	mutex_lock(&res->lock);
-	if (!res->ns) {
-		mutex_unlock(&res->lock);
-		pin_put(&res->pin);
-		return NULL;
-	}
-	return res;
-}
-
 static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
 {
 	struct bsd_acct_struct *res;
@@ -176,9 +135,18 @@ again:
 		rcu_read_unlock();
 		return NULL;
 	}
-	res = __acct_get(res);
-	if (!res)
+	if (!atomic_long_inc_not_zero(&res->pin.count)) {
+		rcu_read_unlock();
+		cpu_relax();
 		goto again;
+	}
+	rcu_read_unlock();
+	mutex_lock(&res->lock);
+	if (!res->ns) {
+		mutex_unlock(&res->lock);
+		pin_put(&res->pin);
+		goto again;
+	}
 	return res;
 }
 
@@ -203,19 +171,8 @@ static void acct_kill(struct bsd_acct_struct *acct,
 		init_completion(&acct->done);
 		schedule_work(&acct->work);
 		wait_for_completion(&acct->done);
-		spin_lock(&acct_lock);
-		hlist_del(&acct->pin.m_list);
-		hlist_del(&acct->pin.s_list);
-		spin_unlock(&acct_lock);
+		pin_remove(&acct->pin);
 		ns->bacct = new;
-		if (new) {
-			struct vfsmount *m = new->file->f_path.mnt;
-			spin_lock(&acct_lock);
-			hlist_add_head(&new->pin.s_list, &m->mnt_sb->s_pins);
-			hlist_add_head(&new->pin.m_list, &real_mount(m)->mnt_pins);
-			spin_unlock(&acct_lock);
-			mutex_unlock(&new->lock);
-		}
 		acct->ns = NULL;
 		atomic_long_dec(&acct->pin.count);
 		mutex_unlock(&acct->lock);
@@ -223,6 +180,19 @@ static void acct_kill(struct bsd_acct_struct *acct,
 	}
 }
 
+static void acct_pin_kill(struct fs_pin *pin)
+{
+	struct bsd_acct_struct *acct;
+	acct = container_of(pin, struct bsd_acct_struct, pin);
+	mutex_lock(&acct->lock);
+	if (!acct->ns) {
+		mutex_unlock(&acct->lock);
+		pin_put(pin);
+		acct = NULL;
+	}
+	acct_kill(acct, NULL);
+}
+
 static int acct_on(struct filename *pathname)
 {
 	struct file *file;
@@ -254,25 +224,22 @@ static int acct_on(struct filename *pathname)
 	}
 
 	atomic_long_set(&acct->pin.count, 1);
+	acct->pin.kill = acct_pin_kill;
 	acct->file = file;
 	acct->needcheck = jiffies;
 	acct->ns = ns;
 	mutex_init(&acct->lock);
 	mnt = file->f_path.mnt;
 	mnt_pin(mnt);
+	mutex_lock_nested(&acct->lock, 1);	/* nobody has seen it yet */
+	pin_insert(&acct->pin, mnt);
 
 	old = acct_get(ns);
-	mutex_lock_nested(&acct->lock, 1);	/* nobody has seen it yet */
-	if (old) {
+	if (old)
 		acct_kill(old, acct);
-	} else {
+	else
 		ns->bacct = acct;
-		spin_lock(&acct_lock);
-		hlist_add_head(&acct->pin.s_list, &mnt->mnt_sb->s_pins);
-		hlist_add_head(&acct->pin.m_list, &real_mount(mnt)->mnt_pins);
-		spin_unlock(&acct_lock);
-		mutex_unlock(&acct->lock);
-	}
+	mutex_unlock(&acct->lock);
 	mntput(mnt); /* it's pinned, now give up active reference */
 	return 0;
 }
@@ -312,36 +279,6 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
 	return error;
 }
 
-void acct_auto_close_mnt(struct hlist_head *list)
-{
-	rcu_read_lock();
-	while (1) {
-		struct hlist_node *p = ACCESS_ONCE(list->first);
-		if (!p)
-			break;
-		acct_kill(__acct_get(hlist_entry(p,
-						 struct bsd_acct_struct,
-						 pin.m_list)), NULL);
-		rcu_read_lock();
-	}
-	rcu_read_unlock();
-}
-
-void acct_auto_close(struct hlist_head *list)
-{
-	rcu_read_lock();
-	while (1) {
-		struct hlist_node *p = ACCESS_ONCE(list->first);
-		if (!p)
-			break;
-		acct_kill(__acct_get(hlist_entry(p,
-						 struct bsd_acct_struct,
-						 pin.s_list)), NULL);
-		rcu_read_lock();
-	}
-	rcu_read_unlock();
-}
-
 void acct_exit_ns(struct pid_namespace *ns)
 {
 	acct_kill(acct_get(ns), NULL);
-- 
cgit v0.10.2


From 8fa1f1c2bd86007beb4a4845e6087ac4a704dc80 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 21 May 2014 18:22:52 -0400
Subject: make fs/{namespace,super}.c forget about acct.h

These externs belong in fs/internal.h.  Rename (they are not acct-specific
anymore) and move them over there.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/fs_pin.c b/fs/fs_pin.c
index f3ce0b8..9368236 100644
--- a/fs/fs_pin.c
+++ b/fs/fs_pin.c
@@ -1,6 +1,7 @@
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/fs_pin.h>
+#include "internal.h"
 #include "mount.h"
 
 static void pin_free_rcu(struct rcu_head *head)
@@ -32,13 +33,13 @@ void pin_insert(struct fs_pin *pin, struct vfsmount *m)
 	spin_unlock(&pin_lock);
 }
 
-void acct_auto_close_mnt(struct hlist_head *list)
+void mnt_pin_kill(struct mount *m)
 {
 	while (1) {
 		struct hlist_node *p;
 		struct fs_pin *pin;
 		rcu_read_lock();
-		p = ACCESS_ONCE(list->first);
+		p = ACCESS_ONCE(m->mnt_pins.first);
 		if (!p) {
 			rcu_read_unlock();
 			break;
@@ -54,13 +55,13 @@ void acct_auto_close_mnt(struct hlist_head *list)
 	}
 }
 
-void acct_auto_close(struct hlist_head *list)
+void sb_pin_kill(struct super_block *sb)
 {
 	while (1) {
 		struct hlist_node *p;
 		struct fs_pin *pin;
 		rcu_read_lock();
-		p = ACCESS_ONCE(list->first);
+		p = ACCESS_ONCE(sb->s_pins.first);
 		if (!p) {
 			rcu_read_unlock();
 			break;
diff --git a/fs/internal.h b/fs/internal.h
index 9a2edba..e325b4f 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -143,3 +143,9 @@ extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
  * pipe.c
  */
 extern const struct file_operations pipefifo_fops;
+
+/*
+ * fs_pin.c
+ */
+extern void sb_pin_kill(struct super_block *sb);
+extern void mnt_pin_kill(struct mount *m);
diff --git a/fs/namespace.c b/fs/namespace.c
index 22e530a..0e4ce51 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -16,7 +16,6 @@
 #include <linux/namei.h>
 #include <linux/security.h>
 #include <linux/idr.h>
-#include <linux/acct.h>		/* acct_auto_close_mnt */
 #include <linux/init.h>		/* init_rootfs */
 #include <linux/fs_struct.h>	/* get_fs_root et.al. */
 #include <linux/fsnotify.h>	/* fsnotify_vfsmount_delete */
@@ -956,7 +955,7 @@ put_again:
 		mnt->mnt_pinned = 0;
 		rcu_read_unlock();
 		unlock_mount_hash();
-		acct_auto_close_mnt(&mnt->mnt_pins);
+		mnt_pin_kill(mnt);
 		goto put_again;
 	}
 	if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
diff --git a/fs/super.c b/fs/super.c
index a369f89..a371ce6 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -22,7 +22,6 @@
 
 #include <linux/export.h>
 #include <linux/slab.h>
-#include <linux/acct.h>
 #include <linux/blkdev.h>
 #include <linux/mount.h>
 #include <linux/security.h>
@@ -707,7 +706,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	if (remount_ro) {
 		if (sb->s_pins.first) {
 			up_write(&sb->s_umount);
-			acct_auto_close(&sb->s_pins);
+			sb_pin_kill(sb);
 			down_write(&sb->s_umount);
 			if (!sb->s_root)
 				return 0;
diff --git a/include/linux/acct.h b/include/linux/acct.h
index 1378379..dccc2d4 100644
--- a/include/linux/acct.h
+++ b/include/linux/acct.h
@@ -32,8 +32,6 @@ extern void acct_exit_ns(struct pid_namespace *);
 #define acct_process()		do { } while (0)
 #define acct_exit_ns(ns)	do { } while (0)
 #endif
-extern void acct_auto_close(struct hlist_head *);
-extern void acct_auto_close_mnt(struct hlist_head *);
 
 /*
  * ACCT_VERSION numbers as yet defined:
-- 
cgit v0.10.2


From 3064c3563ba4c23e2c7a47254ec056ed9ba0098a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 7 Aug 2014 09:12:31 -0400
Subject: death to mnt_pinned

Rather than playing silly buggers with vfsmount refcounts, just have
acct_on() ask fs/namespace.c for internal clone of file->f_path.mnt
and replace it with said clone.  Then attach the pin to original
vfsmount.  Voila - the clone will be alive until the file gets closed,
making sure that underlying superblock remains active, etc., and
we can drop the original vfsmount, so that it's not kept busy.
If the file lives until the final mntput of the original vfsmount,
we'll notice that there's an fs_pin (one in bsd_acct_struct that
holds that file) and mnt_pin_kill() will take it out.  Since
->kill() is synchronous, we won't proceed past that point until
these files are closed (and private clones of our vfsmount are
gone), so we get the same ordering warranties we used to get.

mnt_pin()/mnt_unpin()/->mnt_pinned is gone now, and good riddance -
it never became usable outside of kernel/acct.c (and racy wrt
umount even there).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/mount.h b/fs/mount.h
index 0a2d145..6740a62 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -55,7 +55,6 @@ struct mount {
 	int mnt_id;			/* mount identifier */
 	int mnt_group_id;		/* peer group identifier */
 	int mnt_expiry_mark;		/* true if marked for expiry */
-	int mnt_pinned;
 	struct hlist_head mnt_pins;
 	struct path mnt_ex_mountpoint;
 };
diff --git a/fs/namespace.c b/fs/namespace.c
index 0e4ce51..65af9d0 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -937,7 +937,6 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 
 static void mntput_no_expire(struct mount *mnt)
 {
-put_again:
 	rcu_read_lock();
 	mnt_add_count(mnt, -1);
 	if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
@@ -950,14 +949,6 @@ put_again:
 		unlock_mount_hash();
 		return;
 	}
-	if (unlikely(mnt->mnt_pinned)) {
-		mnt_add_count(mnt, mnt->mnt_pinned + 1);
-		mnt->mnt_pinned = 0;
-		rcu_read_unlock();
-		unlock_mount_hash();
-		mnt_pin_kill(mnt);
-		goto put_again;
-	}
 	if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
 		rcu_read_unlock();
 		unlock_mount_hash();
@@ -980,6 +971,8 @@ put_again:
 	 * so mnt_get_writers() below is safe.
 	 */
 	WARN_ON(mnt_get_writers(mnt));
+	if (unlikely(mnt->mnt_pins.first))
+		mnt_pin_kill(mnt);
 	fsnotify_vfsmount_delete(&mnt->mnt);
 	dput(mnt->mnt.mnt_root);
 	deactivate_super(mnt->mnt.mnt_sb);
@@ -1007,25 +1000,15 @@ struct vfsmount *mntget(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL(mntget);
 
-void mnt_pin(struct vfsmount *mnt)
+struct vfsmount *mnt_clone_internal(struct path *path)
 {
-	lock_mount_hash();
-	real_mount(mnt)->mnt_pinned++;
-	unlock_mount_hash();
-}
-EXPORT_SYMBOL(mnt_pin);
-
-void mnt_unpin(struct vfsmount *m)
-{
-	struct mount *mnt = real_mount(m);
-	lock_mount_hash();
-	if (mnt->mnt_pinned) {
-		mnt_add_count(mnt, 1);
-		mnt->mnt_pinned--;
-	}
-	unlock_mount_hash();
+	struct mount *p;
+	p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
+	if (IS_ERR(p))
+		return ERR_CAST(p);
+	p->mnt.mnt_flags |= MNT_INTERNAL;
+	return &p->mnt;
 }
-EXPORT_SYMBOL(mnt_unpin);
 
 static inline void mangle(struct seq_file *m, const char *s)
 {
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 839bac2..864b120 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -62,6 +62,7 @@ struct vfsmount {
 };
 
 struct file; /* forward dec */
+struct path;
 
 extern int mnt_want_write(struct vfsmount *mnt);
 extern int mnt_want_write_file(struct file *file);
@@ -70,8 +71,7 @@ extern void mnt_drop_write(struct vfsmount *mnt);
 extern void mnt_drop_write_file(struct file *file);
 extern void mntput(struct vfsmount *mnt);
 extern struct vfsmount *mntget(struct vfsmount *mnt);
-extern void mnt_pin(struct vfsmount *mnt);
-extern void mnt_unpin(struct vfsmount *mnt);
+extern struct vfsmount *mnt_clone_internal(struct path *path);
 extern int __mnt_is_readonly(struct vfsmount *mnt);
 
 struct file_system_type;
diff --git a/kernel/acct.c b/kernel/acct.c
index a7993a6..2e6cf818 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -154,7 +154,6 @@ static void close_work(struct work_struct *work)
 {
 	struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
 	struct file *file = acct->file;
-	mnt_unpin(file->f_path.mnt);
 	if (file->f_op->flush)
 		file->f_op->flush(file, NULL);
 	__fput_sync(file);
@@ -196,9 +195,10 @@ static void acct_pin_kill(struct fs_pin *pin)
 static int acct_on(struct filename *pathname)
 {
 	struct file *file;
-	struct vfsmount *mnt;
+	struct vfsmount *mnt, *internal;
 	struct pid_namespace *ns = task_active_pid_ns(current);
 	struct bsd_acct_struct *acct, *old;
+	int err;
 
 	acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
 	if (!acct)
@@ -222,6 +222,21 @@ static int acct_on(struct filename *pathname)
 		filp_close(file, NULL);
 		return -EIO;
 	}
+	internal = mnt_clone_internal(&file->f_path);
+	if (IS_ERR(internal)) {
+		kfree(acct);
+		filp_close(file, NULL);
+		return PTR_ERR(internal);
+	}
+	err = mnt_want_write(internal);
+	if (err) {
+		mntput(internal);
+		kfree(acct);
+		filp_close(file, NULL);
+		return err;
+	}
+	mnt = file->f_path.mnt;
+	file->f_path.mnt = internal;
 
 	atomic_long_set(&acct->pin.count, 1);
 	acct->pin.kill = acct_pin_kill;
@@ -229,8 +244,6 @@ static int acct_on(struct filename *pathname)
 	acct->needcheck = jiffies;
 	acct->ns = ns;
 	mutex_init(&acct->lock);
-	mnt = file->f_path.mnt;
-	mnt_pin(mnt);
 	mutex_lock_nested(&acct->lock, 1);	/* nobody has seen it yet */
 	pin_insert(&acct->pin, mnt);
 
@@ -240,7 +253,8 @@ static int acct_on(struct filename *pathname)
 	else
 		ns->bacct = acct;
 	mutex_unlock(&acct->lock);
-	mntput(mnt); /* it's pinned, now give up active reference */
+	mnt_drop_write(mnt);
+	mntput(mnt);
 	return 0;
 }
 
-- 
cgit v0.10.2


From 2577d92ebd28dd9b3dacdfad6dcd81be0d21bbdf Mon Sep 17 00:00:00 2001
From: Ionut Alexa <ionut.m.alexa@gmail.com>
Date: Thu, 31 Jul 2014 09:28:36 +1000
Subject: kernel/acct.c: fix coding style warnings and errors

Signed-off-by: Ionut Alexa <ionut.m.alexa@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/kernel/acct.c b/kernel/acct.c
index 2e6cf818..b4c667d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -108,14 +108,14 @@ static int check_free_space(struct bsd_acct_struct *acct)
 		do_div(suspend, 100);
 		if (sbuf.f_bavail <= suspend) {
 			acct->active = 0;
-			printk(KERN_INFO "Process accounting paused\n");
+			pr_info("Process accounting paused\n");
 		}
 	} else {
 		u64 resume = sbuf.f_blocks * RESUME;
 		do_div(resume, 100);
 		if (sbuf.f_bavail >= resume) {
 			acct->active = 1;
-			printk(KERN_INFO "Process accounting resumed\n");
+			pr_info("Process accounting resumed\n");
 		}
 	}
 
@@ -280,6 +280,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
 
 	if (name) {
 		struct filename *tmp = getname(name);
+
 		if (IS_ERR(tmp))
 			return PTR_ERR(tmp);
 		mutex_lock(&acct_on_mutex);
@@ -337,7 +338,7 @@ static comp_t encode_comp_t(unsigned long value)
 	return exp;
 }
 
-#if ACCT_VERSION==1 || ACCT_VERSION==2
+#if ACCT_VERSION == 1 || ACCT_VERSION == 2
 /*
  * encode an u64 into a comp2_t (24 bits)
  *
@@ -350,7 +351,7 @@ static comp_t encode_comp_t(unsigned long value)
 #define MANTSIZE2       20                      /* 20 bit mantissa. */
 #define EXPSIZE2        5                       /* 5 bit base 2 exponent. */
 #define MAXFRACT2       ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
-#define MAXEXP2         ((1 <<EXPSIZE2) - 1)    /* Maximum exponent. */
+#define MAXEXP2         ((1 << EXPSIZE2) - 1)    /* Maximum exponent. */
 
 static comp2_t encode_comp2_t(u64 value)
 {
@@ -381,7 +382,7 @@ static comp2_t encode_comp2_t(u64 value)
 }
 #endif
 
-#if ACCT_VERSION==3
+#if ACCT_VERSION == 3
 /*
  * encode an u64 into a 32 bit IEEE float
  */
@@ -390,8 +391,9 @@ static u32 encode_float(u64 value)
 	unsigned exp = 190;
 	unsigned u;
 
-	if (value==0) return 0;
-	while ((s64)value > 0){
+	if (value == 0)
+		return 0;
+	while ((s64)value > 0) {
 		value <<= 1;
 		exp--;
 	}
@@ -429,16 +431,17 @@ static void fill_ac(acct_t *ac)
 	run_time -= current->group_leader->start_time;
 	/* convert nsec -> AHZ */
 	elapsed = nsec_to_AHZ(run_time);
-#if ACCT_VERSION==3
+#if ACCT_VERSION == 3
 	ac->ac_etime = encode_float(elapsed);
 #else
 	ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
-	                       (unsigned long) elapsed : (unsigned long) -1l);
+				(unsigned long) elapsed : (unsigned long) -1l);
 #endif
-#if ACCT_VERSION==1 || ACCT_VERSION==2
+#if ACCT_VERSION == 1 || ACCT_VERSION == 2
 	{
 		/* new enlarged etime field */
 		comp2_t etime = encode_comp2_t(elapsed);
+
 		ac->ac_etime_hi = etime >> 16;
 		ac->ac_etime_lo = (u16) etime;
 	}
@@ -491,12 +494,12 @@ static void do_acct_process(struct bsd_acct_struct *acct)
 	/* we really need to bite the bullet and change layout */
 	ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
 	ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
-#if ACCT_VERSION==1 || ACCT_VERSION==2
+#if ACCT_VERSION == 1 || ACCT_VERSION == 2
 	/* backward-compatible 16 bit fields */
 	ac.ac_uid16 = ac.ac_uid;
 	ac.ac_gid16 = ac.ac_gid;
 #endif
-#if ACCT_VERSION==3
+#if ACCT_VERSION == 3
 	ac.ac_pid = task_tgid_nr_ns(current, ns);
 	rcu_read_lock();
 	ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
@@ -530,6 +533,7 @@ void acct_collect(long exitcode, int group_dead)
 
 	if (group_dead && current->mm) {
 		struct vm_area_struct *vma;
+
 		down_read(&current->mm->mmap_sem);
 		vma = current->mm->mmap;
 		while (vma) {
-- 
cgit v0.10.2


From 7177a9c4b509eb357cc450256bc3cf39f1a1e639 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Wed, 23 Jul 2014 15:15:30 +0200
Subject: fs: call rename2 if exists

Christoph Hellwig suggests:

1) make vfs_rename call ->rename2 if it exists instead of ->rename
2) switch all filesystems that you're adding NOREPLACE support for to
   use ->rename2
3) see how many ->rename instances we'll have left after a few
   iterations of 2.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 3520ab8..b147a67 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3455,7 +3455,6 @@ const struct inode_operations ext4_dir_inode_operations = {
 	.rmdir		= ext4_rmdir,
 	.mknod		= ext4_mknod,
 	.tmpfile	= ext4_tmpfile,
-	.rename		= ext4_rename,
 	.rename2	= ext4_rename2,
 	.setattr	= ext4_setattr,
 	.setxattr	= generic_setxattr,
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 0c60482..de1d84a 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -845,12 +845,6 @@ static int fuse_rename2(struct inode *olddir, struct dentry *oldent,
 	return err;
 }
 
-static int fuse_rename(struct inode *olddir, struct dentry *oldent,
-		       struct inode *newdir, struct dentry *newent)
-{
-	return fuse_rename2(olddir, oldent, newdir, newent, 0);
-}
-
 static int fuse_link(struct dentry *entry, struct inode *newdir,
 		     struct dentry *newent)
 {
@@ -2024,7 +2018,6 @@ static const struct inode_operations fuse_dir_inode_operations = {
 	.symlink	= fuse_symlink,
 	.unlink		= fuse_unlink,
 	.rmdir		= fuse_rmdir,
-	.rename		= fuse_rename,
 	.rename2	= fuse_rename2,
 	.link		= fuse_link,
 	.setattr	= fuse_setattr,
diff --git a/fs/namei.c b/fs/namei.c
index 9eb787e..0ff23ce 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4075,7 +4075,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (error)
 		return error;
 
-	if (!old_dir->i_op->rename)
+	if (!old_dir->i_op->rename && !old_dir->i_op->rename2)
 		return -EPERM;
 
 	if (flags && !old_dir->i_op->rename2)
@@ -4134,10 +4134,11 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		if (error)
 			goto out;
 	}
-	if (!flags) {
+	if (!old_dir->i_op->rename2) {
 		error = old_dir->i_op->rename(old_dir, old_dentry,
 					      new_dir, new_dentry);
 	} else {
+		WARN_ON(old_dir->i_op->rename != NULL);
 		error = old_dir->i_op->rename2(old_dir, old_dentry,
 					       new_dir, new_dentry, flags);
 	}
-- 
cgit v0.10.2


From a0dbc56610b3e157f19241404e738744b7e7877e Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Wed, 23 Jul 2014 15:15:31 +0200
Subject: bad_inode: add ->rename2()

so we return -EIO instead of -EINVAL.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 7c93953..afd2b44 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -218,8 +218,9 @@ static int bad_inode_mknod (struct inode *dir, struct dentry *dentry,
 	return -EIO;
 }
 
-static int bad_inode_rename (struct inode *old_dir, struct dentry *old_dentry,
-		struct inode *new_dir, struct dentry *new_dentry)
+static int bad_inode_rename2(struct inode *old_dir, struct dentry *old_dentry,
+			     struct inode *new_dir, struct dentry *new_dentry,
+			     unsigned int flags)
 {
 	return -EIO;
 }
@@ -279,7 +280,7 @@ static const struct inode_operations bad_inode_ops =
 	.mkdir		= bad_inode_mkdir,
 	.rmdir		= bad_inode_rmdir,
 	.mknod		= bad_inode_mknod,
-	.rename		= bad_inode_rename,
+	.rename2	= bad_inode_rename2,
 	.readlink	= bad_inode_readlink,
 	/* follow_link must be no-op, otherwise unmounting this inode
 	   won't work */
-- 
cgit v0.10.2


From 80ace85c915d0f41016f82917218997b72431258 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Wed, 23 Jul 2014 15:15:32 +0200
Subject: btrfs: add RENAME_NOREPLACE

RENAME_NOREPLACE is trivial to implement for most filesystems: switch over
to ->rename2() and check for the supported flags.  The rest is done by the
VFS.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Chris Mason <clm@fb.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3668048..3183742 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8476,6 +8476,16 @@ out_notrans:
 	return ret;
 }
 
+static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
+			 struct inode *new_dir, struct dentry *new_dentry,
+			 unsigned int flags)
+{
+	if (flags & ~RENAME_NOREPLACE)
+		return -EINVAL;
+
+	return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
 static void btrfs_run_delalloc_work(struct btrfs_work *work)
 {
 	struct btrfs_delalloc_work *delalloc_work;
@@ -9019,7 +9029,7 @@ static const struct inode_operations btrfs_dir_inode_operations = {
 	.link		= btrfs_link,
 	.mkdir		= btrfs_mkdir,
 	.rmdir		= btrfs_rmdir,
-	.rename		= btrfs_rename,
+	.rename2	= btrfs_rename2,
 	.symlink	= btrfs_symlink,
 	.setattr	= btrfs_setattr,
 	.mknod		= btrfs_mknod,
-- 
cgit v0.10.2


From 3b69ff51d087d265aa4af3a532fc4f20bf33e718 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Wed, 23 Jul 2014 15:15:33 +0200
Subject: shmem: support RENAME_NOREPLACE

Implement ->rename2 instead of ->rename.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Acked-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/mm/shmem.c b/mm/shmem.c
index af68b15..fe95918 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2054,11 +2054,14 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
  * it exists so that the VFS layer correctly free's it when it
  * gets overwritten.
  */
-static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
+static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags)
 {
 	struct inode *inode = old_dentry->d_inode;
 	int they_are_dirs = S_ISDIR(inode->i_mode);
 
+	if (flags & ~(RENAME_NOREPLACE))
+		return -EINVAL;
+
 	if (!simple_empty(new_dentry))
 		return -ENOTEMPTY;
 
@@ -2741,7 +2744,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
 	.mkdir		= shmem_mkdir,
 	.rmdir		= shmem_rmdir,
 	.mknod		= shmem_mknod,
-	.rename		= shmem_rename,
+	.rename2	= shmem_rename2,
 	.tmpfile	= shmem_tmpfile,
 #endif
 #ifdef CONFIG_TMPFS_XATTR
-- 
cgit v0.10.2


From 37456771c58be10dd813fb4510035d0d67a969aa Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Wed, 23 Jul 2014 15:15:34 +0200
Subject: shmem: support RENAME_EXCHANGE

This is really simple in tmpfs since the VFS already takes care of
shuffling the dentries.  Just adjust nlink on parent directories and touch
c & mtimes.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Acked-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/mm/shmem.c b/mm/shmem.c
index fe95918..0f01800 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2048,6 +2048,28 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
 	return shmem_unlink(dir, dentry);
 }
 
+static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
+{
+	bool old_is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
+	bool new_is_dir = S_ISDIR(new_dentry->d_inode->i_mode);
+
+	if (old_dir != new_dir && old_is_dir != new_is_dir) {
+		if (old_is_dir) {
+			drop_nlink(old_dir);
+			inc_nlink(new_dir);
+		} else {
+			drop_nlink(new_dir);
+			inc_nlink(old_dir);
+		}
+	}
+	old_dir->i_ctime = old_dir->i_mtime =
+	new_dir->i_ctime = new_dir->i_mtime =
+	old_dentry->d_inode->i_ctime =
+	new_dentry->d_inode->i_ctime = CURRENT_TIME;
+
+	return 0;
+}
+
 /*
  * The VFS layer already does all the dentry stuff for rename,
  * we just have to decrement the usage count for the target if
@@ -2059,9 +2081,12 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc
 	struct inode *inode = old_dentry->d_inode;
 	int they_are_dirs = S_ISDIR(inode->i_mode);
 
-	if (flags & ~(RENAME_NOREPLACE))
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
 		return -EINVAL;
 
+	if (flags & RENAME_EXCHANGE)
+		return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry);
+
 	if (!simple_empty(new_dentry))
 		return -ENOTEMPTY;
 
-- 
cgit v0.10.2


From 9a423bb6e3577bb372942edfb5d9d26632741d43 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Wed, 23 Jul 2014 15:15:35 +0200
Subject: hostfs: support rename flags

Support RENAME_NOREPLACE and RENAME_EXCHANGE flags on hostfs if the
underlying filesystem supports it.

Since renameat2(2) is not yet in any libc, use syscall(2) to invoke the
renameat2 syscall.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Richard Weinberger <richard@nod.at>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 9c88da0..4fcd40d 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -89,6 +89,7 @@ extern int do_mknod(const char *file, int mode, unsigned int major,
 extern int link_file(const char *from, const char *to);
 extern int hostfs_do_readlink(char *file, char *buf, int size);
 extern int rename_file(char *from, char *to);
+extern int rename2_file(char *from, char *to, unsigned int flags);
 extern int do_statfs(char *root, long *bsize_out, long long *blocks_out,
 		     long long *bfree_out, long long *bavail_out,
 		     long long *files_out, long long *ffree_out,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index bb529f3..fd62cae 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -741,21 +741,31 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
 	return err;
 }
 
-static int hostfs_rename(struct inode *from_ino, struct dentry *from,
-			 struct inode *to_ino, struct dentry *to)
+static int hostfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
+			  struct inode *new_dir, struct dentry *new_dentry,
+			  unsigned int flags)
 {
-	char *from_name, *to_name;
+	char *old_name, *new_name;
 	int err;
 
-	if ((from_name = dentry_name(from)) == NULL)
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+		return -EINVAL;
+
+	old_name = dentry_name(old_dentry);
+	if (old_name == NULL)
 		return -ENOMEM;
-	if ((to_name = dentry_name(to)) == NULL) {
-		__putname(from_name);
+	new_name = dentry_name(new_dentry);
+	if (new_name == NULL) {
+		__putname(old_name);
 		return -ENOMEM;
 	}
-	err = rename_file(from_name, to_name);
-	__putname(from_name);
-	__putname(to_name);
+	if (!flags)
+		err = rename_file(old_name, new_name);
+	else
+		err = rename2_file(old_name, new_name, flags);
+
+	__putname(old_name);
+	__putname(new_name);
 	return err;
 }
 
@@ -867,7 +877,7 @@ static const struct inode_operations hostfs_dir_iops = {
 	.mkdir		= hostfs_mkdir,
 	.rmdir		= hostfs_rmdir,
 	.mknod		= hostfs_mknod,
-	.rename		= hostfs_rename,
+	.rename2	= hostfs_rename2,
 	.permission	= hostfs_permission,
 	.setattr	= hostfs_setattr,
 };
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index 67838f3..9765dab 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -14,6 +14,7 @@
 #include <sys/time.h>
 #include <sys/types.h>
 #include <sys/vfs.h>
+#include <sys/syscall.h>
 #include "hostfs.h"
 #include <utime.h>
 
@@ -360,6 +361,33 @@ int rename_file(char *from, char *to)
 	return 0;
 }
 
+int rename2_file(char *from, char *to, unsigned int flags)
+{
+	int err;
+
+#ifndef SYS_renameat2
+#  ifdef __x86_64__
+#    define SYS_renameat2 316
+#  endif
+#  ifdef __i386__
+#    define SYS_renameat2 353
+#  endif
+#endif
+
+#ifdef SYS_renameat2
+	err = syscall(SYS_renameat2, AT_FDCWD, from, AT_FDCWD, to, flags);
+	if (err < 0) {
+		if (errno != ENOSYS)
+			return -errno;
+		else
+			return -EINVAL;
+	}
+	return 0;
+#else
+	return -EINVAL;
+#endif
+}
+
 int do_statfs(char *root, long *bsize_out, long long *blocks_out,
 	      long long *bfree_out, long long *bavail_out,
 	      long long *files_out, long long *ffree_out,
-- 
cgit v0.10.2


From 7c33d5972ce382bcc506d16235f1e9b7d22cbef8 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Wed, 23 Jul 2014 15:15:36 +0200
Subject: cifs: support RENAME_NOREPLACE

This flag gives CIFS the ability to support its native rename semantics.

Implementation is simple: just bail out before trying to hack around the
noreplace semantics.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Steve French <smfrench@gmail.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8883980..ac4f260 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -848,7 +848,7 @@ const struct inode_operations cifs_dir_inode_ops = {
 	.link = cifs_hardlink,
 	.mkdir = cifs_mkdir,
 	.rmdir = cifs_rmdir,
-	.rename = cifs_rename,
+	.rename2 = cifs_rename2,
 	.permission = cifs_permission,
 /*	revalidate:cifs_revalidate,   */
 	.setattr = cifs_setattr,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 70f178a..ed58c88 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -68,8 +68,8 @@ extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *);
 extern int cifs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
 extern int cifs_mkdir(struct inode *, struct dentry *, umode_t);
 extern int cifs_rmdir(struct inode *, struct dentry *);
-extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
-		       struct dentry *);
+extern int cifs_rename2(struct inode *, struct dentry *, struct inode *,
+			struct dentry *, unsigned int);
 extern int cifs_revalidate_file_attr(struct file *filp);
 extern int cifs_revalidate_dentry_attr(struct dentry *);
 extern int cifs_revalidate_file(struct file *filp);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index a174605..bec0a08 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1627,8 +1627,9 @@ do_rename_exit:
 }
 
 int
-cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
-	    struct inode *target_dir, struct dentry *target_dentry)
+cifs_rename2(struct inode *source_dir, struct dentry *source_dentry,
+	     struct inode *target_dir, struct dentry *target_dentry,
+	     unsigned int flags)
 {
 	char *from_name = NULL;
 	char *to_name = NULL;
@@ -1640,6 +1641,9 @@ cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
 	unsigned int xid;
 	int rc, tmprc;
 
+	if (flags & ~RENAME_NOREPLACE)
+		return -EINVAL;
+
 	cifs_sb = CIFS_SB(source_dir->i_sb);
 	tlink = cifs_sb_tlink(cifs_sb);
 	if (IS_ERR(tlink))
@@ -1667,6 +1671,12 @@ cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
 	rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry,
 			    to_name);
 
+	/*
+	 * No-replace is the natural behavior for CIFS, so skip unlink hacks.
+	 */
+	if (flags & RENAME_NOREPLACE)
+		goto cifs_rename_exit;
+
 	if (rc == -EEXIST && tcon->unix_ext) {
 		/*
 		 * Are src and dst hardlinks of same inode? We can only tell
-- 
cgit v0.10.2


From b8faf035ea9d0011b04856a8198e2e212d93346a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 4 Aug 2014 17:06:29 +1000
Subject: VFS: allow ->d_manage() to declare -EISDIR in rcu_walk mode.

In REF-walk mode, ->d_manage can return -EISDIR to indicate
that the dentry is not really a mount trap (or even a mount point)
and that any mounts or any DCACHE_NEED_AUTOMOUNT flag should be
ignored.

RCU-walk mode doesn't currently support this, so if there is a dentry
with DCACHE_NEED_AUTOMOUNT set but which shouldn't be a mount-trap,
lookup_fast() will always drop in REF-walk mode.

With this patch, an -EISDIR from ->d_manage will always cause mounts
and automounts to be ignored, both in REF-walk and RCU-walk.

Bug-fixed-by: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Ian Kent <raven@themaw.net>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index a1d0d7a..61d65cc 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -1053,7 +1053,8 @@ struct dentry_operations {
 	If the 'rcu_walk' parameter is true, then the caller is doing a
 	pathwalk in RCU-walk mode.  Sleeping is not permitted in this mode,
 	and the caller can be asked to leave it and call again by returning
-	-ECHILD.
+	-ECHILD.  -EISDIR may also be returned to tell pathwalk to
+	ignore d_automount or any mounts.
 
 	This function is only used if DCACHE_MANAGE_TRANSIT is set on the
 	dentry being transited from.
diff --git a/fs/namei.c b/fs/namei.c
index 0ff23ce..8a217c4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1091,10 +1091,10 @@ int follow_down_one(struct path *path)
 }
 EXPORT_SYMBOL(follow_down_one);
 
-static inline bool managed_dentry_might_block(struct dentry *dentry)
+static inline int managed_dentry_rcu(struct dentry *dentry)
 {
-	return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
-		dentry->d_op->d_manage(dentry, true) < 0);
+	return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
+		dentry->d_op->d_manage(dentry, true) : 0;
 }
 
 /*
@@ -1110,11 +1110,18 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 		 * Don't forget we might have a non-mountpoint managed dentry
 		 * that wants to block transit.
 		 */
-		if (unlikely(managed_dentry_might_block(path->dentry)))
+		switch (managed_dentry_rcu(path->dentry)) {
+		case -ECHILD:
+		default:
 			return false;
+		case -EISDIR:
+			return true;
+		case 0:
+			break;
+		}
 
 		if (!d_mountpoint(path->dentry))
-			return true;
+			return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
 
 		mounted = __lookup_mnt(path->mnt, path->dentry);
 		if (!mounted)
@@ -1130,7 +1137,8 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 		 */
 		*inode = path->dentry->d_inode;
 	}
-	return read_seqretry(&mount_lock, nd->m_seq);
+	return read_seqretry(&mount_lock, nd->m_seq) &&
+		!(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
 }
 
 static int follow_dotdot_rcu(struct nameidata *nd)
@@ -1402,11 +1410,8 @@ static int lookup_fast(struct nameidata *nd,
 		}
 		path->mnt = mnt;
 		path->dentry = dentry;
-		if (unlikely(!__follow_mount_rcu(nd, path, inode)))
-			goto unlazy;
-		if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
-			goto unlazy;
-		return 0;
+		if (likely(__follow_mount_rcu(nd, path, inode)))
+			return 0;
 unlazy:
 		if (unlazy_walk(nd, dentry))
 			return -ECHILD;
-- 
cgit v0.10.2


From d03b29a271eb1d6de5af0f46cf0e7487e9e9284b Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 17 Feb 2014 16:52:33 -0500
Subject: namei: trivial fix to vfs_rename_dir comment

Looks like the directory loop check is actually done in renameat?
Whatever, leave this out rather than trying to keep it up to date with
the code.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namei.c b/fs/namei.c
index 8a217c4..a996bb4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4024,7 +4024,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
  * The worst of all namespace operations - renaming directory. "Perverted"
  * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
  * Problems:
- *	a) we can get into loop creation. Check is done in is_subdir().
+ *	a) we can get into loop creation.
  *	b) race potential - two innocent renames can create a loop together.
  *	   That's where 4.4 screws up. Current fix: serialization on
  *	   sb->s_vfs_rename_mutex. We might be more accurate, but that's another
-- 
cgit v0.10.2


From 3f70bd51cb4405dc5cf8624292ffa474679fc9c7 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 18 Feb 2014 14:11:26 -0500
Subject: dcache: move d_splice_alias

Just a trivial move to locate it near (similar) d_materialise_unique
code and save some forward references in a following patch.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/dcache.c b/fs/dcache.c
index 06f6585..8bdae36 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1854,58 +1854,6 @@ struct dentry *d_obtain_alias(struct inode *inode)
 EXPORT_SYMBOL(d_obtain_alias);
 
 /**
- * d_splice_alias - splice a disconnected dentry into the tree if one exists
- * @inode:  the inode which may have a disconnected dentry
- * @dentry: a negative dentry which we want to point to the inode.
- *
- * If inode is a directory and has a 'disconnected' dentry (i.e. IS_ROOT and
- * DCACHE_DISCONNECTED), then d_move that in place of the given dentry
- * and return it, else simply d_add the inode to the dentry and return NULL.
- *
- * This is needed in the lookup routine of any filesystem that is exportable
- * (via knfsd) so that we can build dcache paths to directories effectively.
- *
- * If a dentry was found and moved, then it is returned.  Otherwise NULL
- * is returned.  This matches the expected return value of ->lookup.
- *
- * Cluster filesystems may call this function with a negative, hashed dentry.
- * In that case, we know that the inode will be a regular file, and also this
- * will only occur during atomic_open. So we need to check for the dentry
- * being already hashed only in the final case.
- */
-struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
-{
-	struct dentry *new = NULL;
-
-	if (IS_ERR(inode))
-		return ERR_CAST(inode);
-
-	if (inode && S_ISDIR(inode->i_mode)) {
-		spin_lock(&inode->i_lock);
-		new = __d_find_alias(inode, 1);
-		if (new) {
-			BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
-			spin_unlock(&inode->i_lock);
-			security_d_instantiate(new, inode);
-			d_move(new, dentry);
-			iput(inode);
-		} else {
-			/* already taking inode->i_lock, so d_add() by hand */
-			__d_instantiate(dentry, inode);
-			spin_unlock(&inode->i_lock);
-			security_d_instantiate(dentry, inode);
-			d_rehash(dentry);
-		}
-	} else {
-		d_instantiate(dentry, inode);
-		if (d_unhashed(dentry))
-			d_rehash(dentry);
-	}
-	return new;
-}
-EXPORT_SYMBOL(d_splice_alias);
-
-/**
  * d_add_ci - lookup or allocate new dentry with case-exact name
  * @inode:  the inode case-insensitive lookup has found
  * @dentry: the negative dentry that was passed to the parent's lookup func
@@ -2697,6 +2645,58 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
 }
 
 /**
+ * d_splice_alias - splice a disconnected dentry into the tree if one exists
+ * @inode:  the inode which may have a disconnected dentry
+ * @dentry: a negative dentry which we want to point to the inode.
+ *
+ * If inode is a directory and has a 'disconnected' dentry (i.e. IS_ROOT and
+ * DCACHE_DISCONNECTED), then d_move that in place of the given dentry
+ * and return it, else simply d_add the inode to the dentry and return NULL.
+ *
+ * This is needed in the lookup routine of any filesystem that is exportable
+ * (via knfsd) so that we can build dcache paths to directories effectively.
+ *
+ * If a dentry was found and moved, then it is returned.  Otherwise NULL
+ * is returned.  This matches the expected return value of ->lookup.
+ *
+ * Cluster filesystems may call this function with a negative, hashed dentry.
+ * In that case, we know that the inode will be a regular file, and also this
+ * will only occur during atomic_open. So we need to check for the dentry
+ * being already hashed only in the final case.
+ */
+struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
+{
+	struct dentry *new = NULL;
+
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	if (inode && S_ISDIR(inode->i_mode)) {
+		spin_lock(&inode->i_lock);
+		new = __d_find_alias(inode, 1);
+		if (new) {
+			BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
+			spin_unlock(&inode->i_lock);
+			security_d_instantiate(new, inode);
+			d_move(new, dentry);
+			iput(inode);
+		} else {
+			/* already taking inode->i_lock, so d_add() by hand */
+			__d_instantiate(dentry, inode);
+			spin_unlock(&inode->i_lock);
+			security_d_instantiate(dentry, inode);
+			d_rehash(dentry);
+		}
+	} else {
+		d_instantiate(dentry, inode);
+		if (d_unhashed(dentry))
+			d_rehash(dentry);
+	}
+	return new;
+}
+EXPORT_SYMBOL(d_splice_alias);
+
+/**
  * d_materialise_unique - introduce an inode into the tree
  * @dentry: candidate dentry
  * @inode: inode to bind to the dentry, to which aliases may be attached
-- 
cgit v0.10.2


From 75a2352d0110960aeee1a28ddc09a55f97c99100 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 17 Feb 2014 17:45:56 -0500
Subject: dcache: close d_move race in d_splice_alias

d_splice_alias will d_move an IS_ROOT() directory dentry into place if
one exists.  This should be safe as long as the dentry remains IS_ROOT,
but I can't see what guarantees that: once we drop the i_lock all we
hold here is the i_mutex on an unrelated parent directory.

Instead copy the logic of d_materialise_unique.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/dcache.c b/fs/dcache.c
index 8bdae36..8c09db9 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2676,9 +2676,14 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 		new = __d_find_alias(inode, 1);
 		if (new) {
 			BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
+			write_seqlock(&rename_lock);
+			__d_materialise_dentry(dentry, new);
+			write_sequnlock(&rename_lock);
+			__d_drop(new);
+			_d_rehash(new);
+			spin_unlock(&new->d_lock);
 			spin_unlock(&inode->i_lock);
 			security_d_instantiate(new, inode);
-			d_move(new, dentry);
 			iput(inode);
 		} else {
 			/* already taking inode->i_lock, so d_add() by hand */
-- 
cgit v0.10.2


From 908790fa3b779d37365e6b28e3aa0f6e833020c3 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 17 Feb 2014 17:58:42 -0500
Subject: dcache: d_splice_alias mustn't create directory aliases

Currently if d_splice_alias finds a directory with an alias that is not
IS_ROOT or not DCACHE_DISCONNECTED, it creates a duplicate directory.

Duplicate directory dentries are unacceptable; it is better just to
error out.

(In the case of a local filesystem the most likely case is filesystem
corruption: for example, perhaps two directories point to the same child
directory, and the other parent has already been found and cached.)

Note that distributed filesystems may encounter this case in normal
operation if a remote host moves a directory to a location different
from the one we last cached in the dcache.  For that reason, such
filesystems should instead use d_materialise_unique, which tries to move
the old directory alias to the right place instead of erroring out.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/dcache.c b/fs/dcache.c
index 8c09db9..a191eeb 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2653,6 +2653,9 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
  * DCACHE_DISCONNECTED), then d_move that in place of the given dentry
  * and return it, else simply d_add the inode to the dentry and return NULL.
  *
+ * If a non-IS_ROOT directory is found, the filesystem is corrupt, and
+ * we should error out: directories can't have multiple aliases.
+ *
  * This is needed in the lookup routine of any filesystem that is exportable
  * (via knfsd) so that we can build dcache paths to directories effectively.
  *
@@ -2673,9 +2676,13 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 
 	if (inode && S_ISDIR(inode->i_mode)) {
 		spin_lock(&inode->i_lock);
-		new = __d_find_alias(inode, 1);
+		new = __d_find_any_alias(inode);
 		if (new) {
-			BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
+			if (!IS_ROOT(new) || !(new->d_flags & DCACHE_DISCONNECTED)) {
+				spin_unlock(&inode->i_lock);
+				dput(new);
+				return ERR_PTR(-EIO);
+			}
 			write_seqlock(&rename_lock);
 			__d_materialise_dentry(dentry, new);
 			write_sequnlock(&rename_lock);
-- 
cgit v0.10.2


From da093a9b76efca0a7a217af538929e1ecb204466 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 17 Feb 2014 18:03:57 -0500
Subject: dcache: d_splice_alias should ignore DCACHE_DISCONNECTED

Any IS_ROOT() alias should be safe to use; there's nothing special about
DCACHE_DISCONNECTED dentries.

Note that this is in fact useful for filesystems such as btrfs which can
legimately encounter a directory with a preexisting IS_ROOT alias on a
lookup that crosses into a subvolume.  (Those aliases are currently
marked DCACHE_DISCONNECTED--but not really for any good reason, and
we'll change that soon.)

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/dcache.c b/fs/dcache.c
index a191eeb..3ed09536 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2649,9 +2649,9 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
  * @inode:  the inode which may have a disconnected dentry
  * @dentry: a negative dentry which we want to point to the inode.
  *
- * If inode is a directory and has a 'disconnected' dentry (i.e. IS_ROOT and
- * DCACHE_DISCONNECTED), then d_move that in place of the given dentry
- * and return it, else simply d_add the inode to the dentry and return NULL.
+ * If inode is a directory and has an IS_ROOT alias, then d_move that in
+ * place of the given dentry and return it, else simply d_add the inode
+ * to the dentry and return NULL.
  *
  * If a non-IS_ROOT directory is found, the filesystem is corrupt, and
  * we should error out: directories can't have multiple aliases.
@@ -2678,7 +2678,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 		spin_lock(&inode->i_lock);
 		new = __d_find_any_alias(inode);
 		if (new) {
-			if (!IS_ROOT(new) || !(new->d_flags & DCACHE_DISCONNECTED)) {
+			if (!IS_ROOT(new)) {
 				spin_unlock(&inode->i_lock);
 				dput(new);
 				return ERR_PTR(-EIO);
-- 
cgit v0.10.2


From 1a0a397e41cb1bf70cfe45fd0eeff08c7c501ec0 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 14 Feb 2014 17:35:37 -0500
Subject: dcache: d_obtain_alias callers don't all want DISCONNECTED

There are a few d_obtain_alias callers that are using it to get the
root of a filesystem which may already have an alias somewhere else.

This is not the same as the filehandle-lookup case, and none of them
actually need DCACHE_DISCONNECTED set.

It isn't really a serious problem, but it would really be clearer if we
reserved DCACHE_DISCONNECTED for those cases where it's actually needed.

In the btrfs case this was causing a spurious printk from
nfsd/nfsfh.c:fh_verify when it found an unexpected DCACHE_DISCONNECTED
dentry.  Josef worked around this by unsetting DCACHE_DISCONNECTED
manually in 3a0dfa6a12e "Btrfs: unset DCACHE_DISCONNECTED when mounting
default subvol", and this replaces that workaround.

Cc: Josef Bacik <jbacik@fb.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8e16bca..67b48b9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -851,7 +851,6 @@ static struct dentry *get_default_root(struct super_block *sb,
 	struct btrfs_path *path;
 	struct btrfs_key location;
 	struct inode *inode;
-	struct dentry *dentry;
 	u64 dir_id;
 	int new = 0;
 
@@ -922,13 +921,7 @@ setup_root:
 		return dget(sb->s_root);
 	}
 
-	dentry = d_obtain_alias(inode);
-	if (!IS_ERR(dentry)) {
-		spin_lock(&dentry->d_lock);
-		dentry->d_flags &= ~DCACHE_DISCONNECTED;
-		spin_unlock(&dentry->d_lock);
-	}
-	return dentry;
+	return d_obtain_root(inode);
 }
 
 static int btrfs_fill_super(struct super_block *sb,
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 06150fd..f6e1237 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -755,7 +755,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
 				goto out;
 			}
 		} else {
-			root = d_obtain_alias(inode);
+			root = d_obtain_root(inode);
 		}
 		ceph_init_dentry(root);
 		dout("open_root_inode success, root dentry is %p\n", root);
diff --git a/fs/dcache.c b/fs/dcache.c
index 3ed09536..63d556c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1781,25 +1781,7 @@ struct dentry *d_find_any_alias(struct inode *inode)
 }
 EXPORT_SYMBOL(d_find_any_alias);
 
-/**
- * d_obtain_alias - find or allocate a dentry for a given inode
- * @inode: inode to allocate the dentry for
- *
- * Obtain a dentry for an inode resulting from NFS filehandle conversion or
- * similar open by handle operations.  The returned dentry may be anonymous,
- * or may have a full name (if the inode was already in the cache).
- *
- * When called on a directory inode, we must ensure that the inode only ever
- * has one dentry.  If a dentry is found, that is returned instead of
- * allocating a new one.
- *
- * On successful return, the reference to the inode has been transferred
- * to the dentry.  In case of an error the reference on the inode is released.
- * To make it easier to use in export operations a %NULL or IS_ERR inode may
- * be passed in and will be the error will be propagate to the return value,
- * with a %NULL @inode replaced by ERR_PTR(-ESTALE).
- */
-struct dentry *d_obtain_alias(struct inode *inode)
+struct dentry *__d_obtain_alias(struct inode *inode, int disconnected)
 {
 	static const struct qstr anonstring = QSTR_INIT("/", 1);
 	struct dentry *tmp;
@@ -1830,7 +1812,10 @@ struct dentry *d_obtain_alias(struct inode *inode)
 	}
 
 	/* attach a disconnected dentry */
-	add_flags = d_flags_for_inode(inode) | DCACHE_DISCONNECTED;
+	add_flags = d_flags_for_inode(inode);
+
+	if (disconnected)
+		add_flags |= DCACHE_DISCONNECTED;
 
 	spin_lock(&tmp->d_lock);
 	tmp->d_inode = inode;
@@ -1851,9 +1836,53 @@ struct dentry *d_obtain_alias(struct inode *inode)
 	iput(inode);
 	return res;
 }
+
+/**
+ * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode
+ * @inode: inode to allocate the dentry for
+ *
+ * Obtain a dentry for an inode resulting from NFS filehandle conversion or
+ * similar open by handle operations.  The returned dentry may be anonymous,
+ * or may have a full name (if the inode was already in the cache).
+ *
+ * When called on a directory inode, we must ensure that the inode only ever
+ * has one dentry.  If a dentry is found, that is returned instead of
+ * allocating a new one.
+ *
+ * On successful return, the reference to the inode has been transferred
+ * to the dentry.  In case of an error the reference on the inode is released.
+ * To make it easier to use in export operations a %NULL or IS_ERR inode may
+ * be passed in and the error will be propagated to the return value,
+ * with a %NULL @inode replaced by ERR_PTR(-ESTALE).
+ */
+struct dentry *d_obtain_alias(struct inode *inode)
+{
+	return __d_obtain_alias(inode, 1);
+}
 EXPORT_SYMBOL(d_obtain_alias);
 
 /**
+ * d_obtain_root - find or allocate a dentry for a given inode
+ * @inode: inode to allocate the dentry for
+ *
+ * Obtain an IS_ROOT dentry for the root of a filesystem.
+ *
+ * We must ensure that directory inodes only ever have one dentry.  If a
+ * dentry is found, that is returned instead of allocating a new one.
+ *
+ * On successful return, the reference to the inode has been transferred
+ * to the dentry.  In case of an error the reference on the inode is
+ * released.  A %NULL or IS_ERR inode may be passed in and will be the
+ * error will be propagate to the return value, with a %NULL @inode
+ * replaced by ERR_PTR(-ESTALE).
+ */
+struct dentry *d_obtain_root(struct inode *inode)
+{
+	return __d_obtain_alias(inode, 0);
+}
+EXPORT_SYMBOL(d_obtain_root);
+
+/**
  * d_add_ci - lookup or allocate new dentry with case-exact name
  * @inode:  the inode case-insensitive lookup has found
  * @dentry: the negative dentry that was passed to the parent's lookup func
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b94f804..880618a 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -112,7 +112,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
 	 * if the dentry tree reaches them; however if the dentry already
 	 * exists, we'll pick it up at this point and use it as the root
 	 */
-	ret = d_obtain_alias(inode);
+	ret = d_obtain_root(inode);
 	if (IS_ERR(ret)) {
 		dprintk("nfs_get_root: get root dentry failed\n");
 		goto out;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 8c532b2..ac91499 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -942,7 +942,7 @@ static int nilfs_get_root_dentry(struct super_block *sb,
 			iput(inode);
 		}
 	} else {
-		dentry = d_obtain_alias(inode);
+		dentry = d_obtain_root(inode);
 		if (IS_ERR(dentry)) {
 			ret = PTR_ERR(dentry);
 			goto failed_dentry;
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 3c7ec32..e4ae2ad 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -249,6 +249,7 @@ extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
 extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
 extern struct dentry *d_find_any_alias(struct inode *inode);
 extern struct dentry * d_obtain_alias(struct inode *);
+extern struct dentry * d_obtain_root(struct inode *);
 extern void shrink_dcache_sb(struct super_block *);
 extern void shrink_dcache_parent(struct dentry *);
 extern void shrink_dcache_for_umount(struct super_block *);
-- 
cgit v0.10.2


From 52ed46f0fa88243887b823d24ccb9fcf47a735b3 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 16 Jan 2014 11:15:51 -0500
Subject: dcache: remove unused d_find_alias parameter

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/dcache.c b/fs/dcache.c
index 63d556c..5c5f3bd 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -731,8 +731,6 @@ EXPORT_SYMBOL(dget_parent);
 /**
  * d_find_alias - grab a hashed alias of inode
  * @inode: inode in question
- * @want_discon:  flag, used by d_splice_alias, to request
- *          that only a DISCONNECTED alias be returned.
  *
  * If inode has a hashed alias, or is a directory and has any alias,
  * acquire the reference to alias and return it. Otherwise return NULL.
@@ -741,10 +739,9 @@ EXPORT_SYMBOL(dget_parent);
  * of a filesystem.
  *
  * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
- * any other hashed alias over that one unless @want_discon is set,
- * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias.
+ * any other hashed alias over that one.
  */
-static struct dentry *__d_find_alias(struct inode *inode, int want_discon)
+static struct dentry *__d_find_alias(struct inode *inode)
 {
 	struct dentry *alias, *discon_alias;
 
@@ -756,7 +753,7 @@ again:
 			if (IS_ROOT(alias) &&
 			    (alias->d_flags & DCACHE_DISCONNECTED)) {
 				discon_alias = alias;
-			} else if (!want_discon) {
+			} else {
 				__dget_dlock(alias);
 				spin_unlock(&alias->d_lock);
 				return alias;
@@ -787,7 +784,7 @@ struct dentry *d_find_alias(struct inode *inode)
 
 	if (!hlist_empty(&inode->i_dentry)) {
 		spin_lock(&inode->i_lock);
-		de = __d_find_alias(inode, 0);
+		de = __d_find_alias(inode);
 		spin_unlock(&inode->i_lock);
 	}
 	return de;
@@ -2765,7 +2762,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
 		struct dentry *alias;
 
 		/* Does an aliased dentry already exist? */
-		alias = __d_find_alias(inode, 0);
+		alias = __d_find_alias(inode);
 		if (alias) {
 			actual = alias;
 			write_seqlock(&rename_lock);
-- 
cgit v0.10.2


From 8d80d7dabe9668965574669afbd31733f7b0fe9b Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 16 Jan 2014 17:17:31 -0500
Subject: dcache: d_find_alias needn't recheck IS_ROOT && DCACHE_DISCONNECTED

If we get to this point and discover the dentry is not a root dentry, or
not DCACHE_DISCONNECTED--great, we always prefer that anyway.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/dcache.c b/fs/dcache.c
index 5c5f3bd..85a2aad 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -765,12 +765,9 @@ again:
 		alias = discon_alias;
 		spin_lock(&alias->d_lock);
 		if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
-			if (IS_ROOT(alias) &&
-			    (alias->d_flags & DCACHE_DISCONNECTED)) {
-				__dget_dlock(alias);
-				spin_unlock(&alias->d_lock);
-				return alias;
-			}
+			__dget_dlock(alias);
+			spin_unlock(&alias->d_lock);
+			return alias;
 		}
 		spin_unlock(&alias->d_lock);
 		goto again;
-- 
cgit v0.10.2


From 9635389534a765c8357d988e53d323bf033dcdd0 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 18 Feb 2014 12:31:31 -0500
Subject: exportfs: update Exporting documentation

Minor documentation updates:
	- refer to d_obtain_alias rather than d_alloc_anon
	- explain when to use d_splice_alias and when
	  d_materialise_unique.
	- cut some details of d_splice_alias/d_materialise_unique
	  implementation.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/Documentation/filesystems/nfs/Exporting b/Documentation/filesystems/nfs/Exporting
index e543b1a..c8f036a 100644
--- a/Documentation/filesystems/nfs/Exporting
+++ b/Documentation/filesystems/nfs/Exporting
@@ -66,23 +66,31 @@ b/ A per-superblock list "s_anon" of dentries which are the roots of
 
 c/ Helper routines to allocate anonymous dentries, and to help attach
    loose directory dentries at lookup time. They are:
-    d_alloc_anon(inode) will return a dentry for the given inode.
+    d_obtain_alias(inode) will return a dentry for the given inode.
       If the inode already has a dentry, one of those is returned.
       If it doesn't, a new anonymous (IS_ROOT and
         DCACHE_DISCONNECTED) dentry is allocated and attached.
       In the case of a directory, care is taken that only one dentry
       can ever be attached.
-    d_splice_alias(inode, dentry) will make sure that there is a
-      dentry with the same name and parent as the given dentry, and
-      which refers to the given inode.
-      If the inode is a directory and already has a dentry, then that
-      dentry is d_moved over the given dentry.
-      If the passed dentry gets attached, care is taken that this is
-      mutually exclusive to a d_alloc_anon operation.
-      If the passed dentry is used, NULL is returned, else the used
-      dentry is returned.  This corresponds to the calling pattern of
-      ->lookup.
-  
+    d_splice_alias(inode, dentry) or d_materialise_unique(dentry, inode)
+      will introduce a new dentry into the tree; either the passed-in
+      dentry or a preexisting alias for the given inode (such as an
+      anonymous one created by d_obtain_alias), if appropriate.  The two
+      functions differ in their handling of directories with preexisting
+      aliases:
+        d_splice_alias will use any existing IS_ROOT dentry, but it will
+	  return -EIO rather than try to move a dentry with a different
+	  parent.  This is appropriate for local filesystems, which
+	  should never see such an alias unless the filesystem is
+	  corrupted somehow (for example, if two on-disk directory
+	  entries refer to the same directory.)
+	d_materialise_unique will attempt to move any dentry.  This is
+	  appropriate for distributed filesystems, where finding a
+	  directory other than where we last cached it may be a normal
+	  consequence of concurrent operations on other hosts.
+      Both functions return NULL when the passed-in dentry is used,
+      following the calling convention of ->lookup.
+
  
 Filesystem Issues
 -----------------
@@ -120,12 +128,12 @@ struct which has the following members:
 
   fh_to_dentry (mandatory)
     Given a filehandle fragment, this should find the implied object and
-    create a dentry for it (possibly with d_alloc_anon).
+    create a dentry for it (possibly with d_obtain_alias).
 
   fh_to_parent (optional but strongly recommended)
     Given a filehandle fragment, this should find the parent of the
-    implied object and create a dentry for it (possibly with d_alloc_anon).
-    May fail if the filehandle fragment is too small.
+    implied object and create a dentry for it (possibly with
+    d_obtain_alias).  May fail if the filehandle fragment is too small.
 
   get_parent (optional but strongly recommended)
     When given a dentry for a directory, this should return  a dentry for
-- 
cgit v0.10.2


From 95ad5c291313b66a98a44dc92b57e0b37c1dd589 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 12 Mar 2014 12:19:23 -0400
Subject: dcache: d_splice_alias should detect loops

I believe this can only happen in the case of a corrupted filesystem.
So -EIO looks like the appropriate error.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/dcache.c b/fs/dcache.c
index 85a2aad..ad13700 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2706,6 +2706,11 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 				dput(new);
 				return ERR_PTR(-EIO);
 			}
+			if (d_ancestor(new, dentry)) {
+				spin_unlock(&inode->i_lock);
+				dput(new);
+				return ERR_PTR(-EIO);
+			}
 			write_seqlock(&rename_lock);
 			__d_materialise_dentry(dentry, new);
 			write_sequnlock(&rename_lock);
-- 
cgit v0.10.2


From 49c7dd287adffc972e6dd6cf7011d63c7c5c2e10 Mon Sep 17 00:00:00 2001
From: Fengguang Wu <fengguang.wu@intel.com>
Date: Thu, 31 Jul 2014 17:59:02 -0400
Subject: fs: mark __d_obtain_alias static

Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/dcache.c b/fs/dcache.c
index ad13700..d30ce69 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1775,7 +1775,7 @@ struct dentry *d_find_any_alias(struct inode *inode)
 }
 EXPORT_SYMBOL(d_find_any_alias);
 
-struct dentry *__d_obtain_alias(struct inode *inode, int disconnected)
+static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected)
 {
 	static const struct qstr anonstring = QSTR_INIT("/", 1);
 	struct dentry *tmp;
-- 
cgit v0.10.2


From c7f3888ad7f0932a87fb76e6e4edff2a90cc7920 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 18 Jun 2014 20:34:33 -0400
Subject: switch iov_iter_get_pages() to passing maximal number of pages

... instead of maximal size.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 17e39b0..c311640 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -158,7 +158,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
 {
 	ssize_t ret;
 
-	ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES * PAGE_SIZE,
+	ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES,
 				&sdio->from);
 
 	if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 40ac262..912061a 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1303,10 +1303,10 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
 	while (nbytes < *nbytesp && req->num_pages < req->max_pages) {
 		unsigned npages;
 		size_t start;
-		unsigned n = req->max_pages - req->num_pages;
 		ssize_t ret = iov_iter_get_pages(ii,
 					&req->pages[req->num_pages],
-					n * PAGE_SIZE, &start);
+					req->max_pages - req->num_pages,
+					&start);
 		if (ret < 0)
 			return ret;
 
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 09a7cffc..48d64e6 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -84,7 +84,7 @@ unsigned long iov_iter_alignment(const struct iov_iter *i);
 void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov,
 			unsigned long nr_segs, size_t count);
 ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
-			size_t maxsize, size_t *start);
+			unsigned maxpages, size_t *start);
 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages,
 			size_t maxsize, size_t *start);
 int iov_iter_npages(const struct iov_iter *i, int maxpages);
diff --git a/mm/iov_iter.c b/mm/iov_iter.c
index 7b5dbd1..ab88dc0 100644
--- a/mm/iov_iter.c
+++ b/mm/iov_iter.c
@@ -310,7 +310,7 @@ void iov_iter_init(struct iov_iter *i, int direction,
 EXPORT_SYMBOL(iov_iter_init);
 
 static ssize_t get_pages_iovec(struct iov_iter *i,
-		   struct page **pages, size_t maxsize,
+		   struct page **pages, unsigned maxpages,
 		   size_t *start)
 {
 	size_t offset = i->iov_offset;
@@ -323,10 +323,10 @@ static ssize_t get_pages_iovec(struct iov_iter *i,
 	len = iov->iov_len - offset;
 	if (len > i->count)
 		len = i->count;
-	if (len > maxsize)
-		len = maxsize;
 	addr = (unsigned long)iov->iov_base + offset;
 	len += *start = addr & (PAGE_SIZE - 1);
+	if (len > maxpages * PAGE_SIZE)
+		len = maxpages * PAGE_SIZE;
 	addr &= ~(PAGE_SIZE - 1);
 	n = (len + PAGE_SIZE - 1) / PAGE_SIZE;
 	res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages);
@@ -588,15 +588,14 @@ static unsigned long alignment_bvec(const struct iov_iter *i)
 }
 
 static ssize_t get_pages_bvec(struct iov_iter *i,
-		   struct page **pages, size_t maxsize,
+		   struct page **pages, unsigned maxpages,
 		   size_t *start)
 {
 	const struct bio_vec *bvec = i->bvec;
 	size_t len = bvec->bv_len - i->iov_offset;
 	if (len > i->count)
 		len = i->count;
-	if (len > maxsize)
-		len = maxsize;
+	/* can't be more than PAGE_SIZE */
 	*start = bvec->bv_offset + i->iov_offset;
 
 	get_page(*pages = bvec->bv_page);
@@ -712,13 +711,13 @@ unsigned long iov_iter_alignment(const struct iov_iter *i)
 EXPORT_SYMBOL(iov_iter_alignment);
 
 ssize_t iov_iter_get_pages(struct iov_iter *i,
-		   struct page **pages, size_t maxsize,
+		   struct page **pages, unsigned maxpages,
 		   size_t *start)
 {
 	if (i->type & ITER_BVEC)
-		return get_pages_bvec(i, pages, maxsize, start);
+		return get_pages_bvec(i, pages, maxpages, start);
 	else
-		return get_pages_iovec(i, pages, maxsize, start);
+		return get_pages_iovec(i, pages, maxpages, start);
 }
 EXPORT_SYMBOL(iov_iter_get_pages);
 
-- 
cgit v0.10.2


From 60bb45297f7551833346c5cebc6d483ea17ea5f2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 8 Aug 2014 12:39:16 -0400
Subject: __generic_file_write_iter(): fix handling of sync error after DIO

If DIO results in short write and sync write fails, we want to bugger off
whether the DIO part has written anything or not; the logics on the return
will take care of the right return value.

Cc: stable@vger.kernel.org [3.16]
Reported-by: Anton Altaparmakov <aia21@cam.ac.uk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/mm/filemap.c b/mm/filemap.c
index 900edfa..8163e04 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2584,7 +2584,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		 * that this differs from normal direct-io semantics, which
 		 * will return -EFOO even if some bytes were written.
 		 */
-		if (unlikely(status < 0) && !written) {
+		if (unlikely(status < 0)) {
 			err = status;
 			goto out;
 		}
-- 
cgit v0.10.2


From 12a5b5294cb1896e9a3c9fca8ff5a7e3def4e8c6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 10 Aug 2014 03:44:55 -0400
Subject: fix copy_tree() regression

Since 3.14 we had copy_tree() get the shadowing wrong - if we had one
vfsmount shadowing another (i.e. if A is a slave of B, C is mounted
on A/foo, then D got mounted on B/foo creating D' on A/foo shadowed
by C), copy_tree() of A would make a copy of D' shadow the the copy of
C, not the other way around.

It's easy to fix, fortunately - just make sure that mount follows
the one that shadows it in mnt_child as well as in mnt_hash, and when
copy_tree() decides to attach a new mount, check if the last child
it has added to the same parent should be shadowing the new one.
And if it should, just use the same logics commit_tree() has - put the
new mount into the hash and children lists right after the one that
should shadow it.

Cc: stable@vger.kernel.org [3.14 and later]
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/fs/namespace.c b/fs/namespace.c
index 65af9d0..be3f6f2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -778,6 +778,20 @@ static void attach_mnt(struct mount *mnt,
 	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
 }
 
+static void attach_shadowed(struct mount *mnt,
+			struct mount *parent,
+			struct mount *shadows)
+{
+	if (shadows) {
+		hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash);
+		list_add(&mnt->mnt_child, &shadows->mnt_child);
+	} else {
+		hlist_add_head_rcu(&mnt->mnt_hash,
+				m_hash(&parent->mnt, mnt->mnt_mountpoint));
+		list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+	}
+}
+
 /*
  * vfsmount lock must be held for write
  */
@@ -796,12 +810,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows)
 
 	list_splice(&head, n->list.prev);
 
-	if (shadows)
-		hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash);
-	else
-		hlist_add_head_rcu(&mnt->mnt_hash,
-				m_hash(&parent->mnt, mnt->mnt_mountpoint));
-	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+	attach_shadowed(mnt, parent, shadows);
 	touch_mnt_namespace(n);
 }
 
@@ -1474,6 +1483,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
 			continue;
 
 		for (s = r; s; s = next_mnt(s, r)) {
+			struct mount *t = NULL;
 			if (!(flag & CL_COPY_UNBINDABLE) &&
 			    IS_MNT_UNBINDABLE(s)) {
 				s = skip_mnt_tree(s);
@@ -1495,7 +1505,14 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
 				goto out;
 			lock_mount_hash();
 			list_add_tail(&q->mnt_list, &res->mnt_list);
-			attach_mnt(q, parent, p->mnt_mp);
+			mnt_set_mountpoint(parent, p->mnt_mp, q);
+			if (!list_empty(&parent->mnt_mounts)) {
+				t = list_last_entry(&parent->mnt_mounts,
+					struct mount, mnt_child);
+				if (t->mnt_mp != p->mnt_mp)
+					t = NULL;
+			}
+			attach_shadowed(q, parent, t);
 			unlock_mount_hash();
 		}
 	}
-- 
cgit v0.10.2