From 8654df4e2ac9704905198d63845554c2ddf6a93f Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 9 Jun 2016 16:06:06 -0500
Subject: mnt: Refactor fs_fully_visible into mount_too_revealing

Replace the call of fs_fully_visible in do_new_mount from before the
new superblock is allocated with a call of mount_too_revealing after
the superblock is allocated.   This winds up being a much better location
for maintainability of the code.

The first change this enables is the replacement of FS_USERNS_VISIBLE
with SB_I_USERNS_VISIBLE.  Moving the flag from struct filesystem_type
to sb_iflags on the superblock.

Unfortunately mount_too_revealing fundamentally needs to touch
mnt_flags adding several MNT_LOCKED_XXX flags at the appropriate
times.  If the mnt_flags did not need to be touched the code
could be easily moved into the filesystem specific mount code.

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>

diff --git a/fs/namespace.c b/fs/namespace.c
index 783004a..1a69aa7 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2375,7 +2375,7 @@ unlock:
 	return err;
 }
 
-static bool fs_fully_visible(struct file_system_type *fs_type, int *new_mnt_flags);
+static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags);
 
 /*
  * create a new mount for userspace and request it to be added into the
@@ -2408,12 +2408,6 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
 			flags |= MS_NODEV;
 			mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
 		}
-		if (type->fs_flags & FS_USERNS_VISIBLE) {
-			if (!fs_fully_visible(type, &mnt_flags)) {
-				put_filesystem(type);
-				return -EPERM;
-			}
-		}
 	}
 
 	mnt = vfs_kern_mount(type, flags, name, data);
@@ -2425,6 +2419,11 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);
 
+	if (mount_too_revealing(mnt, &mnt_flags)) {
+		mntput(mnt);
+		return -EPERM;
+	}
+
 	err = do_add_mount(real_mount(mnt), path, mnt_flags);
 	if (err)
 		mntput(mnt);
@@ -3216,22 +3215,19 @@ bool current_chrooted(void)
 	return chrooted;
 }
 
-static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
+static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new,
+				int *new_mnt_flags)
 {
-	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
 	int new_flags = *new_mnt_flags;
 	struct mount *mnt;
 	bool visible = false;
 
-	if (unlikely(!ns))
-		return false;
-
 	down_read(&namespace_sem);
 	list_for_each_entry(mnt, &ns->list, mnt_list) {
 		struct mount *child;
 		int mnt_flags;
 
-		if (mnt->mnt.mnt_sb->s_type != type)
+		if (mnt->mnt.mnt_sb->s_type != new->mnt_sb->s_type)
 			continue;
 
 		/* This mount is not fully visible if it's root directory
@@ -3298,6 +3294,22 @@ found:
 	return visible;
 }
 
+static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags)
+{
+	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+	unsigned long s_iflags;
+
+	if (ns->user_ns == &init_user_ns)
+		return false;
+
+	/* Can this filesystem be too revealing? */
+	s_iflags = mnt->mnt_sb->s_iflags;
+	if (!(s_iflags & SB_I_USERNS_VISIBLE))
+		return false;
+
+	return !mnt_already_visible(ns, mnt, new_mnt_flags);
+}
+
 static struct ns_common *mntns_get(struct task_struct *task)
 {
 	struct ns_common *ns = NULL;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 42305dd..78fa452 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -462,6 +462,7 @@ int proc_fill_super(struct super_block *s)
 	struct inode *root_inode;
 	int ret;
 
+	s->s_iflags |= SB_I_USERNS_VISIBLE;
 	s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
 	s->s_blocksize = 1024;
 	s->s_blocksize_bits = 10;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 55bc7d6..a1b2860 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -158,7 +158,7 @@ static struct file_system_type proc_fs_type = {
 	.name		= "proc",
 	.mount		= proc_mount,
 	.kill_sb	= proc_kill_sb,
-	.fs_flags	= FS_USERNS_VISIBLE | FS_USERNS_MOUNT,
+	.fs_flags	= FS_USERNS_MOUNT,
 };
 
 void __init proc_root_init(void)
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index f3db820..f31e369 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -42,7 +42,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 		kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
 	else if (new_sb)
 		/* Userspace would break if executables appear on sysfs */
-		root->d_sb->s_iflags |= SB_I_NOEXEC;
+		root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC;
 
 	return root;
 }
@@ -59,7 +59,7 @@ static struct file_system_type sysfs_fs_type = {
 	.name		= "sysfs",
 	.mount		= sysfs_mount,
 	.kill_sb	= sysfs_kill_sb,
-	.fs_flags	= FS_USERNS_VISIBLE | FS_USERNS_MOUNT,
+	.fs_flags	= FS_USERNS_MOUNT,
 };
 
 int __init sysfs_init(void)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index dd28814..71988dd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1328,6 +1328,9 @@ struct mm_struct;
 #define SB_I_CGROUPWB	0x00000001	/* cgroup-aware writeback enabled */
 #define SB_I_NOEXEC	0x00000002	/* Ignore executables on this fs */
 
+/* sb->s_iflags to limit user namespace mounts */
+#define SB_I_USERNS_VISIBLE		0x00000010 /* fstype already mounted */
+
 /* Possible states of 'frozen' field */
 enum {
 	SB_UNFROZEN = 0,		/* FS is unfrozen */
@@ -2011,7 +2014,6 @@ struct file_system_type {
 #define FS_HAS_SUBTYPE		4
 #define FS_USERNS_MOUNT		8	/* Can be mounted by userns root */
 #define FS_USERNS_DEV_MOUNT	16 /* A userns mount does not imply MNT_NODEV */
-#define FS_USERNS_VISIBLE	32	/* FS must already be visible */
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move() during rename() internally. */
 	struct dentry *(*mount) (struct file_system_type *, int,
 		       const char *, void *);
-- 
cgit v0.10.2


From b236017acffa73d52eac9427f42d8993067d20fb Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 31 May 2016 12:26:41 -0500
Subject: ipc: Initialize ipc_namespace->user_ns early.

Allow the ipc namespace initialization code to depend on ns->user_ns
being set during initialization.

In particular this allows mq_init_ns to use ns->user_ns for permission
checks and initializating s_user_ns while the the mq filesystem is
being mounted.

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Suggested-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>

diff --git a/ipc/namespace.c b/ipc/namespace.c
index 068caf1..04cb07e 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -34,8 +34,11 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
 	ns->ns.ops = &ipcns_operations;
 
 	atomic_set(&ns->count, 1);
+	ns->user_ns = get_user_ns(user_ns);
+
 	err = mq_init_ns(ns);
 	if (err) {
+		put_user_ns(ns->user_ns);
 		ns_free_inum(&ns->ns);
 		kfree(ns);
 		return ERR_PTR(err);
@@ -46,8 +49,6 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
 	msg_init_ns(ns);
 	shm_init_ns(ns);
 
-	ns->user_ns = get_user_ns(user_ns);
-
 	return ns;
 }
 
-- 
cgit v0.10.2


From d91ee87d8d85a0808c01787e8b4a6b48f2ba487b Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 23 May 2016 14:51:59 -0500
Subject: vfs: Pass data, ns, and ns->userns to mount_ns

Today what is normally called data (the mount options) is not passed
to fill_super through mount_ns.

Pass the mount options and the namespace separately to mount_ns so
that filesystems such as proc that have mount options, can use
mount_ns.

Pass the user namespace to mount_ns so that the standard permission
check that verifies the mounter has permissions over the namespace can
be performed in mount_ns instead of in each filesystems .mount method.
Thus removing the duplication between mqueuefs and proc in terms of
permission checks.  The extra permission check does not currently
affect the rpc_pipefs filesystem and the nfsd filesystem as those
filesystems do not currently allow unprivileged mounts.  Without
unpvileged mounts it is guaranteed that the caller has already passed
capable(CAP_SYS_ADMIN) which guarantees extra permission check will
pass.

Update rpc_pipefs and the nfsd filesystem to ensure that the network
namespace reference is always taken in fill_super and always put in kill_sb
so that the logic is simpler and so that errors originating inside of
fill_super do not cause a network namespace leak.

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 9690cb4..5a6ae25 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1154,20 +1154,15 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 #endif
 		/* last one */ {""}
 	};
-	struct net *net = data;
-	int ret;
-
-	ret = simple_fill_super(sb, 0x6e667364, nfsd_files);
-	if (ret)
-		return ret;
-	sb->s_fs_info = get_net(net);
-	return 0;
+	get_net(sb->s_fs_info);
+	return simple_fill_super(sb, 0x6e667364, nfsd_files);
 }
 
 static struct dentry *nfsd_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
-	return mount_ns(fs_type, flags, current->nsproxy->net_ns, nfsd_fill_super);
+	struct net *net = current->nsproxy->net_ns;
+	return mount_ns(fs_type, flags, data, net, net->user_ns, nfsd_fill_super);
 }
 
 static void nfsd_umount(struct super_block *sb)
diff --git a/fs/super.c b/fs/super.c
index d78b984..fd65667 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -918,12 +918,19 @@ static int ns_set_super(struct super_block *sb, void *data)
 	return set_anon_super(sb, NULL);
 }
 
-struct dentry *mount_ns(struct file_system_type *fs_type, int flags,
-	void *data, int (*fill_super)(struct super_block *, void *, int))
+struct dentry *mount_ns(struct file_system_type *fs_type,
+	int flags, void *data, void *ns, struct user_namespace *user_ns,
+	int (*fill_super)(struct super_block *, void *, int))
 {
 	struct super_block *sb;
 
-	sb = sget(fs_type, ns_test_super, ns_set_super, flags, data);
+	/* Don't allow mounting unless the caller has CAP_SYS_ADMIN
+	 * over the namespace.
+	 */
+	if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	sb = sget(fs_type, ns_test_super, ns_set_super, flags, ns);
 	if (IS_ERR(sb))
 		return ERR_CAST(sb);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 71988dd..1ce006a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2034,8 +2034,9 @@ struct file_system_type {
 
 #define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME)
 
-extern struct dentry *mount_ns(struct file_system_type *fs_type, int flags,
-	void *data, int (*fill_super)(struct super_block *, void *, int));
+extern struct dentry *mount_ns(struct file_system_type *fs_type,
+	int flags, void *data, void *ns, struct user_namespace *user_ns,
+	int (*fill_super)(struct super_block *, void *, int));
 extern struct dentry *mount_bdev(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data,
 	int (*fill_super)(struct super_block *, void *, int));
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index ade739f..60d9708 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -305,7 +305,7 @@ err:
 static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct inode *inode;
-	struct ipc_namespace *ns = data;
+	struct ipc_namespace *ns = sb->s_fs_info;
 
 	sb->s_blocksize = PAGE_SIZE;
 	sb->s_blocksize_bits = PAGE_SHIFT;
@@ -326,17 +326,14 @@ static struct dentry *mqueue_mount(struct file_system_type *fs_type,
 			 int flags, const char *dev_name,
 			 void *data)
 {
-	if (!(flags & MS_KERNMOUNT)) {
-		struct ipc_namespace *ns = current->nsproxy->ipc_ns;
-		/* Don't allow mounting unless the caller has CAP_SYS_ADMIN
-		 * over the ipc namespace.
-		 */
-		if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
-			return ERR_PTR(-EPERM);
-
-		data = ns;
+	struct ipc_namespace *ns;
+	if (flags & MS_KERNMOUNT) {
+		ns = data;
+		data = NULL;
+	} else {
+		ns = current->nsproxy->ipc_ns;
 	}
-	return mount_ns(fs_type, flags, data, mqueue_fill_super);
+	return mount_ns(fs_type, flags, data, ns, ns->user_ns, mqueue_fill_super);
 }
 
 static void init_once(void *foo)
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index fc48eca..84f98cb 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -1386,7 +1386,7 @@ rpc_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct inode *inode;
 	struct dentry *root, *gssd_dentry;
-	struct net *net = data;
+	struct net *net = get_net(sb->s_fs_info);
 	struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
 	int err;
 
@@ -1419,7 +1419,6 @@ rpc_fill_super(struct super_block *sb, void *data, int silent)
 					   sb);
 	if (err)
 		goto err_depopulate;
-	sb->s_fs_info = get_net(net);
 	mutex_unlock(&sn->pipefs_sb_lock);
 	return 0;
 
@@ -1448,7 +1447,8 @@ static struct dentry *
 rpc_mount(struct file_system_type *fs_type,
 		int flags, const char *dev_name, void *data)
 {
-	return mount_ns(fs_type, flags, current->nsproxy->net_ns, rpc_fill_super);
+	struct net *net = current->nsproxy->net_ns;
+	return mount_ns(fs_type, flags, data, net, net->user_ns, rpc_fill_super);
 }
 
 static void rpc_kill_sb(struct super_block *sb)
@@ -1468,9 +1468,9 @@ static void rpc_kill_sb(struct super_block *sb)
 					   RPC_PIPEFS_UMOUNT,
 					   sb);
 	mutex_unlock(&sn->pipefs_sb_lock);
-	put_net(net);
 out:
 	kill_litter_super(sb);
+	put_net(net);
 }
 
 static struct file_system_type rpc_pipe_fs_type = {
-- 
cgit v0.10.2


From e94591d0d90c13166cb6eb54ce5f96ed13d81b55 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 9 Jun 2016 15:32:10 -0500
Subject: proc: Convert proc_mount to use mount_ns.

Move the call of get_pid_ns, the call of proc_parse_options, and
the setting of s_iflags into proc_fill_super so that mount_ns
can be used.

Convert proc_mount to call mount_ns and remove the now unnecessary
code.

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Reviewed-by: Djalal Harouni <tixxdz@gmail.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>

diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 78fa452..f4817ef 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -457,12 +457,17 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
 	return inode;
 }
 
-int proc_fill_super(struct super_block *s)
+int proc_fill_super(struct super_block *s, void *data, int silent)
 {
+	struct pid_namespace *ns = get_pid_ns(s->s_fs_info);
 	struct inode *root_inode;
 	int ret;
 
-	s->s_iflags |= SB_I_USERNS_VISIBLE;
+	if (!proc_parse_options(data, ns))
+		return -EINVAL;
+
+	/* User space would break if executables appear on proc */
+	s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC;
 	s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
 	s->s_blocksize = 1024;
 	s->s_blocksize_bits = 10;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index aa27810..7931c55 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -212,7 +212,7 @@ extern const struct inode_operations proc_pid_link_inode_operations;
 
 extern void proc_init_inodecache(void);
 extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
-extern int proc_fill_super(struct super_block *);
+extern int proc_fill_super(struct super_block *, void *data, int flags);
 extern void proc_entry_rundown(struct proc_dir_entry *);
 
 /*
@@ -268,6 +268,7 @@ static inline void proc_tty_init(void) {}
  * root.c
  */
 extern struct proc_dir_entry proc_root;
+extern int proc_parse_options(char *options, struct pid_namespace *pid);
 
 extern void proc_self_init(void);
 extern int proc_remount(struct super_block *, int *, char *);
diff --git a/fs/proc/root.c b/fs/proc/root.c
index a1b2860..8d3e484 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -23,21 +23,6 @@
 
 #include "internal.h"
 
-static int proc_test_super(struct super_block *sb, void *data)
-{
-	return sb->s_fs_info == data;
-}
-
-static int proc_set_super(struct super_block *sb, void *data)
-{
-	int err = set_anon_super(sb, NULL);
-	if (!err) {
-		struct pid_namespace *ns = (struct pid_namespace *)data;
-		sb->s_fs_info = get_pid_ns(ns);
-	}
-	return err;
-}
-
 enum {
 	Opt_gid, Opt_hidepid, Opt_err,
 };
@@ -48,7 +33,7 @@ static const match_table_t tokens = {
 	{Opt_err, NULL},
 };
 
-static int proc_parse_options(char *options, struct pid_namespace *pid)
+int proc_parse_options(char *options, struct pid_namespace *pid)
 {
 	char *p;
 	substring_t args[MAX_OPT_ARGS];
@@ -100,45 +85,16 @@ int proc_remount(struct super_block *sb, int *flags, char *data)
 static struct dentry *proc_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
-	int err;
-	struct super_block *sb;
 	struct pid_namespace *ns;
-	char *options;
 
 	if (flags & MS_KERNMOUNT) {
-		ns = (struct pid_namespace *)data;
-		options = NULL;
+		ns = data;
+		data = NULL;
 	} else {
 		ns = task_active_pid_ns(current);
-		options = data;
-
-		/* Does the mounter have privilege over the pid namespace? */
-		if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
-			return ERR_PTR(-EPERM);
-	}
-
-	sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns);
-	if (IS_ERR(sb))
-		return ERR_CAST(sb);
-
-	if (!proc_parse_options(options, ns)) {
-		deactivate_locked_super(sb);
-		return ERR_PTR(-EINVAL);
-	}
-
-	if (!sb->s_root) {
-		err = proc_fill_super(sb);
-		if (err) {
-			deactivate_locked_super(sb);
-			return ERR_PTR(err);
-		}
-
-		sb->s_flags |= MS_ACTIVE;
-		/* User space would break if executables appear on proc */
-		sb->s_iflags |= SB_I_NOEXEC;
 	}
 
-	return dget(sb->s_root);
+	return mount_ns(fs_type, flags, data, ns, ns->user_ns, proc_fill_super);
 }
 
 static void proc_kill_sb(struct super_block *sb)
-- 
cgit v0.10.2


From 6e4eab577a0cae15b3da9b888cff16fe57981b3e Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 24 May 2016 09:29:01 -0500
Subject: fs: Add user namespace member to struct super_block

Start marking filesystems with a user namespace owner, s_user_ns.  In
this change this is only used for permission checks of who may mount a
filesystem.  Ultimately s_user_ns will be used for translating ids and
checking capabilities for filesystems mounted from user namespaces.

The default policy for setting s_user_ns is implemented in sget(),
which arranges for s_user_ns to be set to current_user_ns() and to
ensure that the mounter of the filesystem has CAP_SYS_ADMIN in that
user_ns.

The guts of sget are split out into another function sget_userns().
The function sget_userns calls alloc_super with the specified user
namespace or it verifies the existing superblock that was found
has the expected user namespace, and fails with EBUSY when it is not.
This failing prevents users with the wrong privileges mounting a
filesystem.

The reason for the split of sget_userns from sget is that in some
cases such as mount_ns and kernfs_mount_ns a different policy for
permission checking of mounts and setting s_user_ns is necessary, and
the existence of sget_userns() allows those policies to be
implemented.

The helper mount_ns is expected to be used for filesystems such as
proc and mqueuefs which present per namespace information.  The
function mount_ns is modified to call sget_userns instead of sget to
ensure the user namespace owner of the namespace whose information is
presented by the filesystem is used on the superblock.

For sysfs and cgroup the appropriate permission checks are already in
place, and kernfs_mount_ns is modified to call sget_userns so that
the init_user_ns is the only user namespace used.

For the cgroup filesystem cgroup namespace mounts are bind mounts of a
subset of the full cgroup filesystem and as such s_user_ns must be the
same for all of them as there is only a single superblock.

Mounts of sysfs that vary based on the network namespace could in principle
change s_user_ns but it keeps the analysis and implementation of kernfs
simpler if that is not supported, and at present there appear to be no
benefits from supporting a different s_user_ns on any sysfs mount.

Getting the details of setting s_user_ns correct has been
a long process.  Thanks to Pavel Tikhorirorv who spotted a leak
in sget_userns.  Thanks to Seth Forshee who has kept the work alive.

Thanks-to: Seth Forshee <seth.forshee@canonical.com>
Thanks-to: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>

diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 63534f5..d90d574 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -241,7 +241,8 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
 	info->root = root;
 	info->ns = ns;
 
-	sb = sget(fs_type, kernfs_test_super, kernfs_set_super, flags, info);
+	sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags,
+			 &init_user_ns, info);
 	if (IS_ERR(sb) || sb->s_fs_info != info)
 		kfree(info);
 	if (IS_ERR(sb))
diff --git a/fs/super.c b/fs/super.c
index fd65667..874c7e3 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -33,6 +33,7 @@
 #include <linux/cleancache.h>
 #include <linux/fsnotify.h>
 #include <linux/lockdep.h>
+#include <linux/user_namespace.h>
 #include "internal.h"
 
 
@@ -165,6 +166,7 @@ static void destroy_super(struct super_block *s)
 	list_lru_destroy(&s->s_inode_lru);
 	security_sb_free(s);
 	WARN_ON(!list_empty(&s->s_mounts));
+	put_user_ns(s->s_user_ns);
 	kfree(s->s_subtype);
 	kfree(s->s_options);
 	call_rcu(&s->rcu, destroy_super_rcu);
@@ -174,11 +176,13 @@ static void destroy_super(struct super_block *s)
  *	alloc_super	-	create new superblock
  *	@type:	filesystem type superblock should belong to
  *	@flags: the mount flags
+ *	@user_ns: User namespace for the super_block
  *
  *	Allocates and initializes a new &struct super_block.  alloc_super()
  *	returns a pointer new superblock or %NULL if allocation had failed.
  */
-static struct super_block *alloc_super(struct file_system_type *type, int flags)
+static struct super_block *alloc_super(struct file_system_type *type, int flags,
+				       struct user_namespace *user_ns)
 {
 	struct super_block *s = kzalloc(sizeof(struct super_block),  GFP_USER);
 	static const struct super_operations default_op;
@@ -188,6 +192,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 		return NULL;
 
 	INIT_LIST_HEAD(&s->s_mounts);
+	s->s_user_ns = get_user_ns(user_ns);
 
 	if (security_sb_alloc(s))
 		goto fail;
@@ -443,17 +448,18 @@ void generic_shutdown_super(struct super_block *sb)
 EXPORT_SYMBOL(generic_shutdown_super);
 
 /**
- *	sget	-	find or create a superblock
+ *	sget_userns -	find or create a superblock
  *	@type:	filesystem type superblock should belong to
  *	@test:	comparison callback
  *	@set:	setup callback
  *	@flags:	mount flags
+ *	@user_ns: User namespace for the super_block
  *	@data:	argument to each of them
  */
-struct super_block *sget(struct file_system_type *type,
+struct super_block *sget_userns(struct file_system_type *type,
 			int (*test)(struct super_block *,void *),
 			int (*set)(struct super_block *,void *),
-			int flags,
+			int flags, struct user_namespace *user_ns,
 			void *data)
 {
 	struct super_block *s = NULL;
@@ -466,6 +472,14 @@ retry:
 		hlist_for_each_entry(old, &type->fs_supers, s_instances) {
 			if (!test(old, data))
 				continue;
+			if (user_ns != old->s_user_ns) {
+				spin_unlock(&sb_lock);
+				if (s) {
+					up_write(&s->s_umount);
+					destroy_super(s);
+				}
+				return ERR_PTR(-EBUSY);
+			}
 			if (!grab_super(old))
 				goto retry;
 			if (s) {
@@ -478,7 +492,7 @@ retry:
 	}
 	if (!s) {
 		spin_unlock(&sb_lock);
-		s = alloc_super(type, flags);
+		s = alloc_super(type, flags, user_ns);
 		if (!s)
 			return ERR_PTR(-ENOMEM);
 		goto retry;
@@ -501,6 +515,31 @@ retry:
 	return s;
 }
 
+EXPORT_SYMBOL(sget_userns);
+
+/**
+ *	sget	-	find or create a superblock
+ *	@type:	  filesystem type superblock should belong to
+ *	@test:	  comparison callback
+ *	@set:	  setup callback
+ *	@flags:	  mount flags
+ *	@data:	  argument to each of them
+ */
+struct super_block *sget(struct file_system_type *type,
+			int (*test)(struct super_block *,void *),
+			int (*set)(struct super_block *,void *),
+			int flags,
+			void *data)
+{
+	struct user_namespace *user_ns = current_user_ns();
+
+	/* Ensure the requestor has permissions over the target filesystem */
+	if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	return sget_userns(type, test, set, flags, user_ns, data);
+}
+
 EXPORT_SYMBOL(sget);
 
 void drop_super(struct super_block *sb)
@@ -930,7 +969,8 @@ struct dentry *mount_ns(struct file_system_type *fs_type,
 	if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 
-	sb = sget(fs_type, ns_test_super, ns_set_super, flags, ns);
+	sb = sget_userns(fs_type, ns_test_super, ns_set_super, flags,
+			 user_ns, ns);
 	if (IS_ERR(sb))
 		return ERR_CAST(sb);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1ce006a..9eef64f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1433,6 +1433,13 @@ struct super_block {
 	struct hlist_head s_pins;
 
 	/*
+	 * Owning user namespace and default context in which to
+	 * interpret filesystem uids, gids, quotas, device nodes,
+	 * xattrs and security labels.
+	 */
+	struct user_namespace *s_user_ns;
+
+	/*
 	 * Keep the lru lists last in the structure so they always sit on their
 	 * own individual cachelines.
 	 */
@@ -2056,6 +2063,11 @@ void deactivate_locked_super(struct super_block *sb);
 int set_anon_super(struct super_block *s, void *data);
 int get_anon_bdev(dev_t *);
 void free_anon_bdev(dev_t);
+struct super_block *sget_userns(struct file_system_type *type,
+			int (*test)(struct super_block *,void *),
+			int (*set)(struct super_block *,void *),
+			int flags, struct user_namespace *user_ns,
+			void *data);
 struct super_block *sget(struct file_system_type *type,
 			int (*test)(struct super_block *,void *),
 			int (*set)(struct super_block *,void *),
-- 
cgit v0.10.2


From a001e74cef34d95ede6535ef521011c612657a3a Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 6 Jun 2016 15:48:04 -0500
Subject: mnt: Move the FS_USERNS_MOUNT check into sget_userns

Allowing a filesystem to be mounted by other than root in the initial
user namespace is a filesystem property not a mount namespace property
and as such should be checked in filesystem specific code.  Move the
FS_USERNS_MOUNT test into super.c:sget_userns().

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>

diff --git a/fs/namespace.c b/fs/namespace.c
index 1a69aa7..2e13f6c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2397,10 +2397,6 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
 		return -ENODEV;
 
 	if (user_ns != &init_user_ns) {
-		if (!(type->fs_flags & FS_USERNS_MOUNT)) {
-			put_filesystem(type);
-			return -EPERM;
-		}
 		/* Only in special cases allow devices from mounts
 		 * created outside the initial user namespace.
 		 */
diff --git a/fs/super.c b/fs/super.c
index 874c7e3..78790ad 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -466,6 +466,10 @@ struct super_block *sget_userns(struct file_system_type *type,
 	struct super_block *old;
 	int err;
 
+	if (!(flags & MS_KERNMOUNT) &&
+	    !(type->fs_flags & FS_USERNS_MOUNT) &&
+	    !capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
 retry:
 	spin_lock(&sb_lock);
 	if (test) {
-- 
cgit v0.10.2


From 29a517c232d21a717aecea29838aeb07131f6196 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 10 Jun 2016 13:03:05 -0500
Subject: kernfs: The cgroup filesystem also benefits from SB_I_NOEXEC

The cgroup filesystem is in the same boat as sysfs.  No one ever
permits executables of any kind on the cgroup filesystem, and there is
no reasonable future case to support executables in the future.

Therefore move the setting of SB_I_NOEXEC which makes the code proof
against future mistakes of accidentally creating executables from
sysfs to kernfs itself.  Making the code simpler and covering the
sysfs, cgroup, and cgroup2 filesystems.

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>

diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index d90d574..1443df6 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -152,6 +152,8 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
 	struct dentry *root;
 
 	info->sb = sb;
+	/* Userspace would break if executables appear on sysfs */
+	sb->s_iflags |= SB_I_NOEXEC;
 	sb->s_blocksize = PAGE_SIZE;
 	sb->s_blocksize_bits = PAGE_SHIFT;
 	sb->s_magic = magic;
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index f31e369..20b8f82 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -41,8 +41,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 	if (IS_ERR(root) || !new_sb)
 		kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
 	else if (new_sb)
-		/* Userspace would break if executables appear on sysfs */
-		root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC;
+		root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE;
 
 	return root;
 }
-- 
cgit v0.10.2


From 3ee690143c3c99f6c0e83f08ff17556890bc6027 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 20 May 2016 15:42:21 -0500
Subject: ipc/mqueue: The mqueue filesystem should never contain executables

Set SB_I_NOEXEC on mqueuefs to ensure small implementation mistakes
do not result in executable on mqueuefs by accident.

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>

diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 60d9708..5bdd50d 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -307,6 +307,7 @@ static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
 	struct inode *inode;
 	struct ipc_namespace *ns = sb->s_fs_info;
 
+	sb->s_iflags |= SB_I_NOEXEC;
 	sb->s_blocksize = PAGE_SIZE;
 	sb->s_blocksize_bits = PAGE_SHIFT;
 	sb->s_magic = MQUEUE_MAGIC;
-- 
cgit v0.10.2


From a2982cc922c3068783eb9a1f77a5626a1ec36a1f Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 9 Jun 2016 15:34:02 -0500
Subject: vfs: Generalize filesystem nodev handling.

Introduce a function may_open_dev that tests MNT_NODEV and a new
superblock flab SB_I_NODEV.  Use this new function in all of the
places where MNT_NODEV was previously tested.

Add the new SB_I_NODEV s_iflag to proc, sysfs, and mqueuefs as those
filesystems should never support device nodes, and a simple superblock
flags makes that very hard to get wrong.  With SB_I_NODEV set if any
device nodes somehow manage to show up on on a filesystem those
device nodes will be unopenable.

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 71ccab1..30b8d56 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1857,7 +1857,7 @@ struct block_device *lookup_bdev(const char *pathname)
 	if (!S_ISBLK(inode->i_mode))
 		goto fail;
 	error = -EACCES;
-	if (path.mnt->mnt_flags & MNT_NODEV)
+	if (!may_open_dev(&path))
 		goto fail;
 	error = -ENOMEM;
 	bdev = bd_acquire(inode);
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 1443df6..b3d73ad 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -152,8 +152,8 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
 	struct dentry *root;
 
 	info->sb = sb;
-	/* Userspace would break if executables appear on sysfs */
-	sb->s_iflags |= SB_I_NOEXEC;
+	/* Userspace would break if executables or devices appear on sysfs */
+	sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
 	sb->s_blocksize = PAGE_SIZE;
 	sb->s_blocksize_bits = PAGE_SHIFT;
 	sb->s_magic = magic;
diff --git a/fs/namei.c b/fs/namei.c
index 6a82fb7..757a3272 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2881,6 +2881,12 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 }
 EXPORT_SYMBOL(vfs_create);
 
+bool may_open_dev(const struct path *path)
+{
+	return !(path->mnt->mnt_flags & MNT_NODEV) &&
+		!(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
+}
+
 static int may_open(struct path *path, int acc_mode, int flag)
 {
 	struct dentry *dentry = path->dentry;
@@ -2899,7 +2905,7 @@ static int may_open(struct path *path, int acc_mode, int flag)
 		break;
 	case S_IFBLK:
 	case S_IFCHR:
-		if (path->mnt->mnt_flags & MNT_NODEV)
+		if (!may_open_dev(path))
 			return -EACCES;
 		/*FALLTHRU*/
 	case S_IFIFO:
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index f4817ef..a5b2c33 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -466,8 +466,8 @@ int proc_fill_super(struct super_block *s, void *data, int silent)
 	if (!proc_parse_options(data, ns))
 		return -EINVAL;
 
-	/* User space would break if executables appear on proc */
-	s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC;
+	/* User space would break if executables or devices appear on proc */
+	s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
 	s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
 	s->s_blocksize = 1024;
 	s->s_blocksize_bits = 10;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9eef64f..e059831 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1327,6 +1327,7 @@ struct mm_struct;
 /* sb->s_iflags */
 #define SB_I_CGROUPWB	0x00000001	/* cgroup-aware writeback enabled */
 #define SB_I_NOEXEC	0x00000002	/* Ignore executables on this fs */
+#define SB_I_NODEV	0x00000004	/* Ignore devices on this fs */
 
 /* sb->s_iflags to limit user namespace mounts */
 #define SB_I_USERNS_VISIBLE		0x00000010 /* fstype already mounted */
@@ -1602,6 +1603,7 @@ extern int vfs_whiteout(struct inode *, struct dentry *);
  */
 extern void inode_init_owner(struct inode *inode, const struct inode *dir,
 			umode_t mode);
+extern bool may_open_dev(const struct path *path);
 /*
  * VFS FS_IOC_FIEMAP helper definitions.
  */
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 5bdd50d..0b13ace 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -307,7 +307,7 @@ static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
 	struct inode *inode;
 	struct ipc_namespace *ns = sb->s_fs_info;
 
-	sb->s_iflags |= SB_I_NOEXEC;
+	sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
 	sb->s_blocksize = PAGE_SIZE;
 	sb->s_blocksize_bits = PAGE_SHIFT;
 	sb->s_magic = MQUEUE_MAGIC;
-- 
cgit v0.10.2


From a1935c1738af53249a02290ff7c10e8a6e650a16 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 15 Jun 2016 06:59:49 -0500
Subject: mnt: Simplify mount_too_revealing

Verify all filesystems that we check in mount_too_revealing set
SB_I_NOEXEC and SB_I_NODEV in sb->s_iflags.  That is true for today
and it should remain true in the future.

Remove the now unnecessary checks from mnt_already_visibile that
ensure MNT_LOCK_NOSUID, MNT_LOCK_NOEXEC, and MNT_LOCK_NODEV are
preserved.  Making the code shorter and easier to read.

Relying on SB_I_NOEXEC and SB_I_NODEV instead of the user visible
MNT_NOSUID, MNT_NOEXEC, and MNT_NODEV ensures the many current
systems where proc and sysfs are mounted with "nosuid, nodev, noexec"
and several slightly buggy container applications don't bother to
set those flags continue to work.

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>

diff --git a/fs/namespace.c b/fs/namespace.c
index 2e13f6c..b1da7f8 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3232,12 +3232,8 @@ static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new,
 		if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
 			continue;
 
-		/* Read the mount flags and filter out flags that
-		 * may safely be ignored.
-		 */
+		/* A local view of the mount flags */
 		mnt_flags = mnt->mnt.mnt_flags;
-		if (mnt->mnt.mnt_sb->s_iflags & SB_I_NOEXEC)
-			mnt_flags &= ~(MNT_LOCK_NOSUID | MNT_LOCK_NOEXEC);
 
 		/* Don't miss readonly hidden in the superblock flags */
 		if (mnt->mnt.mnt_sb->s_flags & MS_RDONLY)
@@ -3249,15 +3245,6 @@ static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new,
 		if ((mnt_flags & MNT_LOCK_READONLY) &&
 		    !(new_flags & MNT_READONLY))
 			continue;
-		if ((mnt_flags & MNT_LOCK_NODEV) &&
-		    !(new_flags & MNT_NODEV))
-			continue;
-		if ((mnt_flags & MNT_LOCK_NOSUID) &&
-		    !(new_flags & MNT_NOSUID))
-			continue;
-		if ((mnt_flags & MNT_LOCK_NOEXEC) &&
-		    !(new_flags & MNT_NOEXEC))
-			continue;
 		if ((mnt_flags & MNT_LOCK_ATIME) &&
 		    ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
 			continue;
@@ -3277,9 +3264,6 @@ static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new,
 		}
 		/* Preserve the locked attributes */
 		*new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
-					       MNT_LOCK_NODEV    | \
-					       MNT_LOCK_NOSUID   | \
-					       MNT_LOCK_NOEXEC   | \
 					       MNT_LOCK_ATIME);
 		visible = true;
 		goto found;
@@ -3292,6 +3276,7 @@ found:
 
 static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags)
 {
+	const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
 	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
 	unsigned long s_iflags;
 
@@ -3303,6 +3288,12 @@ static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags)
 	if (!(s_iflags & SB_I_USERNS_VISIBLE))
 		return false;
 
+	if ((s_iflags & required_iflags) != required_iflags) {
+		WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
+			  required_iflags);
+		return true;
+	}
+
 	return !mnt_already_visible(ns, mnt, new_mnt_flags);
 }
 
-- 
cgit v0.10.2


From 67690f937c38bbab1d94cb45f6a32e61612834ae Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 18 May 2016 13:50:06 -0500
Subject: userns: Remove implicit MNT_NODEV fragility.

Replace the implict setting of MNT_NODEV on mounts that happen with
just user namespace permissions with an implicit setting of SB_I_NODEV
in s_iflags.  The visibility of the implicit MNT_NODEV has caused
problems in the past.

With this change the fragile case where an implicit MNT_NODEV needs to
be preserved in do_remount is removed.  Using SB_I_NODEV is much less
fragile as s_iflags are set during the original mount and never
changed.

In do_new_mount with the implicit setting of MNT_NODEV gone, the only
code that can affect mnt_flags is fs_fully_visible so simplify the if
statement and reduce the indentation of the code to make that clear.

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>

diff --git a/fs/namespace.c b/fs/namespace.c
index b1da7f8..9786a38 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2185,13 +2185,7 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
 	}
 	if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
 	    !(mnt_flags & MNT_NODEV)) {
-		/* Was the nodev implicitly added in mount? */
-		if ((mnt->mnt_ns->user_ns != &init_user_ns) &&
-		    !(sb->s_type->fs_flags & FS_USERNS_DEV_MOUNT)) {
-			mnt_flags |= MNT_NODEV;
-		} else {
-			return -EPERM;
-		}
+		return -EPERM;
 	}
 	if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
 	    !(mnt_flags & MNT_NOSUID)) {
@@ -2385,7 +2379,6 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
 			int mnt_flags, const char *name, void *data)
 {
 	struct file_system_type *type;
-	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
 	struct vfsmount *mnt;
 	int err;
 
@@ -2396,16 +2389,6 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
 	if (!type)
 		return -ENODEV;
 
-	if (user_ns != &init_user_ns) {
-		/* Only in special cases allow devices from mounts
-		 * created outside the initial user namespace.
-		 */
-		if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
-			flags |= MS_NODEV;
-			mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
-		}
-	}
-
 	mnt = vfs_kern_mount(type, flags, name, data);
 	if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
 	    !mnt->mnt_sb->s_subtype)
diff --git a/fs/super.c b/fs/super.c
index 78790ad..25cdceed2 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -206,6 +206,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	init_waitqueue_head(&s->s_writers.wait_unfrozen);
 	s->s_bdi = &noop_backing_dev_info;
 	s->s_flags = flags;
+	if ((s->s_user_ns != &init_user_ns) &&
+	    !(type->fs_flags & FS_USERNS_DEV_MOUNT))
+		s->s_iflags |= SB_I_NODEV;
 	INIT_HLIST_NODE(&s->s_instances);
 	INIT_HLIST_BL_HEAD(&s->s_anon);
 	mutex_init(&s->s_sync_lock);
-- 
cgit v0.10.2


From cc50a07a247e17db76b1f0b0ca06652556e04fa3 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 9 Jun 2016 15:44:48 -0500
Subject: userns: Remove the now unnecessary FS_USERNS_DEV_MOUNT flag

Now that SB_I_NODEV controls the nodev behavior devpts can just clear
this flag during mount.  Simplifying the code and making it easier
to audit how the code works.  While still preserving the invariant
that s_iflags is only modified during mount.

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 37c134a..d116453 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -396,6 +396,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
 {
 	struct inode *inode;
 
+	s->s_iflags &= ~SB_I_NODEV;
 	s->s_blocksize = 1024;
 	s->s_blocksize_bits = 10;
 	s->s_magic = DEVPTS_SUPER_MAGIC;
@@ -480,7 +481,7 @@ static struct file_system_type devpts_fs_type = {
 	.name		= "devpts",
 	.mount		= devpts_mount,
 	.kill_sb	= devpts_kill_sb,
-	.fs_flags	= FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT,
+	.fs_flags	= FS_USERNS_MOUNT,
 };
 
 /*
diff --git a/fs/super.c b/fs/super.c
index 25cdceed2..37813bf 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -206,8 +206,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	init_waitqueue_head(&s->s_writers.wait_unfrozen);
 	s->s_bdi = &noop_backing_dev_info;
 	s->s_flags = flags;
-	if ((s->s_user_ns != &init_user_ns) &&
-	    !(type->fs_flags & FS_USERNS_DEV_MOUNT))
+	if (s->s_user_ns != &init_user_ns)
 		s->s_iflags |= SB_I_NODEV;
 	INIT_HLIST_NODE(&s->s_instances);
 	INIT_HLIST_BL_HEAD(&s->s_anon);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e059831..375e37f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2022,7 +2022,6 @@ struct file_system_type {
 #define FS_BINARY_MOUNTDATA	2
 #define FS_HAS_SUBTYPE		4
 #define FS_USERNS_MOUNT		8	/* Can be mounted by userns root */
-#define FS_USERNS_DEV_MOUNT	16 /* A userns mount does not imply MNT_NODEV */
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move() during rename() internally. */
 	struct dentry *(*mount) (struct file_system_type *, int,
 		       const char *, void *);
-- 
cgit v0.10.2


From d07b846f6200454c50d791796edb82660192513d Mon Sep 17 00:00:00 2001
From: Seth Forshee <seth.forshee@canonical.com>
Date: Wed, 23 Sep 2015 15:16:04 -0500
Subject: fs: Limit file caps to the user namespace of the super block

Capability sets attached to files must be ignored except in the
user namespaces where the mounter is privileged, i.e. s_user_ns
and its descendants. Otherwise a vector exists for gaining
privileges in namespaces where a user is not already privileged.

Add a new helper function, current_in_user_ns(), to test whether a user
namespace is the same as or a descendant of another namespace.
Use this helper to determine whether a file's capability set
should be applied to the caps constructed during exec.

--EWB Replaced in_userns with the simpler current_in_userns.

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 8297e5b..9217169 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -72,6 +72,7 @@ extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t,
 extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *);
 extern int proc_setgroups_show(struct seq_file *m, void *v);
 extern bool userns_may_setgroups(const struct user_namespace *ns);
+extern bool current_in_userns(const struct user_namespace *target_ns);
 #else
 
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
@@ -100,6 +101,11 @@ static inline bool userns_may_setgroups(const struct user_namespace *ns)
 {
 	return true;
 }
+
+static inline bool current_in_userns(const struct user_namespace *target_ns)
+{
+	return true;
+}
 #endif
 
 #endif /* _LINUX_USER_H */
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 9bafc21..68f5942 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -938,6 +938,20 @@ bool userns_may_setgroups(const struct user_namespace *ns)
 	return allowed;
 }
 
+/*
+ * Returns true if @ns is the same namespace as or a descendant of
+ * @target_ns.
+ */
+bool current_in_userns(const struct user_namespace *target_ns)
+{
+	struct user_namespace *ns;
+	for (ns = current_user_ns(); ns; ns = ns->parent) {
+		if (ns == target_ns)
+			return true;
+	}
+	return false;
+}
+
 static inline struct user_namespace *to_user_ns(struct ns_common *ns)
 {
 	return container_of(ns, struct user_namespace, ns);
diff --git a/security/commoncap.c b/security/commoncap.c
index e7fadde..e109e6d 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -455,6 +455,8 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_c
 
 	if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
 		return 0;
+	if (!current_in_userns(bprm->file->f_path.mnt->mnt_sb->s_user_ns))
+		return 0;
 
 	rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
 	if (rc < 0) {
-- 
cgit v0.10.2


From 380cf5ba6b0a0b307f4afb62b186ca801defb203 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@amacapital.net>
Date: Thu, 23 Jun 2016 16:41:05 -0500
Subject: fs: Treat foreign mounts as nosuid

If a process gets access to a mount from a different user
namespace, that process should not be able to take advantage of
setuid files or selinux entrypoints from that filesystem.  Prevent
this by treating mounts from other mount namespaces and those not
owned by current_user_ns() or an ancestor as nosuid.

This will make it safer to allow more complex filesystems to be
mounted in non-root user namespaces.

This does not remove the need for MNT_LOCK_NOSUID.  The setuid,
setgid, and file capability bits can no longer be abused if code in
a user namespace were to clear nosuid on an untrusted filesystem,
but this patch, by itself, is insufficient to protect the system
from abuse of files that, when execed, would increase MAC privilege.

As a more concrete explanation, any task that can manipulate a
vfsmount associated with a given user namespace already has
capabilities in that namespace and all of its descendents.  If they
can cause a malicious setuid, setgid, or file-caps executable to
appear in that mount, then that executable will only allow them to
elevate privileges in exactly the set of namespaces in which they
are already privileges.

On the other hand, if they can cause a malicious executable to
appear with a dangerous MAC label, running it could change the
caller's security context in a way that should not have been
possible, even inside the namespace in which the task is confined.

As a hardening measure, this would have made CVE-2014-5207 much
more difficult to exploit.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
Acked-by: James Morris <james.l.morris@oracle.com>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>

diff --git a/fs/exec.c b/fs/exec.c
index 887c1c9..ca239fc 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1411,7 +1411,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm)
 	bprm->cred->euid = current_euid();
 	bprm->cred->egid = current_egid();
 
-	if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
+	if (!mnt_may_suid(bprm->file->f_path.mnt))
 		return;
 
 	if (task_no_new_privs(current))
diff --git a/fs/namespace.c b/fs/namespace.c
index 9786a38..aabe8e3 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3280,6 +3280,19 @@ static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags)
 	return !mnt_already_visible(ns, mnt, new_mnt_flags);
 }
 
+bool mnt_may_suid(struct vfsmount *mnt)
+{
+	/*
+	 * Foreign mounts (accessed via fchdir or through /proc
+	 * symlinks) are always treated as if they are nosuid.  This
+	 * prevents namespaces from trusting potentially unsafe
+	 * suid/sgid bits, file caps, or security labels that originate
+	 * in other namespaces.
+	 */
+	return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) &&
+	       current_in_userns(mnt->mnt_sb->s_user_ns);
+}
+
 static struct ns_common *mntns_get(struct task_struct *task)
 {
 	struct ns_common *ns = NULL;
diff --git a/include/linux/mount.h b/include/linux/mount.h
index f822c3c..54a594d 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -81,6 +81,7 @@ extern void mntput(struct vfsmount *mnt);
 extern struct vfsmount *mntget(struct vfsmount *mnt);
 extern struct vfsmount *mnt_clone_internal(struct path *path);
 extern int __mnt_is_readonly(struct vfsmount *mnt);
+extern bool mnt_may_suid(struct vfsmount *mnt);
 
 struct path;
 extern struct vfsmount *clone_private_mount(struct path *path);
diff --git a/security/commoncap.c b/security/commoncap.c
index e109e6d..14540bd 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -453,8 +453,14 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_c
 	if (!file_caps_enabled)
 		return 0;
 
-	if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
+	if (!mnt_may_suid(bprm->file->f_path.mnt))
 		return 0;
+
+	/*
+	 * This check is redundant with mnt_may_suid() but is kept to make
+	 * explicit that capability bits are limited to s_user_ns and its
+	 * descendants.
+	 */
 	if (!current_in_userns(bprm->file->f_path.mnt->mnt_sb->s_user_ns))
 		return 0;
 
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index a86d537..1554175 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2259,7 +2259,7 @@ static int check_nnp_nosuid(const struct linux_binprm *bprm,
 			    const struct task_security_struct *new_tsec)
 {
 	int nnp = (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS);
-	int nosuid = (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID);
+	int nosuid = !mnt_may_suid(bprm->file->f_path.mnt);
 	int rc;
 
 	if (!nnp && !nosuid)
-- 
cgit v0.10.2


From 9f50eda2a9277e0bc51d8ca5dd2ec1d0e73601bc Mon Sep 17 00:00:00 2001
From: Seth Forshee <seth.forshee@canonical.com>
Date: Wed, 23 Sep 2015 15:16:06 -0500
Subject: Smack: Add support for unprivileged mounts from user namespaces

Security labels from unprivileged mounts cannot be trusted.
Ideally for these mounts we would assign the objects in the
filesystem the same label as the inode for the backing device
passed to mount. Unfortunately it's currently impossible to
determine which inode this is from the LSM mount hooks, so we
settle for the label of the process doing the mount.

This label is assigned to s_root, and also to smk_default to
ensure that new inodes receive this label. The transmute property
is also set on s_root to make this behavior more explicit, even
though it is technically not necessary.

If a filesystem has existing security labels, access to inodes is
permitted if the label is the same as smk_root, otherwise access
is denied. The SMACK64EXEC xattr is completely ignored.

Explicit setting of security labels continues to require
CAP_MAC_ADMIN in init_user_ns.

Altogether, this ensures that filesystem objects are not
accessible to subjects which cannot already access the backing
store, that MAC is not violated for any objects in the fileystem
which are already labeled, and that a user cannot use an
unprivileged mount to gain elevated MAC privileges.

sysfs, tmpfs, and ramfs are already mountable from user
namespaces and support security labels. We can't rule out the
possibility that these filesystems may already be used in mounts
from user namespaces with security lables set from the init
namespace, so failing to trust lables in these filesystems may
introduce regressions. It is safe to trust labels from these
filesystems, since the unprivileged user does not control the
backing store and thus cannot supply security labels, so an
explicit exception is made to trust labels from these
filesystems.

Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>

diff --git a/security/smack/smack.h b/security/smack/smack.h
index 6c91156..26e58f1 100644
--- a/security/smack/smack.h
+++ b/security/smack/smack.h
@@ -90,9 +90,15 @@ struct superblock_smack {
 	struct smack_known	*smk_floor;
 	struct smack_known	*smk_hat;
 	struct smack_known	*smk_default;
-	int			smk_initialized;
+	int			smk_flags;
 };
 
+/*
+ * Superblock flags
+ */
+#define SMK_SB_INITIALIZED	0x01
+#define SMK_SB_UNTRUSTED	0x02
+
 struct socket_smack {
 	struct smack_known	*smk_out;	/* outbound label */
 	struct smack_known	*smk_in;	/* inbound label */
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 6777295..a799b7a 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -549,7 +549,7 @@ static int smack_sb_alloc_security(struct super_block *sb)
 	sbsp->smk_floor = &smack_known_floor;
 	sbsp->smk_hat = &smack_known_hat;
 	/*
-	 * smk_initialized will be zero from kzalloc.
+	 * SMK_SB_INITIALIZED will be zero from kzalloc.
 	 */
 	sb->s_security = sbsp;
 
@@ -766,10 +766,10 @@ static int smack_set_mnt_opts(struct super_block *sb,
 	int num_opts = opts->num_mnt_opts;
 	int transmute = 0;
 
-	if (sp->smk_initialized)
+	if (sp->smk_flags & SMK_SB_INITIALIZED)
 		return 0;
 
-	sp->smk_initialized = 1;
+	sp->smk_flags |= SMK_SB_INITIALIZED;
 
 	for (i = 0; i < num_opts; i++) {
 		switch (opts->mnt_opts_flags[i]) {
@@ -821,6 +821,17 @@ static int smack_set_mnt_opts(struct super_block *sb,
 		skp = smk_of_current();
 		sp->smk_root = skp;
 		sp->smk_default = skp;
+		/*
+		 * For a handful of fs types with no user-controlled
+		 * backing store it's okay to trust security labels
+		 * in the filesystem. The rest are untrusted.
+		 */
+		if (sb->s_user_ns != &init_user_ns &&
+		    sb->s_magic != SYSFS_MAGIC && sb->s_magic != TMPFS_MAGIC &&
+		    sb->s_magic != RAMFS_MAGIC) {
+			transmute = 1;
+			sp->smk_flags |= SMK_SB_UNTRUSTED;
+		}
 	}
 
 	/*
@@ -1203,6 +1214,7 @@ static int smack_inode_rename(struct inode *old_inode,
  */
 static int smack_inode_permission(struct inode *inode, int mask)
 {
+	struct superblock_smack *sbsp = inode->i_sb->s_security;
 	struct smk_audit_info ad;
 	int no_block = mask & MAY_NOT_BLOCK;
 	int rc;
@@ -1214,6 +1226,11 @@ static int smack_inode_permission(struct inode *inode, int mask)
 	if (mask == 0)
 		return 0;
 
+	if (sbsp->smk_flags & SMK_SB_UNTRUSTED) {
+		if (smk_of_inode(inode) != sbsp->smk_root)
+			return -EACCES;
+	}
+
 	/* May be droppable after audit */
 	if (no_block)
 		return -ECHILD;
@@ -3529,14 +3546,16 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode)
 			if (rc >= 0)
 				transflag = SMK_INODE_TRANSMUTE;
 		}
-		/*
-		 * Don't let the exec or mmap label be "*" or "@".
-		 */
-		skp = smk_fetch(XATTR_NAME_SMACKEXEC, inode, dp);
-		if (IS_ERR(skp) || skp == &smack_known_star ||
-		    skp == &smack_known_web)
-			skp = NULL;
-		isp->smk_task = skp;
+		if (!(sbsp->smk_flags & SMK_SB_UNTRUSTED)) {
+			/*
+			 * Don't let the exec or mmap label be "*" or "@".
+			 */
+			skp = smk_fetch(XATTR_NAME_SMACKEXEC, inode, dp);
+			if (IS_ERR(skp) || skp == &smack_known_star ||
+			    skp == &smack_known_web)
+				skp = NULL;
+			isp->smk_task = skp;
+		}
 
 		skp = smk_fetch(XATTR_NAME_SMACKMMAP, inode, dp);
 		if (IS_ERR(skp) || skp == &smack_known_star ||
-- 
cgit v0.10.2


From 809c02e091a8272bc8586a5d606565bc900f3467 Mon Sep 17 00:00:00 2001
From: Seth Forshee <seth.forshee@canonical.com>
Date: Tue, 26 Apr 2016 14:36:22 -0500
Subject: Smack: Handle labels consistently in untrusted mounts

The SMACK64, SMACK64EXEC, and SMACK64MMAP labels are all handled
differently in untrusted mounts. This is confusing and
potentically problematic. Change this to handle them all the same
way that SMACK64 is currently handled; that is, read the label
from disk and check it at use time. For SMACK64 and SMACK64MMAP
access is denied if the label does not match smk_root. To be
consistent with suid, a SMACK64EXEC label which does not match
smk_root will still allow execution of the file but will not run
with the label supplied in the xattr.

Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>

diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index a799b7a..b75634d 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -919,6 +919,7 @@ static int smack_bprm_set_creds(struct linux_binprm *bprm)
 	struct inode *inode = file_inode(bprm->file);
 	struct task_smack *bsp = bprm->cred->security;
 	struct inode_smack *isp;
+	struct superblock_smack *sbsp;
 	int rc;
 
 	if (bprm->cred_prepared)
@@ -928,6 +929,11 @@ static int smack_bprm_set_creds(struct linux_binprm *bprm)
 	if (isp->smk_task == NULL || isp->smk_task == bsp->smk_task)
 		return 0;
 
+	sbsp = inode->i_sb->s_security;
+	if ((sbsp->smk_flags & SMK_SB_UNTRUSTED) &&
+	    isp->smk_task != sbsp->smk_root)
+		return 0;
+
 	if (bprm->unsafe & (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) {
 		struct task_struct *tracer;
 		rc = 0;
@@ -1725,6 +1731,7 @@ static int smack_mmap_file(struct file *file,
 	struct task_smack *tsp;
 	struct smack_known *okp;
 	struct inode_smack *isp;
+	struct superblock_smack *sbsp;
 	int may;
 	int mmay;
 	int tmay;
@@ -1736,6 +1743,10 @@ static int smack_mmap_file(struct file *file,
 	isp = file_inode(file)->i_security;
 	if (isp->smk_mmap == NULL)
 		return 0;
+	sbsp = file_inode(file)->i_sb->s_security;
+	if (sbsp->smk_flags & SMK_SB_UNTRUSTED &&
+	    isp->smk_mmap != sbsp->smk_root)
+		return -EACCES;
 	mkp = isp->smk_mmap;
 
 	tsp = current_security();
@@ -3546,16 +3557,14 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode)
 			if (rc >= 0)
 				transflag = SMK_INODE_TRANSMUTE;
 		}
-		if (!(sbsp->smk_flags & SMK_SB_UNTRUSTED)) {
-			/*
-			 * Don't let the exec or mmap label be "*" or "@".
-			 */
-			skp = smk_fetch(XATTR_NAME_SMACKEXEC, inode, dp);
-			if (IS_ERR(skp) || skp == &smack_known_star ||
-			    skp == &smack_known_web)
-				skp = NULL;
-			isp->smk_task = skp;
-		}
+		/*
+		 * Don't let the exec or mmap label be "*" or "@".
+		 */
+		skp = smk_fetch(XATTR_NAME_SMACKEXEC, inode, dp);
+		if (IS_ERR(skp) || skp == &smack_known_star ||
+		    skp == &smack_known_web)
+			skp = NULL;
+		isp->smk_task = skp;
 
 		skp = smk_fetch(XATTR_NAME_SMACKMMAP, inode, dp);
 		if (IS_ERR(skp) || skp == &smack_known_star ||
-- 
cgit v0.10.2


From aad82892af261b9903cc11c55be3ecf5f0b0b4f8 Mon Sep 17 00:00:00 2001
From: Seth Forshee <seth.forshee@canonical.com>
Date: Tue, 26 Apr 2016 14:36:20 -0500
Subject: selinux: Add support for unprivileged mounts from user namespaces

Security labels from unprivileged mounts in user namespaces must
be ignored. Force superblocks from user namespaces whose labeling
behavior is to use xattrs to use mountpoint labeling instead.
For the mountpoint label, default to converting the current task
context into a form suitable for file objects, but also allow the
policy writer to specify a different label through policy
transition rules.

Pieced together from code snippets provided by Stephen Smalley.

Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
Acked-by: Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: James Morris <james.l.morris@oracle.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 1554175..19be9d3 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -830,6 +830,28 @@ static int selinux_set_mnt_opts(struct super_block *sb,
 			goto out;
 		}
 	}
+
+	/*
+	 * If this is a user namespace mount, no contexts are allowed
+	 * on the command line and security labels must be ignored.
+	 */
+	if (sb->s_user_ns != &init_user_ns) {
+		if (context_sid || fscontext_sid || rootcontext_sid ||
+		    defcontext_sid) {
+			rc = -EACCES;
+			goto out;
+		}
+		if (sbsec->behavior == SECURITY_FS_USE_XATTR) {
+			sbsec->behavior = SECURITY_FS_USE_MNTPOINT;
+			rc = security_transition_sid(current_sid(), current_sid(),
+						     SECCLASS_FILE, NULL,
+						     &sbsec->mntpoint_sid);
+			if (rc)
+				goto out;
+		}
+		goto out_set_opts;
+	}
+
 	/* sets the context of the superblock for the fs being mounted. */
 	if (fscontext_sid) {
 		rc = may_context_mount_sb_relabel(fscontext_sid, sbsec, cred);
@@ -898,6 +920,7 @@ static int selinux_set_mnt_opts(struct super_block *sb,
 		sbsec->def_sid = defcontext_sid;
 	}
 
+out_set_opts:
 	rc = sb_finish_set_opts(sb);
 out:
 	mutex_unlock(&sbsec->lock);
-- 
cgit v0.10.2


From a475acf01f79e89a1a5845733e10108d80f77188 Mon Sep 17 00:00:00 2001
From: Seth Forshee <seth.forshee@canonical.com>
Date: Tue, 26 Apr 2016 14:36:25 -0500
Subject: fs: Refuse uid/gid changes which don't map into s_user_ns

Add checks to notify_change to verify that uid and gid changes
will map into the superblock's user namespace. If they do not
fail with -EOVERFLOW.

This is mandatory so that fileystems don't have to even think
of dealing with ia_uid and ia_gid that

--EWB Moved the test from inode_change_ok to notify_change

Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>

diff --git a/fs/attr.c b/fs/attr.c
index 25b24d0..dd72357 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -255,6 +255,17 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
 	if (!(attr->ia_valid & ~(ATTR_KILL_SUID | ATTR_KILL_SGID)))
 		return 0;
 
+	/*
+	 * Verify that uid/gid changes are valid in the target
+	 * namespace of the superblock.
+	 */
+	if (ia_valid & ATTR_UID &&
+	    !kuid_has_mapping(inode->i_sb->s_user_ns, attr->ia_uid))
+		return -EOVERFLOW;
+	if (ia_valid & ATTR_GID &&
+	    !kgid_has_mapping(inode->i_sb->s_user_ns, attr->ia_gid))
+		return -EOVERFLOW;
+
 	error = security_inode_setattr(dentry, attr);
 	if (error)
 		return error;
-- 
cgit v0.10.2


From 37b11804ed1725dc2ea97be2236150210a69e9d5 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 27 Jun 2016 15:52:48 -0500
Subject: userns: Handle -1 in k[ug]id_has_mapping when !CONFIG_USER_NS

Refuse to admit any user namespace has a mapping of the INVALID_UID
and the INVALID_GID when !CONFIG_USER_NS.

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>

diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h
index 0383552..25e9d92 100644
--- a/include/linux/uidgid.h
+++ b/include/linux/uidgid.h
@@ -177,12 +177,12 @@ static inline gid_t from_kgid_munged(struct user_namespace *to, kgid_t kgid)
 
 static inline bool kuid_has_mapping(struct user_namespace *ns, kuid_t uid)
 {
-	return true;
+	return uid_valid(uid);
 }
 
 static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid)
 {
-	return true;
+	return gid_valid(gid);
 }
 
 #endif /* CONFIG_USER_NS */
-- 
cgit v0.10.2


From 0d4d717f25834134bb6f43284f84c8ccee5bbf2a Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 27 Jun 2016 16:04:06 -0500
Subject: vfs: Verify acls are valid within superblock's s_user_ns.

Update posix_acl_valid to verify that an acl is within a user namespace.

Update the callers of posix_acl_valid to pass in an appropriate
user namespace.  For posix_acl_xattr_set and v9fs_xattr_set_acl pass in
inode->i_sb->s_user_ns to posix_acl_valid.  For md_unpack_acl pass in
&init_user_ns as no inode or superblock is in sight.

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>

diff --git a/drivers/staging/lustre/lustre/mdc/mdc_request.c b/drivers/staging/lustre/lustre/mdc/mdc_request.c
index 86b7445..40cf57f 100644
--- a/drivers/staging/lustre/lustre/mdc/mdc_request.c
+++ b/drivers/staging/lustre/lustre/mdc/mdc_request.c
@@ -434,7 +434,7 @@ static int mdc_unpack_acl(struct ptlrpc_request *req, struct lustre_md *md)
 		return rc;
 	}
 
-	rc = posix_acl_valid(acl);
+	rc = posix_acl_valid(&init_user_ns, acl);
 	if (rc) {
 		CERROR("validate acl: %d\n", rc);
 		posix_acl_release(acl);
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 0576eae..5b6a174 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -266,7 +266,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
 		if (IS_ERR(acl))
 			return PTR_ERR(acl);
 		else if (acl) {
-			retval = posix_acl_valid(acl);
+			retval = posix_acl_valid(inode->i_sb->s_user_ns, acl);
 			if (retval)
 				goto err_out;
 		}
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 8a4a266..647c281 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -205,7 +205,7 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags)
  * Check if an acl is valid. Returns 0 if it is, or -E... otherwise.
  */
 int
-posix_acl_valid(const struct posix_acl *acl)
+posix_acl_valid(struct user_namespace *user_ns, const struct posix_acl *acl)
 {
 	const struct posix_acl_entry *pa, *pe;
 	int state = ACL_USER_OBJ;
@@ -225,7 +225,7 @@ posix_acl_valid(const struct posix_acl *acl)
 			case ACL_USER:
 				if (state != ACL_USER)
 					return -EINVAL;
-				if (!uid_valid(pa->e_uid))
+				if (!kuid_has_mapping(user_ns, pa->e_uid))
 					return -EINVAL;
 				needs_mask = 1;
 				break;
@@ -240,7 +240,7 @@ posix_acl_valid(const struct posix_acl *acl)
 			case ACL_GROUP:
 				if (state != ACL_GROUP)
 					return -EINVAL;
-				if (!gid_valid(pa->e_gid))
+				if (!kgid_has_mapping(user_ns, pa->e_gid))
 					return -EINVAL;
 				needs_mask = 1;
 				break;
@@ -845,7 +845,7 @@ posix_acl_xattr_set(const struct xattr_handler *handler,
 			return PTR_ERR(acl);
 
 		if (acl) {
-			ret = posix_acl_valid(acl);
+			ret = posix_acl_valid(inode->i_sb->s_user_ns, acl);
 			if (ret)
 				goto out;
 		}
diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
index 5b5a80c..1af6438f 100644
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -81,7 +81,7 @@ posix_acl_release(struct posix_acl *acl)
 
 extern void posix_acl_init(struct posix_acl *, int);
 extern struct posix_acl *posix_acl_alloc(int, gfp_t);
-extern int posix_acl_valid(const struct posix_acl *);
+extern int posix_acl_valid(struct user_namespace *, const struct posix_acl *);
 extern int posix_acl_permission(struct inode *, const struct posix_acl *, int);
 extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t);
 extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *);
-- 
cgit v0.10.2


From 2d7f9e2ad35e4e7a3086231f19bfab33c6a8a64a Mon Sep 17 00:00:00 2001
From: Seth Forshee <seth.forshee@canonical.com>
Date: Tue, 26 Apr 2016 14:36:23 -0500
Subject: fs: Check for invalid i_uid in may_follow_link()

Filesystem uids which don't map into a user namespace may result
in inode->i_uid being INVALID_UID. A symlink and its parent
could have different owners in the filesystem can both get
mapped to INVALID_UID, which may result in following a symlink
when this would not have otherwise been permitted when protected
symlinks are enabled.

Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>

diff --git a/fs/namei.c b/fs/namei.c
index 757a3272..8701bd9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -901,6 +901,7 @@ static inline int may_follow_link(struct nameidata *nd)
 {
 	const struct inode *inode;
 	const struct inode *parent;
+	kuid_t puid;
 
 	if (!sysctl_protected_symlinks)
 		return 0;
@@ -916,7 +917,8 @@ static inline int may_follow_link(struct nameidata *nd)
 		return 0;
 
 	/* Allowed if parent directory and link owner match. */
-	if (uid_eq(parent->i_uid, inode->i_uid))
+	puid = parent->i_uid;
+	if (uid_valid(puid) && uid_eq(puid, inode->i_uid))
 		return 0;
 
 	if (nd->flags & LOOKUP_RCU)
-- 
cgit v0.10.2


From 5f65e5ca286126a60f62c8421b77c2018a482b8a Mon Sep 17 00:00:00 2001
From: Seth Forshee <seth.forshee@canonical.com>
Date: Tue, 26 Apr 2016 14:36:24 -0500
Subject: cred: Reject inodes with invalid ids in set_create_file_as()

Using INVALID_[UG]ID for the LSM file creation context doesn't
make sense, so return an error if the inode passed to
set_create_file_as() has an invalid id.

Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>

diff --git a/kernel/cred.c b/kernel/cred.c
index 0c0cd8a..5f264fb 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -689,6 +689,8 @@ EXPORT_SYMBOL(set_security_override_from_ctx);
  */
 int set_create_files_as(struct cred *new, struct inode *inode)
 {
+	if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
+		return -EINVAL;
 	new->fsuid = inode->i_uid;
 	new->fsgid = inode->i_gid;
 	return security_kernel_create_files_as(new, inode);
-- 
cgit v0.10.2


From 0bd23d09b874e53bd1a2fe2296030aa2720d7b08 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 29 Jun 2016 14:54:46 -0500
Subject: vfs: Don't modify inodes with a uid or gid unknown to the vfs

When a filesystem outside of init_user_ns is mounted it could have
uids and gids stored in it that do not map to init_user_ns.

The plan is to allow those filesystems to set i_uid to INVALID_UID and
i_gid to INVALID_GID for unmapped uids and gids and then to handle
that strange case in the vfs to ensure there is consistent robust
handling of the weirdness.

Upon a careful review of the vfs and filesystems about the only case
where there is any possibility of confusion or trouble is when the
inode is written back to disk.  In that case filesystems typically
read the inode->i_uid and inode->i_gid and write them to disk even
when just an inode timestamp is being updated.

Which leads to a rule that is very simple to implement and understand
inodes whose i_uid or i_gid is not valid may not be written.

In dealing with access times this means treat those inodes as if the
inode flag S_NOATIME was set.  Reads of the inodes appear safe and
useful, but any write or modification is disallowed.  The only inode
write that is allowed is a chown that sets the uid and gid on the
inode to valid values.  After such a chown the inode is normal and may
be treated as such.

Denying all writes to inodes with uids or gids unknown to the vfs also
prevents several oddball cases where corruption would have occurred
because the vfs does not have complete information.

One problem case that is prevented is attempting to use the gid of a
directory for new inodes where the directories sgid bit is set but the
directories gid is not mapped.

Another problem case avoided is attempting to update the evm hash
after setxattr, removexattr, and setattr.  As the evm hash includeds
the inode->i_uid or inode->i_gid not knowning the uid or gid prevents
a correct evm hash from being computed.  evm hash verification also
fails when i_uid or i_gid is unknown but that is essentially harmless
as it does not cause filesystem corruption.

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>

diff --git a/fs/attr.c b/fs/attr.c
index dd72357..42bb42b 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -266,6 +266,14 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
 	    !kgid_has_mapping(inode->i_sb->s_user_ns, attr->ia_gid))
 		return -EOVERFLOW;
 
+	/* Don't allow modifications of files with invalid uids or
+	 * gids unless those uids & gids are being made valid.
+	 */
+	if (!(ia_valid & ATTR_UID) && !uid_valid(inode->i_uid))
+		return -EOVERFLOW;
+	if (!(ia_valid & ATTR_GID) && !gid_valid(inode->i_gid))
+		return -EOVERFLOW;
+
 	error = security_inode_setattr(dentry, attr);
 	if (error)
 		return error;
diff --git a/fs/inode.c b/fs/inode.c
index 4ccbc21..c0ebb97 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1617,6 +1617,13 @@ bool atime_needs_update(const struct path *path, struct inode *inode)
 
 	if (inode->i_flags & S_NOATIME)
 		return false;
+
+	/* Atime updates will likely cause i_uid and i_gid to be written
+	 * back improprely if their true value is unknown to the vfs.
+	 */
+	if (HAS_UNMAPPED_ID(inode))
+		return false;
+
 	if (IS_NOATIME(inode))
 		return false;
 	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
diff --git a/fs/namei.c b/fs/namei.c
index 8701bd9..840201c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -410,6 +410,14 @@ int __inode_permission(struct inode *inode, int mask)
 		 */
 		if (IS_IMMUTABLE(inode))
 			return -EACCES;
+
+		/*
+		 * Updating mtime will likely cause i_uid and i_gid to be
+		 * written back improperly if their true value is unknown
+		 * to the vfs.
+		 */
+		if (HAS_UNMAPPED_ID(inode))
+			return -EACCES;
 	}
 
 	retval = do_inode_permission(inode, mask);
@@ -2759,10 +2767,11 @@ EXPORT_SYMBOL(__check_sticky);
  *	c. have CAP_FOWNER capability
  *  6. If the victim is append-only or immutable we can't do antyhing with
  *     links pointing to it.
- *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
- *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
- *  9. We can't remove a root or mountpoint.
- * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
+ *  7. If the victim has an unknown uid or gid we can't change the inode.
+ *  8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
+ *  9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
+ * 10. We can't remove a root or mountpoint.
+ * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
  *     nfs_async_unlink().
  */
 static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
@@ -2784,7 +2793,7 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
 		return -EPERM;
 
 	if (check_sticky(dir, inode) || IS_APPEND(inode) ||
-	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode))
+	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode))
 		return -EPERM;
 	if (isdir) {
 		if (!d_is_dir(victim))
@@ -4190,6 +4199,13 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 	 */
 	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
 		return -EPERM;
+	/*
+	 * Updating the link count will likely cause i_uid and i_gid to
+	 * be writen back improperly if their true value is unknown to
+	 * the vfs.
+	 */
+	if (HAS_UNMAPPED_ID(inode))
+		return -EPERM;
 	if (!dir->i_op->link)
 		return -EPERM;
 	if (S_ISDIR(inode->i_mode))
diff --git a/fs/xattr.c b/fs/xattr.c
index 4beafc4..c243905 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -38,6 +38,13 @@ xattr_permission(struct inode *inode, const char *name, int mask)
 	if (mask & MAY_WRITE) {
 		if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
 			return -EPERM;
+		/*
+		 * Updating an xattr will likely cause i_uid and i_gid
+		 * to be writen back improperly if their true value is
+		 * unknown to the vfs.
+		 */
+		if (HAS_UNMAPPED_ID(inode))
+			return -EPERM;
 	}
 
 	/*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 375e37f..cb25ceb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1874,6 +1874,11 @@ struct super_operations {
 #define IS_WHITEOUT(inode)	(S_ISCHR(inode->i_mode) && \
 				 (inode)->i_rdev == WHITEOUT_DEV)
 
+static inline bool HAS_UNMAPPED_ID(struct inode *inode)
+{
+	return !uid_valid(inode->i_uid) || !gid_valid(inode->i_gid);
+}
+
 /*
  * Inode state bits.  Protected by inode->i_lock
  *
-- 
cgit v0.10.2


From 036d523641c66bef713042894a17f4335f199e49 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 1 Jul 2016 12:52:06 -0500
Subject: vfs: Don't create inodes with a uid or gid unknown to the vfs

It is expected that filesystems can not represent uids and gids from
outside of their user namespace.  Keep things simple by not even
trying to create filesystem nodes with non-sense uids and gids.

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>

diff --git a/fs/namei.c b/fs/namei.c
index 840201c..629823f1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2814,16 +2814,22 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
  *  1. We can't do it if child already exists (open has special treatment for
  *     this case, but since we are inlined it's OK)
  *  2. We can't do it if dir is read-only (done in permission())
- *  3. We should have write and exec permissions on dir
- *  4. We can't do it if dir is immutable (done in permission())
+ *  3. We can't do it if the fs can't represent the fsuid or fsgid.
+ *  4. We should have write and exec permissions on dir
+ *  5. We can't do it if dir is immutable (done in permission())
  */
 static inline int may_create(struct inode *dir, struct dentry *child)
 {
+	struct user_namespace *s_user_ns;
 	audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
 	if (child->d_inode)
 		return -EEXIST;
 	if (IS_DEADDIR(dir))
 		return -ENOENT;
+	s_user_ns = dir->i_sb->s_user_ns;
+	if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
+	    !kgid_has_mapping(s_user_ns, current_fsgid()))
+		return -EOVERFLOW;
 	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
 }
 
-- 
cgit v0.10.2


From d49d37624a1931c2f3b5d0cbe95bd5181cbdc279 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 30 Jun 2016 16:31:01 -0500
Subject: quota: Ensure qids map to the filesystem

Introduce the helper qid_has_mapping and use it to ensure that the
quota system only considers qids that map to the filesystems
s_user_ns.

In practice for quota supporting filesystems today this is the exact
same check as qid_valid.  As only 0xffffffff aka (qid_t)-1 does not
map into init_user_ns.

Replace the qid_valid calls with qid_has_mapping as values come in
from userspace.  This is harmless today and it prepares the quota
system to work on filesystems with quotas but mounted by unprivileged
users.

Call qid_has_mapping from dqget.  This ensures the passed in qid has a
prepresentation on the underlying filesystem.  Previously this was
unnecessary as filesystesm never had qids that could not map.  With
the introduction of filesystems outside of s_user_ns this will not
remain true.

All of this ensures the quota code never has to deal with qids that
don't map to the underlying filesystem.

Cc: Jan Kara <jack@suse.cz>
Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index ff21980..74706b6 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -841,6 +841,9 @@ struct dquot *dqget(struct super_block *sb, struct kqid qid)
 	unsigned int hashent = hashfn(sb, qid);
 	struct dquot *dquot, *empty = NULL;
 
+	if (!qid_has_mapping(sb->s_user_ns, qid))
+		return ERR_PTR(-EINVAL);
+
         if (!sb_has_quota_active(sb, qid.type))
 		return ERR_PTR(-ESRCH);
 we_slept:
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 0f10ee9..73f6f4c 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -211,7 +211,7 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id,
 	if (!sb->s_qcop->get_dqblk)
 		return -ENOSYS;
 	qid = make_kqid(current_user_ns(), type, id);
-	if (!qid_valid(qid))
+	if (!qid_has_mapping(sb->s_user_ns, qid))
 		return -EINVAL;
 	ret = sb->s_qcop->get_dqblk(sb, qid, &fdq);
 	if (ret)
@@ -237,7 +237,7 @@ static int quota_getnextquota(struct super_block *sb, int type, qid_t id,
 	if (!sb->s_qcop->get_nextdqblk)
 		return -ENOSYS;
 	qid = make_kqid(current_user_ns(), type, id);
-	if (!qid_valid(qid))
+	if (!qid_has_mapping(sb->s_user_ns, qid))
 		return -EINVAL;
 	ret = sb->s_qcop->get_nextdqblk(sb, &qid, &fdq);
 	if (ret)
@@ -288,7 +288,7 @@ static int quota_setquota(struct super_block *sb, int type, qid_t id,
 	if (!sb->s_qcop->set_dqblk)
 		return -ENOSYS;
 	qid = make_kqid(current_user_ns(), type, id);
-	if (!qid_valid(qid))
+	if (!qid_has_mapping(sb->s_user_ns, qid))
 		return -EINVAL;
 	copy_from_if_dqblk(&fdq, &idq);
 	return sb->s_qcop->set_dqblk(sb, qid, &fdq);
@@ -581,7 +581,7 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id,
 	if (!sb->s_qcop->set_dqblk)
 		return -ENOSYS;
 	qid = make_kqid(current_user_ns(), type, id);
-	if (!qid_valid(qid))
+	if (!qid_has_mapping(sb->s_user_ns, qid))
 		return -EINVAL;
 	/* Are we actually setting timer / warning limits for all users? */
 	if (from_kqid(&init_user_ns, qid) == 0 &&
@@ -642,7 +642,7 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
 	if (!sb->s_qcop->get_dqblk)
 		return -ENOSYS;
 	qid = make_kqid(current_user_ns(), type, id);
-	if (!qid_valid(qid))
+	if (!qid_has_mapping(sb->s_user_ns, qid))
 		return -EINVAL;
 	ret = sb->s_qcop->get_dqblk(sb, qid, &qdq);
 	if (ret)
@@ -669,7 +669,7 @@ static int quota_getnextxquota(struct super_block *sb, int type, qid_t id,
 	if (!sb->s_qcop->get_nextdqblk)
 		return -ENOSYS;
 	qid = make_kqid(current_user_ns(), type, id);
-	if (!qid_valid(qid))
+	if (!qid_has_mapping(sb->s_user_ns, qid))
 		return -EINVAL;
 	ret = sb->s_qcop->get_nextdqblk(sb, &qid, &qdq);
 	if (ret)
diff --git a/include/linux/quota.h b/include/linux/quota.h
index 9dfb6bc..1db16ee 100644
--- a/include/linux/quota.h
+++ b/include/linux/quota.h
@@ -179,6 +179,16 @@ static inline struct kqid make_kqid_projid(kprojid_t projid)
 	return kqid;
 }
 
+/**
+ *	qid_has_mapping - Report if a qid maps into a user namespace.
+ *	@ns:  The user namespace to see if a value maps into.
+ *	@qid: The kernel internal quota identifier to test.
+ */
+static inline bool qid_has_mapping(struct user_namespace *ns, struct kqid qid)
+{
+	return from_kqid(ns, qid) != (qid_t) -1;
+}
+
 
 extern spinlock_t dq_data_lock;
 
-- 
cgit v0.10.2


From cfd4c70a18c4e806aaac2f483153dae01e0ace1c Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 5 Jul 2016 11:10:57 -0500
Subject: quota: Handle quota data stored in s_user_ns in quota_setxquota

In Q_XSETQLIMIT use sb->s_user_ns to detect when we are dealing with
the filesystems notion of id 0.

Cc: Jan Kara <jack@suse.cz>
Acked-by: Seth Forshee <seth.forshee@canonical.com>
Inspired-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>

diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 73f6f4c..35df08e 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -584,7 +584,7 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id,
 	if (!qid_has_mapping(sb->s_user_ns, qid))
 		return -EINVAL;
 	/* Are we actually setting timer / warning limits for all users? */
-	if (from_kqid(&init_user_ns, qid) == 0 &&
+	if (from_kqid(sb->s_user_ns, qid) == 0 &&
 	    fdq.d_fieldmask & (FS_DQ_WARNS_MASK | FS_DQ_TIMER_MASK)) {
 		struct qc_info qinfo;
 		int ret;
-- 
cgit v0.10.2


From 5c0048280babd579fa9e5f0e787122b06aee3f3b Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 5 Jul 2016 10:41:57 -0500
Subject: dquot: For now explicitly don't support filesystems outside of
 init_user_ns

Mostly supporting filesystems outside of init_user_ns is
s/&init_usre_ns/dquot->dq_sb->s_user_ns/.  An actual need for
supporting quotas on filesystems outside of s_user_ns is quite a ways
away and to be done responsibily needs an audit on what can happen
with hostile quota files.  Until that audit is complete don't attempt
to support quota files on filesystems outside of s_user_ns.

Cc: Jan Kara <jack@suse.cz>
Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 74706b6..87197d1 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2271,6 +2271,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
 		error = -EINVAL;
 		goto out_fmt;
 	}
+	/* Filesystems outside of init_user_ns not yet supported */
+	if (sb->s_user_ns != &init_user_ns) {
+		error = -EINVAL;
+		goto out_fmt;
+	}
 	/* Usage always has to be set... */
 	if (!(flags & DQUOT_USAGE_ENABLED)) {
 		error = -EINVAL;
-- 
cgit v0.10.2


From 0b3c9761d1e405514a551ed24d3ea89aea26ce14 Mon Sep 17 00:00:00 2001
From: Seth Forshee <seth.forshee@canonical.com>
Date: Thu, 5 Feb 2015 10:44:50 -0600
Subject: evm: Translate user/group ids relative to s_user_ns when computing
 HMAC

The EVM HMAC should be calculated using the on disk user and
group ids, so the k[ug]ids in the inode must be translated
relative to the s_user_ns of the inode's super block.

Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>

diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c
index 30b6b7d0..11c1d30 100644
--- a/security/integrity/evm/evm_crypto.c
+++ b/security/integrity/evm/evm_crypto.c
@@ -151,8 +151,8 @@ static void hmac_add_misc(struct shash_desc *desc, struct inode *inode,
 	memset(&hmac_misc, 0, sizeof(hmac_misc));
 	hmac_misc.ino = inode->i_ino;
 	hmac_misc.generation = inode->i_generation;
-	hmac_misc.uid = from_kuid(&init_user_ns, inode->i_uid);
-	hmac_misc.gid = from_kgid(&init_user_ns, inode->i_gid);
+	hmac_misc.uid = from_kuid(inode->i_sb->s_user_ns, inode->i_uid);
+	hmac_misc.gid = from_kgid(inode->i_sb->s_user_ns, inode->i_gid);
 	hmac_misc.mode = inode->i_mode;
 	crypto_shash_update(desc, (const u8 *)&hmac_misc, sizeof(hmac_misc));
 	if (evm_hmac_attrs & EVM_ATTR_FSUUID)
-- 
cgit v0.10.2


From 81754357770ebd900801231e7bc8d151ddc00498 Mon Sep 17 00:00:00 2001
From: Seth Forshee <seth.forshee@canonical.com>
Date: Thu, 11 Dec 2014 10:15:45 -0600
Subject: fs: Update i_[ug]id_(read|write) to translate relative to s_user_ns

For filesystems mounted from a user namespace on-disk ids should
be translated relative to s_users_ns rather than init_user_ns.

When an id in the filesystem doesn't exist in s_user_ns the
associated id in the inode will be set to INVALID_[UG]ID, which
turns these into de facto "nobody" ids. This actually maps pretty
well into the way most code already works, and those places where
it didn't were fixed in previous patches. Moving forward vfs code
needs to be careful to handle instances where ids in inodes may
be invalid.

Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>

diff --git a/include/linux/fs.h b/include/linux/fs.h
index cb25ceb..8aa9b72 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -831,31 +831,6 @@ static inline void i_size_write(struct inode *inode, loff_t i_size)
 #endif
 }
 
-/* Helper functions so that in most cases filesystems will
- * not need to deal directly with kuid_t and kgid_t and can
- * instead deal with the raw numeric values that are stored
- * in the filesystem.
- */
-static inline uid_t i_uid_read(const struct inode *inode)
-{
-	return from_kuid(&init_user_ns, inode->i_uid);
-}
-
-static inline gid_t i_gid_read(const struct inode *inode)
-{
-	return from_kgid(&init_user_ns, inode->i_gid);
-}
-
-static inline void i_uid_write(struct inode *inode, uid_t uid)
-{
-	inode->i_uid = make_kuid(&init_user_ns, uid);
-}
-
-static inline void i_gid_write(struct inode *inode, gid_t gid)
-{
-	inode->i_gid = make_kgid(&init_user_ns, gid);
-}
-
 static inline unsigned iminor(const struct inode *inode)
 {
 	return MINOR(inode->i_rdev);
@@ -1461,6 +1436,31 @@ struct super_block {
 	struct list_head	s_inodes;	/* all inodes */
 };
 
+/* Helper functions so that in most cases filesystems will
+ * not need to deal directly with kuid_t and kgid_t and can
+ * instead deal with the raw numeric values that are stored
+ * in the filesystem.
+ */
+static inline uid_t i_uid_read(const struct inode *inode)
+{
+	return from_kuid(inode->i_sb->s_user_ns, inode->i_uid);
+}
+
+static inline gid_t i_gid_read(const struct inode *inode)
+{
+	return from_kgid(inode->i_sb->s_user_ns, inode->i_gid);
+}
+
+static inline void i_uid_write(struct inode *inode, uid_t uid)
+{
+	inode->i_uid = make_kuid(inode->i_sb->s_user_ns, uid);
+}
+
+static inline void i_gid_write(struct inode *inode, gid_t gid)
+{
+	inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid);
+}
+
 extern struct timespec current_fs_time(struct super_block *sb);
 
 /*
-- 
cgit v0.10.2


From aeaa4a79ff6a5ed912b7362f206cf8576fca538b Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 23 Jul 2016 11:20:44 -0500
Subject: fs: Call d_automount with the filesystems creds

Seth Forshee reported a mount regression in nfs autmounts
with "fs: Add user namespace member to struct super_block".

It turns out that the assumption that current->cred is something
reasonable during mount while necessary to improve support of
unprivileged mounts is wrong in the automount path.

To fix the existing filesystems override current->cred with the
init_cred before calling d_automount and restore current->cred after
d_automount completes.

To support unprivileged mounts would require a more nuanced cred
selection, so fail on unprivileged mounts for the time being.  As none
of the filesystems that currently set FS_USERNS_MOUNT implement
d_automount this check is only good for preventing future problems.

Fixes: 6e4eab577a0c ("fs: Add user namespace member to struct super_block")
Tested-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>

diff --git a/fs/namei.c b/fs/namei.c
index 629823f1..ef573df 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -36,6 +36,7 @@
 #include <linux/posix_acl.h>
 #include <linux/hash.h>
 #include <linux/bitops.h>
+#include <linux/init_task.h>
 #include <asm/uaccess.h>
 
 #include "internal.h"
@@ -1099,6 +1100,7 @@ static int follow_automount(struct path *path, struct nameidata *nd,
 			    bool *need_mntput)
 {
 	struct vfsmount *mnt;
+	const struct cred *old_cred;
 	int err;
 
 	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
@@ -1120,11 +1122,16 @@ static int follow_automount(struct path *path, struct nameidata *nd,
 	    path->dentry->d_inode)
 		return -EISDIR;
 
+	if (path->dentry->d_sb->s_user_ns != &init_user_ns)
+		return -EACCES;
+
 	nd->total_link_count++;
 	if (nd->total_link_count >= 40)
 		return -ELOOP;
 
+	old_cred = override_creds(&init_cred);
 	mnt = path->dentry->d_op->d_automount(path);
+	revert_creds(old_cred);
 	if (IS_ERR(mnt)) {
 		/*
 		 * The filesystem is allowed to return -EISDIR here to indicate
-- 
cgit v0.10.2