Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next

Pull networking updates from David Miller: Changes of note: 1) Allow to schedule ICMP packets in IPVS, from Alex Gartrell. 2) Provide FIB table ID in ipv4 route dumps just as ipv6 does, from David Ahern. 3) Allow the user to ask for the statistics to be filtered out of ipv4/ipv6 address netlink dumps. From Sowmini Varadhan. 4) More work to pass the network namespace context around deep into various packet path APIs, starting with the netfilter hooks. From Eric W Biederman. 5) Add layer 2 TX/RX checksum offloading to qeth driver, from Thomas Richter. 6) Use usec resolution for SYN/ACK RTTs in TCP, from Yuchung Cheng. 7) Support Very High Throughput in wireless MESH code, from Bob Copeland. 8) Allow setting the ageing_time in switchdev/rocker. From Scott Feldman. 9) Properly autoload L2TP type modules, from Stephen Hemminger. 10) Fix and enable offload features by default in 8139cp driver, from David Woodhouse. 11) Support both ipv4 and ipv6 sockets in a single vxlan device, from Jiri Benc. 12) Fix CWND limiting of thin streams in TCP, from Bendik Rønning Opstad. 13) Fix IPSEC flowcache overflows on large systems, from Steffen Klassert. 14) Convert bridging to track VLANs using rhashtable entries rather than a bitmap. From Nikolay Aleksandrov. 15) Make TCP listener handling completely lockless, this is a major accomplishment. Incoming request sockets now live in the established hash table just like any other socket too. From Eric Dumazet. 15) Provide more bridging attributes to netlink, from Nikolay Aleksandrov. 16) Use hash based algorithm for ipv4 multipath routing, this was very long overdue. From Peter Nørlund. 17) Several y2038 cures, mostly avoiding timespec. From Arnd Bergmann. 18) Allow non-root execution of EBPF programs, from Alexei Starovoitov. 19) Support SO_INCOMING_CPU as setsockopt, from Eric Dumazet. This influences the port binding selection logic used by SO_REUSEPORT. 20) Add ipv6 support to VRF, from David Ahern. 21) Add support for Mellanox Spectrum switch ASIC, from Jiri Pirko. 22) Add rtl8xxxu Realtek wireless driver, from Jes Sorensen. 23) Implement RACK loss recovery in TCP, from Yuchung Cheng. 24) Support multipath routes in MPLS, from Roopa Prabhu. 25) Fix POLLOUT notification for listening sockets in AF_UNIX, from Eric Dumazet. 26) Add new QED Qlogic river, from Yuval Mintz, Manish Chopra, and Sudarsana Kalluru. 27) Don't fetch timestamps on AF_UNIX sockets, from Hannes Frederic Sowa. 28) Support ipv6 geneve tunnels, from John W Linville. 29) Add flood control support to switchdev layer, from Ido Schimmel. 30) Fix CHECKSUM_PARTIAL handling of potentially fragmented frames, from Hannes Frederic Sowa. 31) Support persistent maps and progs in bpf, from Daniel Borkmann. * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1790 commits) sh_eth: use DMA barriers switchdev: respect SKIP_EOPNOTSUPP flag in case there is no recursion net: sched: kill dead code in sch_choke.c irda: Delete an unnecessary check before the function call "irlmp_unregister_service" net: dsa: mv88e6xxx: include DSA ports in VLANs net: dsa: mv88e6xxx: disable SA learning for DSA and CPU ports net/core: fix for_each_netdev_feature vlan: Invoke driver vlan hooks only if device is present arcnet/com20020: add LEDS_CLASS dependency bpf, verifier: annotate verbose printer with __printf dp83640: Only wait for timestamps for packets with timestamping enabled. ptp: Change ptp_class to a proper bitmask dp83640: Prune rx timestamp list before reading from it dp83640: Delay scheduled work. dp83640: Include hash in timestamp/packet matching ipv6: fix tunnel error handling net/mlx5e: Fix LSO vlan insertion net/mlx5e: Re-eanble client vlan TX acceleration net/mlx5e: Return error in case mlx5e_set_features() fails net/mlx5e: Don't allow more than max supported channels ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2015-11-04 17:41:05 (GMT)
committer: Linus Torvalds <torvalds@linux-foundation.org> 2015-11-04 17:41:05 (GMT)
commit: b0f85fa11aefc4f3e03306b4cd47f113bd57dcba (patch)
tree: 1333d36d99fde3f97210795941fc246f0ad08a75 /kernel
parent: ccc9d4a6d640cbde05d519edeb727881646cf71b (diff)
parent: f32bfb9a8ca083f8d148ea90ae5ba66f4831836e (diff)
download: linux-b0f85fa11aefc4f3e03306b4cd47f113bd57dcba.tar.xz
14 files changed, 856 insertions, 84 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e6983be..1327258 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,2 +1,4 @@
 obj-y := core.o
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o
+
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 29ace10..3f4c99e 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/filter.h>
+#include <linux/perf_event.h>
 
 /* Called from syscall */
 static struct bpf_map *array_map_alloc(union bpf_attr *attr)
@@ -48,7 +49,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	array->map.key_size = attr->key_size;
 	array->map.value_size = attr->value_size;
 	array->map.max_entries = attr->max_entries;
-
+	array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;
 	array->elem_size = elem_size;
 
 	return &array->map;
@@ -291,14 +292,23 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
 
 	attr = perf_event_attrs(event);
 	if (IS_ERR(attr))
-		return (void *)attr;
+		goto err;
 
-	if (attr->type != PERF_TYPE_RAW &&
-	    attr->type != PERF_TYPE_HARDWARE) {
-		perf_event_release_kernel(event);
-		return ERR_PTR(-EINVAL);
-	}
-	return event;
+	if (attr->inherit)
+		goto err;
+
+	if (attr->type == PERF_TYPE_RAW)
+		return event;
+
+	if (attr->type == PERF_TYPE_HARDWARE)
+		return event;
+
+	if (attr->type == PERF_TYPE_SOFTWARE &&
+	    attr->config == PERF_COUNT_SW_BPF_OUTPUT)
+		return event;
+err:
+	perf_event_release_kernel(event);
+	return ERR_PTR(-EINVAL);
 }
 
 static void perf_event_fd_array_put_ptr(void *ptr)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 67c380c..334b1bd 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -82,6 +82,8 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
 	if (fp == NULL)
 		return NULL;
 
+	kmemcheck_annotate_bitfield(fp, meta);
+
 	aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
 	if (aux == NULL) {
 		vfree(fp);
@@ -90,6 +92,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
 
 	fp->pages = size / PAGE_SIZE;
 	fp->aux = aux;
+	fp->aux->prog = fp;
 
 	return fp;
 }
@@ -110,8 +113,11 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
 
 	fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
 	if (fp != NULL) {
+		kmemcheck_annotate_bitfield(fp, meta);
+
 		memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
 		fp->pages = size / PAGE_SIZE;
+		fp->aux->prog = fp;
 
 		/* We keep fp->aux from fp_old around in the new
 		 * reallocated structure.
@@ -722,11 +728,36 @@ void bpf_prog_free(struct bpf_prog *fp)
 	struct bpf_prog_aux *aux = fp->aux;
 
 	INIT_WORK(&aux->work, bpf_prog_free_deferred);
-	aux->prog = fp;
 	schedule_work(&aux->work);
 }
 EXPORT_SYMBOL_GPL(bpf_prog_free);
 
+/* RNG for unpriviledged user space with separated state from prandom_u32(). */
+static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state);
+
+void bpf_user_rnd_init_once(void)
+{
+	prandom_init_once(&bpf_user_rnd_state);
+}
+
+u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	/* Should someone ever have the rather unwise idea to use some
+	 * of the registers passed into this function, then note that
+	 * this function is called from native eBPF and classic-to-eBPF
+	 * transformations. Register assignments from both sides are
+	 * different, f.e. classic always sets fn(ctx, A, X) here.
+	 */
+	struct rnd_state *state;
+	u32 res;
+
+	state = &get_cpu_var(bpf_user_rnd_state);
+	res = prandom_u32_state(state);
+	put_cpu_var(state);
+
+	return res;
+}
+
 /* Weak definitions of helper functions in case we don't have bpf syscall. */
 const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
 const struct bpf_func_proto bpf_map_update_elem_proto __weak;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 83c209d..19909b2 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -17,7 +17,7 @@
 struct bpf_htab {
 	struct bpf_map map;
 	struct hlist_head *buckets;
-	spinlock_t lock;
+	raw_spinlock_t lock;
 	u32 count;	/* number of elements in this hashtable */
 	u32 n_buckets;	/* number of hash buckets */
 	u32 elem_size;	/* size of each element in bytes */
@@ -82,12 +82,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	for (i = 0; i < htab->n_buckets; i++)
 		INIT_HLIST_HEAD(&htab->buckets[i]);
 
-	spin_lock_init(&htab->lock);
+	raw_spin_lock_init(&htab->lock);
 	htab->count = 0;
 
 	htab->elem_size = sizeof(struct htab_elem) +
 			  round_up(htab->map.key_size, 8) +
 			  htab->map.value_size;
+
+	htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) +
+				   htab->elem_size * htab->map.max_entries,
+				   PAGE_SIZE) >> PAGE_SHIFT;
 	return &htab->map;
 
 free_htab:
@@ -230,7 +234,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	l_new->hash = htab_map_hash(l_new->key, key_size);
 
 	/* bpf_map_update_elem() can be called in_irq() */
-	spin_lock_irqsave(&htab->lock, flags);
+	raw_spin_lock_irqsave(&htab->lock, flags);
 
 	head = select_bucket(htab, l_new->hash);
 
@@ -266,11 +270,11 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	} else {
 		htab->count++;
 	}
-	spin_unlock_irqrestore(&htab->lock, flags);
+	raw_spin_unlock_irqrestore(&htab->lock, flags);
 
 	return 0;
 err:
-	spin_unlock_irqrestore(&htab->lock, flags);
+	raw_spin_unlock_irqrestore(&htab->lock, flags);
 	kfree(l_new);
 	return ret;
 }
@@ -291,7 +295,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 
 	hash = htab_map_hash(key, key_size);
 
-	spin_lock_irqsave(&htab->lock, flags);
+	raw_spin_lock_irqsave(&htab->lock, flags);
 
 	head = select_bucket(htab, hash);
 
@@ -304,7 +308,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 		ret = 0;
 	}
 
-	spin_unlock_irqrestore(&htab->lock, flags);
+	raw_spin_unlock_irqrestore(&htab->lock, flags);
 	return ret;
 }
 
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1447ec0..4504ca6 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -93,13 +93,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto = {
 	.arg2_type	= ARG_PTR_TO_MAP_KEY,
 };
 
-static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
-{
-	return prandom_u32();
-}
-
 const struct bpf_func_proto bpf_get_prandom_u32_proto = {
-	.func		= bpf_get_prandom_u32,
+	.func		= bpf_user_rnd_u32,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 };
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
new file mode 100644
index 0000000..be6d726
--- /dev/null
+++ b/kernel/bpf/inode.c
@@ -0,0 +1,387 @@
+/*
+ * Minimal file system backend for holding eBPF maps and programs,
+ * used by bpf(2) object pinning.
+ *
+ * Authors:
+ *
+ *	Daniel Borkmann <daniel@iogearbox.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/magic.h>
+#include <linux/major.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/fs.h>
+#include <linux/kdev_t.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+
+enum bpf_type {
+	BPF_TYPE_UNSPEC	= 0,
+	BPF_TYPE_PROG,
+	BPF_TYPE_MAP,
+};
+
+static void *bpf_any_get(void *raw, enum bpf_type type)
+{
+	switch (type) {
+	case BPF_TYPE_PROG:
+		atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt);
+		break;
+	case BPF_TYPE_MAP:
+		atomic_inc(&((struct bpf_map *)raw)->refcnt);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		break;
+	}
+
+	return raw;
+}
+
+static void bpf_any_put(void *raw, enum bpf_type type)
+{
+	switch (type) {
+	case BPF_TYPE_PROG:
+		bpf_prog_put(raw);
+		break;
+	case BPF_TYPE_MAP:
+		bpf_map_put(raw);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		break;
+	}
+}
+
+static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type)
+{
+	void *raw;
+
+	*type = BPF_TYPE_MAP;
+	raw = bpf_map_get(ufd);
+	if (IS_ERR(raw)) {
+		*type = BPF_TYPE_PROG;
+		raw = bpf_prog_get(ufd);
+	}
+
+	return raw;
+}
+
+static const struct inode_operations bpf_dir_iops;
+
+static const struct inode_operations bpf_prog_iops = { };
+static const struct inode_operations bpf_map_iops  = { };
+
+static struct inode *bpf_get_inode(struct super_block *sb,
+				   const struct inode *dir,
+				   umode_t mode)
+{
+	struct inode *inode;
+
+	switch (mode & S_IFMT) {
+	case S_IFDIR:
+	case S_IFREG:
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+
+	inode = new_inode(sb);
+	if (!inode)
+		return ERR_PTR(-ENOSPC);
+
+	inode->i_ino = get_next_ino();
+	inode->i_atime = CURRENT_TIME;
+	inode->i_mtime = inode->i_atime;
+	inode->i_ctime = inode->i_atime;
+
+	inode_init_owner(inode, dir, mode);
+
+	return inode;
+}
+
+static int bpf_inode_type(const struct inode *inode, enum bpf_type *type)
+{
+	*type = BPF_TYPE_UNSPEC;
+	if (inode->i_op == &bpf_prog_iops)
+		*type = BPF_TYPE_PROG;
+	else if (inode->i_op == &bpf_map_iops)
+		*type = BPF_TYPE_MAP;
+	else
+		return -EACCES;
+
+	return 0;
+}
+
+static bool bpf_dname_reserved(const struct dentry *dentry)
+{
+	return strchr(dentry->d_name.name, '.');
+}
+
+static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct inode *inode;
+
+	if (bpf_dname_reserved(dentry))
+		return -EPERM;
+
+	inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_op = &bpf_dir_iops;
+	inode->i_fop = &simple_dir_operations;
+
+	inc_nlink(inode);
+	inc_nlink(dir);
+
+	d_instantiate(dentry, inode);
+	dget(dentry);
+
+	return 0;
+}
+
+static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry,
+			 umode_t mode, const struct inode_operations *iops)
+{
+	struct inode *inode;
+
+	if (bpf_dname_reserved(dentry))
+		return -EPERM;
+
+	inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_op = iops;
+	inode->i_private = dentry->d_fsdata;
+
+	d_instantiate(dentry, inode);
+	dget(dentry);
+
+	return 0;
+}
+
+static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode,
+		     dev_t devt)
+{
+	enum bpf_type type = MINOR(devt);
+
+	if (MAJOR(devt) != UNNAMED_MAJOR || !S_ISREG(mode) ||
+	    dentry->d_fsdata == NULL)
+		return -EPERM;
+
+	switch (type) {
+	case BPF_TYPE_PROG:
+		return bpf_mkobj_ops(dir, dentry, mode, &bpf_prog_iops);
+	case BPF_TYPE_MAP:
+		return bpf_mkobj_ops(dir, dentry, mode, &bpf_map_iops);
+	default:
+		return -EPERM;
+	}
+}
+
+static const struct inode_operations bpf_dir_iops = {
+	.lookup		= simple_lookup,
+	.mknod		= bpf_mkobj,
+	.mkdir		= bpf_mkdir,
+	.rmdir		= simple_rmdir,
+	.unlink		= simple_unlink,
+};
+
+static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
+			  enum bpf_type type)
+{
+	struct dentry *dentry;
+	struct inode *dir;
+	struct path path;
+	umode_t mode;
+	dev_t devt;
+	int ret;
+
+	dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+
+	mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
+	devt = MKDEV(UNNAMED_MAJOR, type);
+
+	ret = security_path_mknod(&path, dentry, mode, devt);
+	if (ret)
+		goto out;
+
+	dir = d_inode(path.dentry);
+	if (dir->i_op != &bpf_dir_iops) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	dentry->d_fsdata = raw;
+	ret = vfs_mknod(dir, dentry, mode, devt);
+	dentry->d_fsdata = NULL;
+out:
+	done_path_create(&path, dentry);
+	return ret;
+}
+
+int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
+{
+	struct filename *pname;
+	enum bpf_type type;
+	void *raw;
+	int ret;
+
+	pname = getname(pathname);
+	if (IS_ERR(pname))
+		return PTR_ERR(pname);
+
+	raw = bpf_fd_probe_obj(ufd, &type);
+	if (IS_ERR(raw)) {
+		ret = PTR_ERR(raw);
+		goto out;
+	}
+
+	ret = bpf_obj_do_pin(pname, raw, type);
+	if (ret != 0)
+		bpf_any_put(raw, type);
+out:
+	putname(pname);
+	return ret;
+}
+
+static void *bpf_obj_do_get(const struct filename *pathname,
+			    enum bpf_type *type)
+{
+	struct inode *inode;
+	struct path path;
+	void *raw;
+	int ret;
+
+	ret = kern_path(pathname->name, LOOKUP_FOLLOW, &path);
+	if (ret)
+		return ERR_PTR(ret);
+
+	inode = d_backing_inode(path.dentry);
+	ret = inode_permission(inode, MAY_WRITE);
+	if (ret)
+		goto out;
+
+	ret = bpf_inode_type(inode, type);
+	if (ret)
+		goto out;
+
+	raw = bpf_any_get(inode->i_private, *type);
+	touch_atime(&path);
+
+	path_put(&path);
+	return raw;
+out:
+	path_put(&path);
+	return ERR_PTR(ret);
+}
+
+int bpf_obj_get_user(const char __user *pathname)
+{
+	enum bpf_type type = BPF_TYPE_UNSPEC;
+	struct filename *pname;
+	int ret = -ENOENT;
+	void *raw;
+
+	pname = getname(pathname);
+	if (IS_ERR(pname))
+		return PTR_ERR(pname);
+
+	raw = bpf_obj_do_get(pname, &type);
+	if (IS_ERR(raw)) {
+		ret = PTR_ERR(raw);
+		goto out;
+	}
+
+	if (type == BPF_TYPE_PROG)
+		ret = bpf_prog_new_fd(raw);
+	else if (type == BPF_TYPE_MAP)
+		ret = bpf_map_new_fd(raw);
+	else
+		goto out;
+
+	if (ret < 0)
+		bpf_any_put(raw, type);
+out:
+	putname(pname);
+	return ret;
+}
+
+static void bpf_evict_inode(struct inode *inode)
+{
+	enum bpf_type type;
+
+	truncate_inode_pages_final(&inode->i_data);
+	clear_inode(inode);
+
+	if (!bpf_inode_type(inode, &type))
+		bpf_any_put(inode->i_private, type);
+}
+
+static const struct super_operations bpf_super_ops = {
+	.statfs		= simple_statfs,
+	.drop_inode	= generic_delete_inode,
+	.evict_inode	= bpf_evict_inode,
+};
+
+static int bpf_fill_super(struct super_block *sb, void *data, int silent)
+{
+	static struct tree_descr bpf_rfiles[] = { { "" } };
+	struct inode *inode;
+	int ret;
+
+	ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
+	if (ret)
+		return ret;
+
+	sb->s_op = &bpf_super_ops;
+
+	inode = sb->s_root->d_inode;
+	inode->i_op = &bpf_dir_iops;
+	inode->i_mode &= ~S_IALLUGO;
+	inode->i_mode |= S_ISVTX | S_IRWXUGO;
+
+	return 0;
+}
+
+static struct dentry *bpf_mount(struct file_system_type *type, int flags,
+				const char *dev_name, void *data)
+{
+	return mount_ns(type, flags, current->nsproxy->mnt_ns, bpf_fill_super);
+}
+
+static struct file_system_type bpf_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "bpf",
+	.mount		= bpf_mount,
+	.kill_sb	= kill_litter_super,
+	.fs_flags	= FS_USERNS_MOUNT,
+};
+
+MODULE_ALIAS_FS("bpf");
+
+static int __init bpf_init(void)
+{
+	int ret;
+
+	ret = sysfs_create_mount_point(fs_kobj, "bpf");
+	if (ret)
+		return ret;
+
+	ret = register_filesystem(&bpf_fs_type);
+	if (ret)
+		sysfs_remove_mount_point(fs_kobj, "bpf");
+
+	return ret;
+}
+fs_initcall(bpf_init);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 35bac8e..0d3313d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -18,6 +18,8 @@
 #include <linux/filter.h>
 #include <linux/version.h>
 
+int sysctl_unprivileged_bpf_disabled __read_mostly;
+
 static LIST_HEAD(bpf_map_types);
 
 static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
@@ -44,11 +46,38 @@ void bpf_register_map_type(struct bpf_map_type_list *tl)
 	list_add(&tl->list_node, &bpf_map_types);
 }
 
+static int bpf_map_charge_memlock(struct bpf_map *map)
+{
+	struct user_struct *user = get_current_user();
+	unsigned long memlock_limit;
+
+	memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+	atomic_long_add(map->pages, &user->locked_vm);
+
+	if (atomic_long_read(&user->locked_vm) > memlock_limit) {
+		atomic_long_sub(map->pages, &user->locked_vm);
+		free_uid(user);
+		return -EPERM;
+	}
+	map->user = user;
+	return 0;
+}
+
+static void bpf_map_uncharge_memlock(struct bpf_map *map)
+{
+	struct user_struct *user = map->user;
+
+	atomic_long_sub(map->pages, &user->locked_vm);
+	free_uid(user);
+}
+
 /* called from workqueue */
 static void bpf_map_free_deferred(struct work_struct *work)
 {
 	struct bpf_map *map = container_of(work, struct bpf_map, work);
 
+	bpf_map_uncharge_memlock(map);
 	/* implementation dependent freeing */
 	map->ops->map_free(map);
 }
@@ -82,6 +111,12 @@ static const struct file_operations bpf_map_fops = {
 	.release = bpf_map_release,
 };
 
+int bpf_map_new_fd(struct bpf_map *map)
+{
+	return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
+				O_RDWR | O_CLOEXEC);
+}
+
 /* helper macro to check that unused fields 'union bpf_attr' are zero */
 #define CHECK_ATTR(CMD) \
 	memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
@@ -108,8 +143,11 @@ static int map_create(union bpf_attr *attr)
 
 	atomic_set(&map->refcnt, 1);
 
-	err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
+	err = bpf_map_charge_memlock(map);
+	if (err)
+		goto free_map;
 
+	err = bpf_map_new_fd(map);
 	if (err < 0)
 		/* failed to allocate fd */
 		goto free_map;
@@ -124,19 +162,29 @@ free_map:
 /* if error is returned, fd is released.
  * On success caller should complete fd access with matching fdput()
  */
-struct bpf_map *bpf_map_get(struct fd f)
+struct bpf_map *__bpf_map_get(struct fd f)
 {
-	struct bpf_map *map;
-
 	if (!f.file)
 		return ERR_PTR(-EBADF);
-
 	if (f.file->f_op != &bpf_map_fops) {
 		fdput(f);
 		return ERR_PTR(-EINVAL);
 	}
 
-	map = f.file->private_data;
+	return f.file->private_data;
+}
+
+struct bpf_map *bpf_map_get(u32 ufd)
+{
+	struct fd f = fdget(ufd);
+	struct bpf_map *map;
+
+	map = __bpf_map_get(f);
+	if (IS_ERR(map))
+		return map;
+
+	atomic_inc(&map->refcnt);
+	fdput(f);
 
 	return map;
 }
@@ -164,7 +212,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 		return -EINVAL;
 
 	f = fdget(ufd);
-	map = bpf_map_get(f);
+	map = __bpf_map_get(f);
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
@@ -223,7 +271,7 @@ static int map_update_elem(union bpf_attr *attr)
 		return -EINVAL;
 
 	f = fdget(ufd);
-	map = bpf_map_get(f);
+	map = __bpf_map_get(f);
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
@@ -276,7 +324,7 @@ static int map_delete_elem(union bpf_attr *attr)
 		return -EINVAL;
 
 	f = fdget(ufd);
-	map = bpf_map_get(f);
+	map = __bpf_map_get(f);
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
@@ -317,7 +365,7 @@ static int map_get_next_key(union bpf_attr *attr)
 		return -EINVAL;
 
 	f = fdget(ufd);
-	map = bpf_map_get(f);
+	map = __bpf_map_get(f);
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
@@ -402,6 +450,10 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
 			 */
 			BUG_ON(!prog->aux->ops->get_func_proto);
 
+			if (insn->imm == BPF_FUNC_get_route_realm)
+				prog->dst_needed = 1;
+			if (insn->imm == BPF_FUNC_get_prandom_u32)
+				bpf_user_rnd_init_once();
 			if (insn->imm == BPF_FUNC_tail_call) {
 				/* mark bpf_tail_call as different opcode
 				 * to avoid conditional branch in
@@ -436,29 +488,51 @@ static void free_used_maps(struct bpf_prog_aux *aux)
 	kfree(aux->used_maps);
 }
 
-static void __prog_put_rcu(struct rcu_head *rcu)
+static int bpf_prog_charge_memlock(struct bpf_prog *prog)
+{
+	struct user_struct *user = get_current_user();
+	unsigned long memlock_limit;
+
+	memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+	atomic_long_add(prog->pages, &user->locked_vm);
+	if (atomic_long_read(&user->locked_vm) > memlock_limit) {
+		atomic_long_sub(prog->pages, &user->locked_vm);
+		free_uid(user);
+		return -EPERM;
+	}
+	prog->aux->user = user;
+	return 0;
+}
+
+static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
+{
+	struct user_struct *user = prog->aux->user;
+
+	atomic_long_sub(prog->pages, &user->locked_vm);
+	free_uid(user);
+}
+
+static void __prog_put_common(struct rcu_head *rcu)
 {
 	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
 
 	free_used_maps(aux);
+	bpf_prog_uncharge_memlock(aux->prog);
 	bpf_prog_free(aux->prog);
 }
 
 /* version of bpf_prog_put() that is called after a grace period */
 void bpf_prog_put_rcu(struct bpf_prog *prog)
 {
-	if (atomic_dec_and_test(&prog->aux->refcnt)) {
-		prog->aux->prog = prog;
-		call_rcu(&prog->aux->rcu, __prog_put_rcu);
-	}
+	if (atomic_dec_and_test(&prog->aux->refcnt))
+		call_rcu(&prog->aux->rcu, __prog_put_common);
 }
 
 void bpf_prog_put(struct bpf_prog *prog)
 {
-	if (atomic_dec_and_test(&prog->aux->refcnt)) {
-		free_used_maps(prog->aux);
-		bpf_prog_free(prog);
-	}
+	if (atomic_dec_and_test(&prog->aux->refcnt))
+		__prog_put_common(&prog->aux->rcu);
 }
 EXPORT_SYMBOL_GPL(bpf_prog_put);
 
@@ -474,21 +548,22 @@ static const struct file_operations bpf_prog_fops = {
         .release = bpf_prog_release,
 };
 
-static struct bpf_prog *get_prog(struct fd f)
+int bpf_prog_new_fd(struct bpf_prog *prog)
 {
-	struct bpf_prog *prog;
+	return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
+				O_RDWR | O_CLOEXEC);
+}
 
+static struct bpf_prog *__bpf_prog_get(struct fd f)
+{
 	if (!f.file)
 		return ERR_PTR(-EBADF);
-
 	if (f.file->f_op != &bpf_prog_fops) {
 		fdput(f);
 		return ERR_PTR(-EINVAL);
 	}
 
-	prog = f.file->private_data;
-
-	return prog;
+	return f.file->private_data;
 }
 
 /* called by sockets/tracing/seccomp before attaching program to an event
@@ -499,13 +574,13 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
 	struct fd f = fdget(ufd);
 	struct bpf_prog *prog;
 
-	prog = get_prog(f);
-
+	prog = __bpf_prog_get(f);
 	if (IS_ERR(prog))
 		return prog;
 
 	atomic_inc(&prog->aux->refcnt);
 	fdput(f);
+
 	return prog;
 }
 EXPORT_SYMBOL_GPL(bpf_prog_get);
@@ -540,11 +615,18 @@ static int bpf_prog_load(union bpf_attr *attr)
 	    attr->kern_version != LINUX_VERSION_CODE)
 		return -EINVAL;
 
+	if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	/* plain bpf_prog allocation */
 	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
 	if (!prog)
 		return -ENOMEM;
 
+	err = bpf_prog_charge_memlock(prog);
+	if (err)
+		goto free_prog_nouncharge;
+
 	prog->len = attr->insn_cnt;
 
 	err = -EFAULT;
@@ -553,10 +635,10 @@ static int bpf_prog_load(union bpf_attr *attr)
 		goto free_prog;
 
 	prog->orig_prog = NULL;
-	prog->jited = false;
+	prog->jited = 0;
 
 	atomic_set(&prog->aux->refcnt, 1);
-	prog->gpl_compatible = is_gpl;
+	prog->gpl_compatible = is_gpl ? 1 : 0;
 
 	/* find program type: socket_filter vs tracing_filter */
 	err = find_prog_type(type, prog);
@@ -576,7 +658,7 @@ static int bpf_prog_load(union bpf_attr *attr)
 	if (err < 0)
 		goto free_used_maps;
 
-	err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
+	err = bpf_prog_new_fd(prog);
 	if (err < 0)
 		/* failed to allocate fd */
 		goto free_used_maps;
@@ -586,20 +668,36 @@ static int bpf_prog_load(union bpf_attr *attr)
 free_used_maps:
 	free_used_maps(prog->aux);
 free_prog:
+	bpf_prog_uncharge_memlock(prog);
+free_prog_nouncharge:
 	bpf_prog_free(prog);
 	return err;
 }
 
+#define BPF_OBJ_LAST_FIELD bpf_fd
+
+static int bpf_obj_pin(const union bpf_attr *attr)
+{
+	if (CHECK_ATTR(BPF_OBJ))
+		return -EINVAL;
+
+	return bpf_obj_pin_user(attr->bpf_fd, u64_to_ptr(attr->pathname));
+}
+
+static int bpf_obj_get(const union bpf_attr *attr)
+{
+	if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0)
+		return -EINVAL;
+
+	return bpf_obj_get_user(u64_to_ptr(attr->pathname));
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
 	int err;
 
-	/* the syscall is limited to root temporarily. This restriction will be
-	 * lifted when security audit is clean. Note that eBPF+tracing must have
-	 * this restriction, since it may pass kernel data to user space
-	 */
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled)
 		return -EPERM;
 
 	if (!access_ok(VERIFY_READ, uattr, 1))
@@ -654,6 +752,12 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_PROG_LOAD:
 		err = bpf_prog_load(&attr);
 		break;
+	case BPF_OBJ_PIN:
+		err = bpf_obj_pin(&attr);
+		break;
+	case BPF_OBJ_GET:
+		err = bpf_obj_get(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b074b23..c607305 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -199,6 +199,7 @@ struct verifier_env {
 	struct verifier_state_list **explored_states; /* search pruning optimization */
 	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
 	u32 used_map_cnt;		/* number of used maps */
+	bool allow_ptr_leaks;
 };
 
 /* verbose verifier prints what it's seeing
@@ -213,7 +214,7 @@ static DEFINE_MUTEX(bpf_verifier_lock);
  * verbose() is used to dump the verification trace to the log, so the user
  * can figure out what's wrong with the program
  */
-static void verbose(const char *fmt, ...)
+static __printf(1, 2) void verbose(const char *fmt, ...)
 {
 	va_list args;
 
@@ -244,6 +245,7 @@ static const struct {
 } func_limit[] = {
 	{BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
 	{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
+	{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output},
 };
 
 static void print_verifier_state(struct verifier_env *env)
@@ -538,6 +540,21 @@ static int bpf_size_to_bytes(int bpf_size)
 		return -EINVAL;
 }
 
+static bool is_spillable_regtype(enum bpf_reg_type type)
+{
+	switch (type) {
+	case PTR_TO_MAP_VALUE:
+	case PTR_TO_MAP_VALUE_OR_NULL:
+	case PTR_TO_STACK:
+	case PTR_TO_CTX:
+	case FRAME_PTR:
+	case CONST_PTR_TO_MAP:
+		return true;
+	default:
+		return false;
+	}
+}
+
 /* check_stack_read/write functions track spill/fill of registers,
  * stack boundary and alignment are checked in check_mem_access()
  */
@@ -550,9 +567,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
 	 */
 
 	if (value_regno >= 0 &&
-	    (state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
-	     state->regs[value_regno].type == PTR_TO_STACK ||
-	     state->regs[value_regno].type == PTR_TO_CTX)) {
+	    is_spillable_regtype(state->regs[value_regno].type)) {
 
 		/* register containing pointer is being spilled into stack */
 		if (size != BPF_REG_SIZE) {
@@ -643,6 +658,20 @@ static int check_ctx_access(struct verifier_env *env, int off, int size,
 	return -EACCES;
 }
 
+static bool is_pointer_value(struct verifier_env *env, int regno)
+{
+	if (env->allow_ptr_leaks)
+		return false;
+
+	switch (env->cur_state.regs[regno].type) {
+	case UNKNOWN_VALUE:
+	case CONST_IMM:
+		return false;
+	default:
+		return true;
+	}
+}
+
 /* check whether memory at (regno + off) is accessible for t = (read | write)
  * if t==write, value_regno is a register which value is stored into memory
  * if t==read, value_regno is a register which will receive the value from memory
@@ -669,11 +698,21 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 	}
 
 	if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
+		if (t == BPF_WRITE && value_regno >= 0 &&
+		    is_pointer_value(env, value_regno)) {
+			verbose("R%d leaks addr into map\n", value_regno);
+			return -EACCES;
+		}
 		err = check_map_access(env, regno, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown_value(state->regs, value_regno);
 
 	} else if (state->regs[regno].type == PTR_TO_CTX) {
+		if (t == BPF_WRITE && value_regno >= 0 &&
+		    is_pointer_value(env, value_regno)) {
+			verbose("R%d leaks addr into ctx\n", value_regno);
+			return -EACCES;
+		}
 		err = check_ctx_access(env, off, size, t);
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown_value(state->regs, value_regno);
@@ -684,10 +723,17 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 			verbose("invalid stack off=%d size=%d\n", off, size);
 			return -EACCES;
 		}
-		if (t == BPF_WRITE)
+		if (t == BPF_WRITE) {
+			if (!env->allow_ptr_leaks &&
+			    state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
+			    size != BPF_REG_SIZE) {
+				verbose("attempt to corrupt spilled pointer on stack\n");
+				return -EACCES;
+			}
 			err = check_stack_write(state, off, size, value_regno);
-		else
+		} else {
 			err = check_stack_read(state, off, size, value_regno);
+		}
 	} else {
 		verbose("R%d invalid mem access '%s'\n",
 			regno, reg_type_str[state->regs[regno].type]);
@@ -775,8 +821,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 		return -EACCES;
 	}
 
-	if (arg_type == ARG_ANYTHING)
+	if (arg_type == ARG_ANYTHING) {
+		if (is_pointer_value(env, regno)) {
+			verbose("R%d leaks addr into helper function\n", regno);
+			return -EACCES;
+		}
 		return 0;
+	}
 
 	if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
 	    arg_type == ARG_PTR_TO_MAP_VALUE) {
@@ -860,7 +911,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		 * don't allow any other map type to be passed into
 		 * the special func;
 		 */
-		if (bool_map != bool_func)
+		if (bool_func && bool_map != bool_func)
 			return -EINVAL;
 	}
 
@@ -950,8 +1001,9 @@ static int check_call(struct verifier_env *env, int func_id)
 }
 
 /* check validity of 32-bit and 64-bit arithmetic operations */
-static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
+static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
 {
+	struct reg_state *regs = env->cur_state.regs;
 	u8 opcode = BPF_OP(insn->code);
 	int err;
 
@@ -976,6 +1028,12 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
 		if (err)
 			return err;
 
+		if (is_pointer_value(env, insn->dst_reg)) {
+			verbose("R%d pointer arithmetic prohibited\n",
+				insn->dst_reg);
+			return -EACCES;
+		}
+
 		/* check dest operand */
 		err = check_reg_arg(regs, insn->dst_reg, DST_OP);
 		if (err)
@@ -1012,6 +1070,11 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
 				 */
 				regs[insn->dst_reg] = regs[insn->src_reg];
 			} else {
+				if (is_pointer_value(env, insn->src_reg)) {
+					verbose("R%d partial copy of pointer\n",
+						insn->src_reg);
+					return -EACCES;
+				}
 				regs[insn->dst_reg].type = UNKNOWN_VALUE;
 				regs[insn->dst_reg].map_ptr = NULL;
 			}
@@ -1061,8 +1124,18 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
 		/* pattern match 'bpf_add Rx, imm' instruction */
 		if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
 		    regs[insn->dst_reg].type == FRAME_PTR &&
-		    BPF_SRC(insn->code) == BPF_K)
+		    BPF_SRC(insn->code) == BPF_K) {
 			stack_relative = true;
+		} else if (is_pointer_value(env, insn->dst_reg)) {
+			verbose("R%d pointer arithmetic prohibited\n",
+				insn->dst_reg);
+			return -EACCES;
+		} else if (BPF_SRC(insn->code) == BPF_X &&
+			   is_pointer_value(env, insn->src_reg)) {
+			verbose("R%d pointer arithmetic prohibited\n",
+				insn->src_reg);
+			return -EACCES;
+		}
 
 		/* check dest operand */
 		err = check_reg_arg(regs, insn->dst_reg, DST_OP);
@@ -1101,6 +1174,12 @@ static int check_cond_jmp_op(struct verifier_env *env,
 		err = check_reg_arg(regs, insn->src_reg, SRC_OP);
 		if (err)
 			return err;
+
+		if (is_pointer_value(env, insn->src_reg)) {
+			verbose("R%d pointer comparison prohibited\n",
+				insn->src_reg);
+			return -EACCES;
+		}
 	} else {
 		if (insn->src_reg != BPF_REG_0) {
 			verbose("BPF_JMP uses reserved fields\n");
@@ -1155,6 +1234,9 @@ static int check_cond_jmp_op(struct verifier_env *env,
 			regs[insn->dst_reg].type = CONST_IMM;
 			regs[insn->dst_reg].imm = 0;
 		}
+	} else if (is_pointer_value(env, insn->dst_reg)) {
+		verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
+		return -EACCES;
 	} else if (BPF_SRC(insn->code) == BPF_K &&
 		   (opcode == BPF_JEQ || opcode == BPF_JNE)) {
 
@@ -1658,7 +1740,7 @@ static int do_check(struct verifier_env *env)
 		}
 
 		if (class == BPF_ALU || class == BPF_ALU64) {
-			err = check_alu_op(regs, insn);
+			err = check_alu_op(env, insn);
 			if (err)
 				return err;
 
@@ -1816,6 +1898,11 @@ static int do_check(struct verifier_env *env)
 				if (err)
 					return err;
 
+				if (is_pointer_value(env, BPF_REG_0)) {
+					verbose("R0 leaks addr as return value\n");
+					return -EACCES;
+				}
+
 process_bpf_exit:
 				insn_idx = pop_stack(env, &prev_insn_idx);
 				if (insn_idx < 0) {
@@ -1902,8 +1989,7 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
 			}
 
 			f = fdget(insn->imm);
-
-			map = bpf_map_get(f);
+			map = __bpf_map_get(f);
 			if (IS_ERR(map)) {
 				verbose("fd %d is not pointing to valid bpf_map\n",
 					insn->imm);
@@ -2024,7 +2110,7 @@ static int convert_ctx_accesses(struct verifier_env *env)
 
 		cnt = env->prog->aux->ops->
 			convert_ctx_access(type, insn->dst_reg, insn->src_reg,
-					   insn->off, insn_buf);
+					   insn->off, insn_buf, env->prog);
 		if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
 			verbose("bpf verifier is misconfigured\n");
 			return -EINVAL;
@@ -2144,6 +2230,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	if (ret < 0)
 		goto skip_full_check;
 
+	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
+
 	ret = do_check(env);
 
 skip_full_check:
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ea02109..39db20c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5375,9 +5375,15 @@ void perf_output_sample(struct perf_output_handle *handle,
 
 	if (sample_type & PERF_SAMPLE_RAW) {
 		if (data->raw) {
-			perf_output_put(handle, data->raw->size);
-			__output_copy(handle, data->raw->data,
-					   data->raw->size);
+			u32 raw_size = data->raw->size;
+			u32 real_size = round_up(raw_size + sizeof(u32),
+						 sizeof(u64)) - sizeof(u32);
+			u64 zero = 0;
+
+			perf_output_put(handle, real_size);
+			__output_copy(handle, data->raw->data, raw_size);
+			if (real_size - raw_size)
+				__output_copy(handle, &zero, real_size - raw_size);
 		} else {
 			struct {
 				u32	size;
@@ -5509,8 +5515,7 @@ void perf_prepare_sample(struct perf_event_header *header,
 		else
 			size += sizeof(u32);
 
-		WARN_ON_ONCE(size & (sizeof(u64)-1));
-		header->size += size;
+		header->size += round_up(size, sizeof(u64));
 	}
 
 	if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index a71175f..0eebaee 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1838,6 +1838,7 @@ void free_percpu_irq(unsigned int irq, void __percpu *dev_id)
 	kfree(__free_percpu_irq(irq, dev_id));
 	chip_bus_sync_unlock(desc);
 }
+EXPORT_SYMBOL_GPL(free_percpu_irq);
 
 /**
  *	setup_percpu_irq - setup a per-cpu interrupt
@@ -1867,9 +1868,10 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
  *	@devname: An ascii name for the claiming device
  *	@dev_id: A percpu cookie passed back to the handler function
  *
- *	This call allocates interrupt resources, but doesn't
- *	automatically enable the interrupt. It has to be done on each
- *	CPU using enable_percpu_irq().
+ *	This call allocates interrupt resources and enables the
+ *	interrupt on the local CPU. If the interrupt is supposed to be
+ *	enabled on other CPUs, it has to be done on each CPU using
+ *	enable_percpu_irq().
  *
  *	Dev_id must be globally unique. It is a per-cpu variable, and
  *	the handler gets called with the interrupted CPU's instance of
@@ -1908,6 +1910,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
 
 	return retval;
 }
+EXPORT_SYMBOL_GPL(request_percpu_irq);
 
 /**
  *	irq_get_irqchip_state - returns the irqchip state of a interrupt.
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 787320d..b760bae 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1016,6 +1016,11 @@ int ptrace_request(struct task_struct *child, long request,
 		break;
 	}
 #endif
+
+	case PTRACE_SECCOMP_GET_FILTER:
+		ret = seccomp_get_filter(child, addr, datavp);
+		break;
+
 	default:
 		break;
 	}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 5bd4779..580ac2d 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -347,6 +347,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
 {
 	struct seccomp_filter *sfilter;
 	int ret;
+	const bool save_orig = config_enabled(CONFIG_CHECKPOINT_RESTORE);
 
 	if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
 		return ERR_PTR(-EINVAL);
@@ -370,7 +371,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
 		return ERR_PTR(-ENOMEM);
 
 	ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
-					seccomp_check_filter);
+					seccomp_check_filter, save_orig);
 	if (ret < 0) {
 		kfree(sfilter);
 		return ERR_PTR(ret);
@@ -469,7 +470,7 @@ void get_seccomp_filter(struct task_struct *tsk)
 static inline void seccomp_filter_free(struct seccomp_filter *filter)
 {
 	if (filter) {
-		bpf_prog_free(filter->prog);
+		bpf_prog_destroy(filter->prog);
 		kfree(filter);
 	}
 }
@@ -867,3 +868,76 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
 	/* prctl interface doesn't have flags, so they are always zero. */
 	return do_seccomp(op, 0, uargs);
 }
+
+#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
+long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
+			void __user *data)
+{
+	struct seccomp_filter *filter;
+	struct sock_fprog_kern *fprog;
+	long ret;
+	unsigned long count = 0;
+
+	if (!capable(CAP_SYS_ADMIN) ||
+	    current->seccomp.mode != SECCOMP_MODE_DISABLED) {
+		return -EACCES;
+	}
+
+	spin_lock_irq(&task->sighand->siglock);
+	if (task->seccomp.mode != SECCOMP_MODE_FILTER) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	filter = task->seccomp.filter;
+	while (filter) {
+		filter = filter->prev;
+		count++;
+	}
+
+	if (filter_off >= count) {
+		ret = -ENOENT;
+		goto out;
+	}
+	count -= filter_off;
+
+	filter = task->seccomp.filter;
+	while (filter && count > 1) {
+		filter = filter->prev;
+		count--;
+	}
+
+	if (WARN_ON(count != 1 || !filter)) {
+		/* The filter tree shouldn't shrink while we're using it. */
+		ret = -ENOENT;
+		goto out;
+	}
+
+	fprog = filter->prog->orig_prog;
+	if (!fprog) {
+		/* This must be a new non-cBPF filter, since we save every
+		 * every cBPF filter's orig_prog above when
+		 * CONFIG_CHECKPOINT_RESTORE is enabled.
+		 */
+		ret = -EMEDIUMTYPE;
+		goto out;
+	}
+
+	ret = fprog->len;
+	if (!data)
+		goto out;
+
+	get_seccomp_filter(task);
+	spin_unlock_irq(&task->sighand->siglock);
+
+	if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog)))
+		ret = -EFAULT;
+
+	put_seccomp_filter(task);
+	return ret;
+
+out:
+	spin_unlock_irq(&task->sighand->siglock);
+	return ret;
+}
+#endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e69201d..96c856b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -64,6 +64,7 @@
 #include <linux/binfmts.h>
 #include <linux/sched/sysctl.h>
 #include <linux/kexec.h>
+#include <linux/bpf.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -1139,6 +1140,18 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= timer_migration_handler,
 	},
 #endif
+#ifdef CONFIG_BPF_SYSCALL
+	{
+		.procname	= "unprivileged_bpf_disabled",
+		.data		= &sysctl_unprivileged_bpf_disabled,
+		.maxlen		= sizeof(sysctl_unprivileged_bpf_disabled),
+		.mode		= 0644,
+		/* only handle a transition from default "0" to "1" */
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
+		.extra2		= &one,
+	},
+#endif
 	{ }
 };
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0fe96c7..4228fd3 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -199,6 +199,11 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
 	if (!event)
 		return -ENOENT;
 
+	/* make sure event is local and doesn't have pmu::count */
+	if (event->oncpu != smp_processor_id() ||
+	    event->pmu->count)
+		return -EINVAL;
+
 	/*
 	 * we don't know if the function is run successfully by the
 	 * return value. It can be judged in other places, such as
@@ -207,14 +212,58 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
 	return perf_event_read_local(event);
 }
 
-const struct bpf_func_proto bpf_perf_event_read_proto = {
+static const struct bpf_func_proto bpf_perf_event_read_proto = {
 	.func		= bpf_perf_event_read,
-	.gpl_only	= false,
+	.gpl_only	= true,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_CONST_MAP_PTR,
 	.arg2_type	= ARG_ANYTHING,
 };
 
+static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
+{
+	struct pt_regs *regs = (struct pt_regs *) (long) r1;
+	struct bpf_map *map = (struct bpf_map *) (long) r2;
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	void *data = (void *) (long) r4;
+	struct perf_sample_data sample_data;
+	struct perf_event *event;
+	struct perf_raw_record raw = {
+		.size = size,
+		.data = data,
+	};
+
+	if (unlikely(index >= array->map.max_entries))
+		return -E2BIG;
+
+	event = (struct perf_event *)array->ptrs[index];
+	if (unlikely(!event))
+		return -ENOENT;
+
+	if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
+		     event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
+		return -EINVAL;
+
+	if (unlikely(event->oncpu != smp_processor_id()))
+		return -EOPNOTSUPP;
+
+	perf_sample_data_init(&sample_data, 0, 0);
+	sample_data.raw = &raw;
+	perf_event_output(event, &sample_data, regs);
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_perf_event_output_proto = {
+	.func		= bpf_perf_event_output,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_STACK,
+	.arg5_type	= ARG_CONST_STACK_SIZE,
+};
+
 static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
@@ -242,6 +291,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_get_smp_processor_id_proto;
 	case BPF_FUNC_perf_event_read:
 		return &bpf_perf_event_read_proto;
+	case BPF_FUNC_perf_event_output:
+		return &bpf_perf_event_output_proto;
 	default:
 		return NULL;
 	}
author	Linus Torvalds <torvalds@linux-foundation.org>	2015-11-04 17:41:05 (GMT)
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-11-04 17:41:05 (GMT)
commit	b0f85fa11aefc4f3e03306b4cd47f113bd57dcba (patch)
tree	1333d36d99fde3f97210795941fc246f0ad08a75 /kernel
parent	ccc9d4a6d640cbde05d519edeb727881646cf71b (diff)
parent	f32bfb9a8ca083f8d148ea90ae5ba66f4831836e (diff)
download	linux-b0f85fa11aefc4f3e03306b4cd47f113bd57dcba.tar.xz