summaryrefslogtreecommitdiff
path: root/kernel/bpf
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/bpf')
-rw-r--r--kernel/bpf/arraymap.c113
-rw-r--r--kernel/bpf/core.c105
-rw-r--r--kernel/bpf/helpers.c105
-rw-r--r--kernel/bpf/syscall.c42
-rw-r--r--kernel/bpf/verifier.c54
5 files changed, 377 insertions, 42 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 8a66165..cb31229 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -14,12 +14,7 @@
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/mm.h>
-
-struct bpf_array {
- struct bpf_map map;
- u32 elem_size;
- char value[0] __aligned(8);
-};
+#include <linux/filter.h>
/* Called from syscall */
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
@@ -154,3 +149,109 @@ static int __init register_array_map(void)
return 0;
}
late_initcall(register_array_map);
+
+static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
+{
+ /* only bpf_prog file descriptors can be stored in prog_array map */
+ if (attr->value_size != sizeof(u32))
+ return ERR_PTR(-EINVAL);
+ return array_map_alloc(attr);
+}
+
+static void prog_array_map_free(struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ int i;
+
+ synchronize_rcu();
+
+ /* make sure it's empty */
+ for (i = 0; i < array->map.max_entries; i++)
+ BUG_ON(array->prog[i] != NULL);
+ kvfree(array);
+}
+
+static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return NULL;
+}
+
+/* only called from syscall */
+static int prog_array_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct bpf_prog *prog, *old_prog;
+ u32 index = *(u32 *)key, ufd;
+
+ if (map_flags != BPF_ANY)
+ return -EINVAL;
+
+ if (index >= array->map.max_entries)
+ return -E2BIG;
+
+ ufd = *(u32 *)value;
+ prog = bpf_prog_get(ufd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (!bpf_prog_array_compatible(array, prog)) {
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
+ old_prog = xchg(array->prog + index, prog);
+ if (old_prog)
+ bpf_prog_put_rcu(old_prog);
+
+ return 0;
+}
+
+static int prog_array_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct bpf_prog *old_prog;
+ u32 index = *(u32 *)key;
+
+ if (index >= array->map.max_entries)
+ return -E2BIG;
+
+ old_prog = xchg(array->prog + index, NULL);
+ if (old_prog) {
+ bpf_prog_put_rcu(old_prog);
+ return 0;
+ } else {
+ return -ENOENT;
+ }
+}
+
+/* decrement refcnt of all bpf_progs that are stored in this map */
+void bpf_prog_array_map_clear(struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ int i;
+
+ for (i = 0; i < array->map.max_entries; i++)
+ prog_array_map_delete_elem(map, &i);
+}
+
+static const struct bpf_map_ops prog_array_ops = {
+ .map_alloc = prog_array_map_alloc,
+ .map_free = prog_array_map_free,
+ .map_get_next_key = array_map_get_next_key,
+ .map_lookup_elem = prog_array_map_lookup_elem,
+ .map_update_elem = prog_array_map_update_elem,
+ .map_delete_elem = prog_array_map_delete_elem,
+};
+
+static struct bpf_map_type_list prog_array_type __read_mostly = {
+ .ops = &prog_array_ops,
+ .type = BPF_MAP_TYPE_PROG_ARRAY,
+};
+
+static int __init register_prog_array_map(void)
+{
+ bpf_register_map_type(&prog_array_type);
+ return 0;
+}
+late_initcall(register_prog_array_map);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 54f0e7f..c5bedc8 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -26,9 +26,10 @@
#include <linux/vmalloc.h>
#include <linux/random.h>
#include <linux/moduleloader.h>
-#include <asm/unaligned.h>
#include <linux/bpf.h>
+#include <asm/unaligned.h>
+
/* Registers */
#define BPF_R0 regs[BPF_REG_0]
#define BPF_R1 regs[BPF_REG_1]
@@ -62,6 +63,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
ptr = skb_network_header(skb) + k - SKF_NET_OFF;
else if (k >= SKF_LL_OFF)
ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
+
if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
return ptr;
@@ -244,6 +246,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
/* Call instruction */
[BPF_JMP | BPF_CALL] = &&JMP_CALL,
+ [BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL,
/* Jumps */
[BPF_JMP | BPF_JA] = &&JMP_JA,
[BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
@@ -286,6 +289,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
[BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
[BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
};
+ u32 tail_call_cnt = 0;
void *ptr;
int off;
@@ -431,6 +435,30 @@ select_insn:
BPF_R4, BPF_R5);
CONT;
+ JMP_TAIL_CALL: {
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct bpf_prog *prog;
+ u64 index = BPF_R3;
+
+ if (unlikely(index >= array->map.max_entries))
+ goto out;
+
+ if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
+ goto out;
+
+ tail_call_cnt++;
+
+ prog = READ_ONCE(array->prog[index]);
+ if (unlikely(!prog))
+ goto out;
+
+ ARG1 = BPF_R1;
+ insn = prog->insnsi;
+ goto select_insn;
+out:
+ CONT;
+ }
/* JMP */
JMP_JA:
insn += insn->off;
@@ -615,25 +643,63 @@ load_byte:
return 0;
}
-void __weak bpf_int_jit_compile(struct bpf_prog *prog)
+bool bpf_prog_array_compatible(struct bpf_array *array,
+ const struct bpf_prog *fp)
{
+ if (!array->owner_prog_type) {
+ /* There's no owner yet where we could check for
+ * compatibility.
+ */
+ array->owner_prog_type = fp->type;
+ array->owner_jited = fp->jited;
+
+ return true;
+ }
+
+ return array->owner_prog_type == fp->type &&
+ array->owner_jited == fp->jited;
+}
+
+static int bpf_check_tail_call(const struct bpf_prog *fp)
+{
+ struct bpf_prog_aux *aux = fp->aux;
+ int i;
+
+ for (i = 0; i < aux->used_map_cnt; i++) {
+ struct bpf_map *map = aux->used_maps[i];
+ struct bpf_array *array;
+
+ if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+ continue;
+
+ array = container_of(map, struct bpf_array, map);
+ if (!bpf_prog_array_compatible(array, fp))
+ return -EINVAL;
+ }
+
+ return 0;
}
/**
- * bpf_prog_select_runtime - select execution runtime for BPF program
+ * bpf_prog_select_runtime - select exec runtime for BPF program
* @fp: bpf_prog populated with internal BPF program
*
- * try to JIT internal BPF program, if JIT is not available select interpreter
- * BPF program will be executed via BPF_PROG_RUN() macro
+ * Try to JIT eBPF program, if JIT is not available, use interpreter.
+ * The BPF program will be executed via BPF_PROG_RUN() macro.
*/
-void bpf_prog_select_runtime(struct bpf_prog *fp)
+int bpf_prog_select_runtime(struct bpf_prog *fp)
{
fp->bpf_func = (void *) __bpf_prog_run;
- /* Probe if internal BPF can be JITed */
bpf_int_jit_compile(fp);
- /* Lock whole bpf_prog as read-only */
bpf_prog_lock_ro(fp);
+
+ /* The tail call compatibility check can only be done at
+ * this late stage as we need to determine, if we deal
+ * with JITed or non JITed program concatenations and not
+ * all eBPF JITs might immediately support all features.
+ */
+ return bpf_check_tail_call(fp);
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
@@ -663,6 +729,29 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
+const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
+const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
+const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
+const struct bpf_func_proto bpf_get_current_comm_proto __weak;
+const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
+{
+ return NULL;
+}
+
+/* Always built-in helper functions. */
+const struct bpf_func_proto bpf_tail_call_proto = {
+ .func = NULL,
+ .gpl_only = false,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+};
+
+/* For classic BPF JITs that don't implement bpf_int_jit_compile(). */
+void __weak bpf_int_jit_compile(struct bpf_prog *prog)
+{
+}
/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
* skb_copy_bits(), so provide a weak definition of it for NET-less config.
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index bd7f598..1447ec0 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -13,6 +13,9 @@
#include <linux/rcupdate.h>
#include <linux/random.h>
#include <linux/smp.h>
+#include <linux/ktime.h>
+#include <linux/sched.h>
+#include <linux/uidgid.h>
/* If kernel subsystem is allowing eBPF programs to call this function,
* inside its own verifier_ops->get_func_proto() callback it should return
@@ -44,11 +47,11 @@ static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
}
const struct bpf_func_proto bpf_map_lookup_elem_proto = {
- .func = bpf_map_lookup_elem,
- .gpl_only = false,
- .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_MAP_KEY,
+ .func = bpf_map_lookup_elem,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
};
static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
@@ -63,13 +66,13 @@ static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
}
const struct bpf_func_proto bpf_map_update_elem_proto = {
- .func = bpf_map_update_elem,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_MAP_KEY,
- .arg3_type = ARG_PTR_TO_MAP_VALUE,
- .arg4_type = ARG_ANYTHING,
+ .func = bpf_map_update_elem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+ .arg3_type = ARG_PTR_TO_MAP_VALUE,
+ .arg4_type = ARG_ANYTHING,
};
static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
@@ -83,11 +86,11 @@ static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
}
const struct bpf_func_proto bpf_map_delete_elem_proto = {
- .func = bpf_map_delete_elem,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_MAP_KEY,
+ .func = bpf_map_delete_elem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
};
static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
@@ -111,3 +114,71 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
};
+
+static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ /* NMI safe access to clock monotonic */
+ return ktime_get_mono_fast_ns();
+}
+
+const struct bpf_func_proto bpf_ktime_get_ns_proto = {
+ .func = bpf_ktime_get_ns,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+};
+
+static u64 bpf_get_current_pid_tgid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct task_struct *task = current;
+
+ if (!task)
+ return -EINVAL;
+
+ return (u64) task->tgid << 32 | task->pid;
+}
+
+const struct bpf_func_proto bpf_get_current_pid_tgid_proto = {
+ .func = bpf_get_current_pid_tgid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+static u64 bpf_get_current_uid_gid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct task_struct *task = current;
+ kuid_t uid;
+ kgid_t gid;
+
+ if (!task)
+ return -EINVAL;
+
+ current_uid_gid(&uid, &gid);
+ return (u64) from_kgid(&init_user_ns, gid) << 32 |
+ from_kuid(&init_user_ns, uid);
+}
+
+const struct bpf_func_proto bpf_get_current_uid_gid_proto = {
+ .func = bpf_get_current_uid_gid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5)
+{
+ struct task_struct *task = current;
+ char *buf = (char *) (long) r1;
+
+ if (!task)
+ return -EINVAL;
+
+ memcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm)));
+ return 0;
+}
+
+const struct bpf_func_proto bpf_get_current_comm_proto = {
+ .func = bpf_get_current_comm,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_STACK,
+ .arg2_type = ARG_CONST_STACK_SIZE,
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3bae6c5..a1b14d1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -68,6 +68,12 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
{
struct bpf_map *map = filp->private_data;
+ if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
+ /* prog_array stores refcnt-ed bpf_prog pointers
+ * release them all when user space closes prog_array_fd
+ */
+ bpf_prog_array_map_clear(map);
+
bpf_map_put(map);
return 0;
}
@@ -392,6 +398,19 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
*/
BUG_ON(!prog->aux->ops->get_func_proto);
+ if (insn->imm == BPF_FUNC_tail_call) {
+ /* mark bpf_tail_call as different opcode
+ * to avoid conditional branch in
+ * interpeter for every normal call
+ * and to prevent accidental JITing by
+ * JIT compiler that doesn't support
+ * bpf_tail_call yet
+ */
+ insn->imm = 0;
+ insn->code |= BPF_X;
+ continue;
+ }
+
fn = prog->aux->ops->get_func_proto(insn->imm);
/* all functions that have prototype and verifier allowed
* programs to call them, must be real in-kernel functions
@@ -413,6 +432,23 @@ static void free_used_maps(struct bpf_prog_aux *aux)
kfree(aux->used_maps);
}
+static void __prog_put_rcu(struct rcu_head *rcu)
+{
+ struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
+
+ free_used_maps(aux);
+ bpf_prog_free(aux->prog);
+}
+
+/* version of bpf_prog_put() that is called after a grace period */
+void bpf_prog_put_rcu(struct bpf_prog *prog)
+{
+ if (atomic_dec_and_test(&prog->aux->refcnt)) {
+ prog->aux->prog = prog;
+ call_rcu(&prog->aux->rcu, __prog_put_rcu);
+ }
+}
+
void bpf_prog_put(struct bpf_prog *prog)
{
if (atomic_dec_and_test(&prog->aux->refcnt)) {
@@ -426,7 +462,7 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
{
struct bpf_prog *prog = filp->private_data;
- bpf_prog_put(prog);
+ bpf_prog_put_rcu(prog);
return 0;
}
@@ -532,7 +568,9 @@ static int bpf_prog_load(union bpf_attr *attr)
fixup_bpf_calls(prog);
/* eBPF program is ready to be JITed */
- bpf_prog_select_runtime(prog);
+ err = bpf_prog_select_runtime(prog);
+ if (err < 0)
+ goto free_used_maps;
err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
if (err < 0)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 47dcd3a..039d866 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -907,6 +907,23 @@ static int check_call(struct verifier_env *env, int func_id)
fn->ret_type, func_id);
return -EINVAL;
}
+
+ if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
+ func_id != BPF_FUNC_tail_call)
+ /* prog_array map type needs extra care:
+ * only allow to pass it into bpf_tail_call() for now.
+ * bpf_map_delete_elem() can be allowed in the future,
+ * while bpf_map_update_elem() must only be done via syscall
+ */
+ return -EINVAL;
+
+ if (func_id == BPF_FUNC_tail_call &&
+ map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+ /* don't allow any other map type to be passed into
+ * bpf_tail_call()
+ */
+ return -EINVAL;
+
return 0;
}
@@ -1675,6 +1692,8 @@ static int do_check(struct verifier_env *env)
}
} else if (class == BPF_STX) {
+ enum bpf_reg_type dst_reg_type;
+
if (BPF_MODE(insn->code) == BPF_XADD) {
err = check_xadd(env, insn);
if (err)
@@ -1683,11 +1702,6 @@ static int do_check(struct verifier_env *env)
continue;
}
- if (BPF_MODE(insn->code) != BPF_MEM ||
- insn->imm != 0) {
- verbose("BPF_STX uses reserved fields\n");
- return -EINVAL;
- }
/* check src1 operand */
err = check_reg_arg(regs, insn->src_reg, SRC_OP);
if (err)
@@ -1697,6 +1711,8 @@ static int do_check(struct verifier_env *env)
if (err)
return err;
+ dst_reg_type = regs[insn->dst_reg].type;
+
/* check that memory (dst_reg + off) is writeable */
err = check_mem_access(env, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE,
@@ -1704,6 +1720,15 @@ static int do_check(struct verifier_env *env)
if (err)
return err;
+ if (insn->imm == 0) {
+ insn->imm = dst_reg_type;
+ } else if (dst_reg_type != insn->imm &&
+ (dst_reg_type == PTR_TO_CTX ||
+ insn->imm == PTR_TO_CTX)) {
+ verbose("same insn cannot be used with different pointers\n");
+ return -EINVAL;
+ }
+
} else if (class == BPF_ST) {
if (BPF_MODE(insn->code) != BPF_MEM ||
insn->src_reg != BPF_REG_0) {
@@ -1822,12 +1847,18 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
for (i = 0; i < insn_cnt; i++, insn++) {
if (BPF_CLASS(insn->code) == BPF_LDX &&
- (BPF_MODE(insn->code) != BPF_MEM ||
- insn->imm != 0)) {
+ (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
verbose("BPF_LDX uses reserved fields\n");
return -EINVAL;
}
+ if (BPF_CLASS(insn->code) == BPF_STX &&
+ ((BPF_MODE(insn->code) != BPF_MEM &&
+ BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) {
+ verbose("BPF_STX uses reserved fields\n");
+ return -EINVAL;
+ }
+
if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
struct bpf_map *map;
struct fd f;
@@ -1950,12 +1981,17 @@ static int convert_ctx_accesses(struct verifier_env *env)
struct bpf_prog *new_prog;
u32 cnt;
int i;
+ enum bpf_access_type type;
if (!env->prog->aux->ops->convert_ctx_access)
return 0;
for (i = 0; i < insn_cnt; i++, insn++) {
- if (insn->code != (BPF_LDX | BPF_MEM | BPF_W))
+ if (insn->code == (BPF_LDX | BPF_MEM | BPF_W))
+ type = BPF_READ;
+ else if (insn->code == (BPF_STX | BPF_MEM | BPF_W))
+ type = BPF_WRITE;
+ else
continue;
if (insn->imm != PTR_TO_CTX) {
@@ -1965,7 +2001,7 @@ static int convert_ctx_accesses(struct verifier_env *env)
}
cnt = env->prog->aux->ops->
- convert_ctx_access(insn->dst_reg, insn->src_reg,
+ convert_ctx_access(type, insn->dst_reg, insn->src_reg,
insn->off, insn_buf);
if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
verbose("bpf verifier is misconfigured\n");