summaryrefslogtreecommitdiff
path: root/kernel/events
diff options
context:
space:
mode:
authorScott Wood <scottwood@freescale.com>2013-11-01 21:17:16 (GMT)
committerScott Wood <scottwood@freescale.com>2013-11-03 22:47:10 (GMT)
commit31110de40dca4d4aeff4f253b3def948b88fa590 (patch)
tree0d811783836d52f15e37b4244de54f44ed4f93ad /kernel/events
parentae60d5d27c429b13cf28a09ab8b9d30682433c5a (diff)
parent8bb495e3f02401ee6f76d1b1d77f3ac9f079e376 (diff)
downloadlinux-fsl-qoriq-31110de40dca4d4aeff4f253b3def948b88fa590.tar.xz
Merge tag 'v3.10' into sdk-kernel-3.10
git rebase --continue Linux 3.10 Conflicts: Documentation/virtual/kvm/api.txt arch/ia64/kvm/Makefile arch/powerpc/Kconfig arch/powerpc/Makefile arch/powerpc/boot/dts/b4420qds.dts arch/powerpc/boot/dts/b4860qds.dts arch/powerpc/boot/dts/b4qds.dts arch/powerpc/boot/dts/fsl/b4420si-post.dtsi arch/powerpc/boot/dts/fsl/b4420si-pre.dtsi arch/powerpc/boot/dts/fsl/b4860si-post.dtsi arch/powerpc/boot/dts/fsl/b4860si-pre.dtsi arch/powerpc/boot/dts/fsl/b4si-post.dtsi arch/powerpc/boot/dts/fsl/p1010si-post.dtsi arch/powerpc/boot/dts/fsl/p2041si-post.dtsi arch/powerpc/boot/dts/fsl/p3041si-post.dtsi arch/powerpc/boot/dts/fsl/p4080si-post.dtsi arch/powerpc/boot/dts/fsl/p5020si-post.dtsi arch/powerpc/boot/dts/fsl/p5040si-post.dtsi arch/powerpc/boot/dts/fsl/qonverge-usb2-dr-0.dtsi arch/powerpc/boot/dts/fsl/qoriq-sec5.0-0.dtsi arch/powerpc/boot/dts/fsl/t4240si-post.dtsi arch/powerpc/boot/dts/fsl/t4240si-pre.dtsi arch/powerpc/boot/dts/p1025rdb_36b.dts arch/powerpc/boot/dts/t4240qds.dts arch/powerpc/configs/corenet64_smp_defconfig arch/powerpc/configs/mpc85xx_defconfig arch/powerpc/configs/mpc85xx_smp_defconfig arch/powerpc/include/asm/cputable.h arch/powerpc/include/asm/kvm_host.h arch/powerpc/include/asm/kvm_ppc.h arch/powerpc/include/asm/machdep.h arch/powerpc/include/uapi/asm/kvm.h arch/powerpc/kernel/cpu_setup_fsl_booke.S arch/powerpc/kernel/cputable.c arch/powerpc/kernel/idle.c arch/powerpc/kernel/pci-common.c arch/powerpc/kvm/Kconfig arch/powerpc/kvm/book3s.c arch/powerpc/kvm/booke.c arch/powerpc/kvm/e500.c arch/powerpc/kvm/e500_mmu.c arch/powerpc/kvm/e500_mmu_host.c arch/powerpc/kvm/e500mc.c arch/powerpc/kvm/emulate.c arch/powerpc/kvm/irq.h arch/powerpc/kvm/mpic.c arch/powerpc/kvm/powerpc.c arch/powerpc/mm/tlb_nohash.c arch/powerpc/platforms/85xx/Kconfig arch/powerpc/platforms/85xx/b4_qds.c arch/powerpc/platforms/85xx/t4240_qds.c arch/powerpc/platforms/pseries/smp.c arch/powerpc/sysdev/fsl_85xx_l2ctlr.c arch/powerpc/sysdev/fsl_msi.c arch/powerpc/sysdev/fsl_pci.c arch/powerpc/sysdev/fsl_pci.h arch/powerpc/sysdev/mpic.c arch/x86/kvm/Makefile arch/x86/kvm/x86.c drivers/Kconfig drivers/clk/Kconfig drivers/cpufreq/Makefile drivers/crypto/caam/caamalg.c drivers/crypto/caam/intern.h drivers/crypto/caam/jr.c drivers/crypto/caam/regs.h drivers/infiniband/ulp/ipoib/ipoib_ethtool.c drivers/iommu/Makefile drivers/iommu/amd_iommu.c drivers/iommu/exynos-iommu.c drivers/iommu/intel-iommu.c drivers/iommu/iommu.c drivers/iommu/msm_iommu.c drivers/iommu/omap-iommu.c drivers/iommu/tegra-gart.c drivers/iommu/tegra-smmu.c drivers/misc/Makefile drivers/mmc/card/block.c drivers/mmc/card/queue.c drivers/mmc/core/core.c drivers/mtd/nand/fsl_ifc_nand.c drivers/net/ethernet/3com/3c501.c drivers/net/ethernet/8390/3c503.c drivers/net/ethernet/dec/ewrk3.c drivers/net/ethernet/freescale/fec.c drivers/net/ethernet/freescale/gianfar.c drivers/net/ethernet/freescale/gianfar.h drivers/net/ethernet/i825xx/3c505.c drivers/net/ethernet/i825xx/3c507.c drivers/rtc/rtc-ds3232.c drivers/s390/net/qeth_core_main.c drivers/staging/Kconfig drivers/staging/Makefile drivers/staging/ccg/u_ether.c drivers/usb/gadget/fsl_udc_core.c drivers/usb/otg/fsl_otg.c drivers/vfio/vfio.c drivers/watchdog/Kconfig include/linux/iommu.h include/linux/kvm_host.h include/linux/mmc/sdhci.h include/linux/msi.h include/linux/netdev_features.h include/linux/pci.h include/linux/skbuff.h include/net/ip6_route.h include/net/sch_generic.h include/net/xfrm.h include/uapi/linux/kvm.h net/core/netpoll.c virt/kvm/irqchip.c virt/kvm/kvm_main.c
Diffstat (limited to 'kernel/events')
-rw-r--r--kernel/events/core.c573
-rw-r--r--kernel/events/hw_breakpoint.c8
-rw-r--r--kernel/events/internal.h6
-rw-r--r--kernel/events/ring_buffer.c36
-rw-r--r--kernel/events/uprobes.c746
5 files changed, 859 insertions, 510 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7080c71..575d18ca 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -18,6 +18,7 @@
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
+#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
@@ -37,6 +38,7 @@
#include <linux/ftrace_event.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
+#include <linux/cgroup.h>
#include "internal.h"
@@ -194,9 +196,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);
-static void ring_buffer_attach(struct perf_event *event,
- struct ring_buffer *rb);
-
void __weak perf_event_print_debug(void) { }
extern __weak const char *perf_pmu_name(void)
@@ -234,6 +233,20 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
#ifdef CONFIG_CGROUP_PERF
/*
+ * perf_cgroup_info keeps track of time_enabled for a cgroup.
+ * This is a per-cpu dynamically allocated data structure.
+ */
+struct perf_cgroup_info {
+ u64 time;
+ u64 timestamp;
+};
+
+struct perf_cgroup {
+ struct cgroup_subsys_state css;
+ struct perf_cgroup_info __percpu *info;
+};
+
+/*
* Must ensure cgroup is pinned (css_get) before calling
* this function. In other words, we cannot call this function
* if there is no cgroup event for the current CPU context.
@@ -251,7 +264,22 @@ perf_cgroup_match(struct perf_event *event)
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
- return !event->cgrp || event->cgrp == cpuctx->cgrp;
+ /* @event doesn't care about cgroup */
+ if (!event->cgrp)
+ return true;
+
+ /* wants specific cgroup scope but @cpuctx isn't associated with any */
+ if (!cpuctx->cgrp)
+ return false;
+
+ /*
+ * Cgroup scoping is recursive. An event enabled for a cgroup is
+ * also enabled for all its descendant cgroups. If @cpuctx's
+ * cgroup is a descendant of @event's (the test covers identity
+ * case), it's a match.
+ */
+ return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
+ event->cgrp->css.cgroup);
}
static inline bool perf_tryget_cgroup(struct perf_event *event)
@@ -657,8 +685,12 @@ static void perf_pmu_rotate_start(struct pmu *pmu)
WARN_ON(!irqs_disabled());
- if (list_empty(&cpuctx->rotation_list))
+ if (list_empty(&cpuctx->rotation_list)) {
+ int was_empty = list_empty(head);
list_add(&cpuctx->rotation_list, head);
+ if (was_empty)
+ tick_nohz_full_kick();
+ }
}
static void get_ctx(struct perf_event_context *ctx)
@@ -963,9 +995,15 @@ static void perf_event__header_size(struct perf_event *event)
if (sample_type & PERF_SAMPLE_PERIOD)
size += sizeof(data->period);
+ if (sample_type & PERF_SAMPLE_WEIGHT)
+ size += sizeof(data->weight);
+
if (sample_type & PERF_SAMPLE_READ)
size += event->read_size;
+ if (sample_type & PERF_SAMPLE_DATA_SRC)
+ size += sizeof(data->data_src.val);
+
event->header_size = size;
}
@@ -2557,6 +2595,16 @@ done:
list_del_init(&cpuctx->rotation_list);
}
+#ifdef CONFIG_NO_HZ_FULL
+bool perf_event_can_stop_tick(void)
+{
+ if (list_empty(&__get_cpu_var(rotation_list)))
+ return true;
+ else
+ return false;
+}
+#endif
+
void perf_event_task_tick(void)
{
struct list_head *head = &__get_cpu_var(rotation_list);
@@ -2869,6 +2917,7 @@ static void free_event_rcu(struct rcu_head *head)
}
static void ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
static void free_event(struct perf_event *event)
{
@@ -2893,15 +2942,30 @@ static void free_event(struct perf_event *event)
if (has_branch_stack(event)) {
static_key_slow_dec_deferred(&perf_sched_events);
/* is system-wide event */
- if (!(event->attach_state & PERF_ATTACH_TASK))
+ if (!(event->attach_state & PERF_ATTACH_TASK)) {
atomic_dec(&per_cpu(perf_branch_stack_events,
event->cpu));
+ }
}
}
if (event->rb) {
- ring_buffer_put(event->rb);
- event->rb = NULL;
+ struct ring_buffer *rb;
+
+ /*
+ * Can happen when we close an event with re-directed output.
+ *
+ * Since we have a 0 refcount, perf_mmap_close() will skip
+ * over us; possibly making our ring_buffer_put() the last.
+ */
+ mutex_lock(&event->mmap_mutex);
+ rb = event->rb;
+ if (rb) {
+ rcu_assign_pointer(event->rb, NULL);
+ ring_buffer_detach(event, rb);
+ ring_buffer_put(rb); /* could be last */
+ }
+ mutex_unlock(&event->mmap_mutex);
}
if (is_cgroup_event(event))
@@ -3139,30 +3203,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
unsigned int events = POLL_HUP;
/*
- * Race between perf_event_set_output() and perf_poll(): perf_poll()
- * grabs the rb reference but perf_event_set_output() overrides it.
- * Here is the timeline for two threads T1, T2:
- * t0: T1, rb = rcu_dereference(event->rb)
- * t1: T2, old_rb = event->rb
- * t2: T2, event->rb = new rb
- * t3: T2, ring_buffer_detach(old_rb)
- * t4: T1, ring_buffer_attach(rb1)
- * t5: T1, poll_wait(event->waitq)
- *
- * To avoid this problem, we grab mmap_mutex in perf_poll()
- * thereby ensuring that the assignment of the new ring buffer
- * and the detachment of the old buffer appear atomic to perf_poll()
+ * Pin the event->rb by taking event->mmap_mutex; otherwise
+ * perf_event_set_output() can swizzle our rb and make us miss wakeups.
*/
mutex_lock(&event->mmap_mutex);
-
- rcu_read_lock();
- rb = rcu_dereference(event->rb);
- if (rb) {
- ring_buffer_attach(event, rb);
+ rb = event->rb;
+ if (rb)
events = atomic_xchg(&rb->poll, 0);
- }
- rcu_read_unlock();
-
mutex_unlock(&event->mmap_mutex);
poll_wait(file, &event->waitq, wait);
@@ -3473,16 +3520,12 @@ static void ring_buffer_attach(struct perf_event *event,
return;
spin_lock_irqsave(&rb->event_lock, flags);
- if (!list_empty(&event->rb_entry))
- goto unlock;
-
- list_add(&event->rb_entry, &rb->event_list);
-unlock:
+ if (list_empty(&event->rb_entry))
+ list_add(&event->rb_entry, &rb->event_list);
spin_unlock_irqrestore(&rb->event_lock, flags);
}
-static void ring_buffer_detach(struct perf_event *event,
- struct ring_buffer *rb)
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)
{
unsigned long flags;
@@ -3501,13 +3544,10 @@ static void ring_buffer_wakeup(struct perf_event *event)
rcu_read_lock();
rb = rcu_dereference(event->rb);
- if (!rb)
- goto unlock;
-
- list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
- wake_up_all(&event->waitq);
-
-unlock:
+ if (rb) {
+ list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
+ wake_up_all(&event->waitq);
+ }
rcu_read_unlock();
}
@@ -3536,18 +3576,10 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
static void ring_buffer_put(struct ring_buffer *rb)
{
- struct perf_event *event, *n;
- unsigned long flags;
-
if (!atomic_dec_and_test(&rb->refcount))
return;
- spin_lock_irqsave(&rb->event_lock, flags);
- list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
- list_del_init(&event->rb_entry);
- wake_up_all(&event->waitq);
- }
- spin_unlock_irqrestore(&rb->event_lock, flags);
+ WARN_ON_ONCE(!list_empty(&rb->event_list));
call_rcu(&rb->rcu_head, rb_free_rcu);
}
@@ -3557,26 +3589,100 @@ static void perf_mmap_open(struct vm_area_struct *vma)
struct perf_event *event = vma->vm_file->private_data;
atomic_inc(&event->mmap_count);
+ atomic_inc(&event->rb->mmap_count);
}
+/*
+ * A buffer can be mmap()ed multiple times; either directly through the same
+ * event, or through other events by use of perf_event_set_output().
+ *
+ * In order to undo the VM accounting done by perf_mmap() we need to destroy
+ * the buffer here, where we still have a VM context. This means we need
+ * to detach all events redirecting to us.
+ */
static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
- if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
- unsigned long size = perf_data_size(event->rb);
- struct user_struct *user = event->mmap_user;
- struct ring_buffer *rb = event->rb;
+ struct ring_buffer *rb = event->rb;
+ struct user_struct *mmap_user = rb->mmap_user;
+ int mmap_locked = rb->mmap_locked;
+ unsigned long size = perf_data_size(rb);
+
+ atomic_dec(&rb->mmap_count);
+
+ if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+ return;
+
+ /* Detach current event from the buffer. */
+ rcu_assign_pointer(event->rb, NULL);
+ ring_buffer_detach(event, rb);
+ mutex_unlock(&event->mmap_mutex);
+
+ /* If there's still other mmap()s of this buffer, we're done. */
+ if (atomic_read(&rb->mmap_count)) {
+ ring_buffer_put(rb); /* can't be last */
+ return;
+ }
+
+ /*
+ * No other mmap()s, detach from all other events that might redirect
+ * into the now unreachable buffer. Somewhat complicated by the
+ * fact that rb::event_lock otherwise nests inside mmap_mutex.
+ */
+again:
+ rcu_read_lock();
+ list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
+ if (!atomic_long_inc_not_zero(&event->refcount)) {
+ /*
+ * This event is en-route to free_event() which will
+ * detach it and remove it from the list.
+ */
+ continue;
+ }
+ rcu_read_unlock();
- atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
- vma->vm_mm->pinned_vm -= event->mmap_locked;
- rcu_assign_pointer(event->rb, NULL);
- ring_buffer_detach(event, rb);
+ mutex_lock(&event->mmap_mutex);
+ /*
+ * Check we didn't race with perf_event_set_output() which can
+ * swizzle the rb from under us while we were waiting to
+ * acquire mmap_mutex.
+ *
+ * If we find a different rb; ignore this event, a next
+ * iteration will no longer find it on the list. We have to
+ * still restart the iteration to make sure we're not now
+ * iterating the wrong list.
+ */
+ if (event->rb == rb) {
+ rcu_assign_pointer(event->rb, NULL);
+ ring_buffer_detach(event, rb);
+ ring_buffer_put(rb); /* can't be last, we still have one */
+ }
mutex_unlock(&event->mmap_mutex);
+ put_event(event);
- ring_buffer_put(rb);
- free_uid(user);
+ /*
+ * Restart the iteration; either we're on the wrong list or
+ * destroyed its integrity by doing a deletion.
+ */
+ goto again;
}
+ rcu_read_unlock();
+
+ /*
+ * It could be there's still a few 0-ref events on the list; they'll
+ * get cleaned up by free_event() -- they'll also still have their
+ * ref on the rb and will free it whenever they are done with it.
+ *
+ * Aside from that, this buffer is 'fully' detached and unmapped,
+ * undo the VM accounting.
+ */
+
+ atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+ vma->vm_mm->pinned_vm -= mmap_locked;
+ free_uid(mmap_user);
+
+ ring_buffer_put(rb); /* could be last */
}
static const struct vm_operations_struct perf_mmap_vmops = {
@@ -3626,12 +3732,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
return -EINVAL;
WARN_ON_ONCE(event->ctx->parent_ctx);
+again:
mutex_lock(&event->mmap_mutex);
if (event->rb) {
- if (event->rb->nr_pages == nr_pages)
- atomic_inc(&event->rb->refcount);
- else
+ if (event->rb->nr_pages != nr_pages) {
ret = -EINVAL;
+ goto unlock;
+ }
+
+ if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
+ /*
+ * Raced against perf_mmap_close() through
+ * perf_event_set_output(). Try again, hope for better
+ * luck.
+ */
+ mutex_unlock(&event->mmap_mutex);
+ goto again;
+ }
+
goto unlock;
}
@@ -3672,12 +3790,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
ret = -ENOMEM;
goto unlock;
}
- rcu_assign_pointer(event->rb, rb);
+
+ atomic_set(&rb->mmap_count, 1);
+ rb->mmap_locked = extra;
+ rb->mmap_user = get_current_user();
atomic_long_add(user_extra, &user->locked_vm);
- event->mmap_locked = extra;
- event->mmap_user = get_current_user();
- vma->vm_mm->pinned_vm += event->mmap_locked;
+ vma->vm_mm->pinned_vm += extra;
+
+ ring_buffer_attach(event, rb);
+ rcu_assign_pointer(event->rb, rb);
perf_event_update_userpage(event);
@@ -3686,7 +3808,11 @@ unlock:
atomic_inc(&event->mmap_count);
mutex_unlock(&event->mmap_mutex);
- vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+ /*
+ * Since pinned accounting is per vm we cannot allow fork() to copy our
+ * vma.
+ */
+ vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
vma->vm_ops = &perf_mmap_vmops;
return ret;
@@ -3694,7 +3820,7 @@ unlock:
static int perf_fasync(int fd, struct file *filp, int on)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct perf_event *event = filp->private_data;
int retval;
@@ -4181,6 +4307,12 @@ void perf_output_sample(struct perf_output_handle *handle,
perf_output_sample_ustack(handle,
data->stack_user_size,
data->regs_user.regs);
+
+ if (sample_type & PERF_SAMPLE_WEIGHT)
+ perf_output_put(handle, data->weight);
+
+ if (sample_type & PERF_SAMPLE_DATA_SRC)
+ perf_output_put(handle, data->data_src.val);
}
void perf_prepare_sample(struct perf_event_header *header,
@@ -4340,6 +4472,64 @@ perf_event_read_event(struct perf_event *event,
perf_output_end(&handle);
}
+typedef int (perf_event_aux_match_cb)(struct perf_event *event, void *data);
+typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
+
+static void
+perf_event_aux_ctx(struct perf_event_context *ctx,
+ perf_event_aux_match_cb match,
+ perf_event_aux_output_cb output,
+ void *data)
+{
+ struct perf_event *event;
+
+ list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+ if (event->state < PERF_EVENT_STATE_INACTIVE)
+ continue;
+ if (!event_filter_match(event))
+ continue;
+ if (match(event, data))
+ output(event, data);
+ }
+}
+
+static void
+perf_event_aux(perf_event_aux_match_cb match,
+ perf_event_aux_output_cb output,
+ void *data,
+ struct perf_event_context *task_ctx)
+{
+ struct perf_cpu_context *cpuctx;
+ struct perf_event_context *ctx;
+ struct pmu *pmu;
+ int ctxn;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+ if (cpuctx->unique_pmu != pmu)
+ goto next;
+ perf_event_aux_ctx(&cpuctx->ctx, match, output, data);
+ if (task_ctx)
+ goto next;
+ ctxn = pmu->task_ctx_nr;
+ if (ctxn < 0)
+ goto next;
+ ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+ if (ctx)
+ perf_event_aux_ctx(ctx, match, output, data);
+next:
+ put_cpu_ptr(pmu->pmu_cpu_context);
+ }
+
+ if (task_ctx) {
+ preempt_disable();
+ perf_event_aux_ctx(task_ctx, match, output, data);
+ preempt_enable();
+ }
+ rcu_read_unlock();
+}
+
/*
* task tracking -- fork/exit
*
@@ -4362,8 +4552,9 @@ struct perf_task_event {
};
static void perf_event_task_output(struct perf_event *event,
- struct perf_task_event *task_event)
+ void *data)
{
+ struct perf_task_event *task_event = data;
struct perf_output_handle handle;
struct perf_sample_data sample;
struct task_struct *task = task_event->task;
@@ -4391,59 +4582,11 @@ out:
task_event->event_id.header.size = size;
}
-static int perf_event_task_match(struct perf_event *event)
+static int perf_event_task_match(struct perf_event *event,
+ void *data __maybe_unused)
{
- if (event->state < PERF_EVENT_STATE_INACTIVE)
- return 0;
-
- if (!event_filter_match(event))
- return 0;
-
- if (event->attr.comm || event->attr.mmap ||
- event->attr.mmap_data || event->attr.task)
- return 1;
-
- return 0;
-}
-
-static void perf_event_task_ctx(struct perf_event_context *ctx,
- struct perf_task_event *task_event)
-{
- struct perf_event *event;
-
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (perf_event_task_match(event))
- perf_event_task_output(event, task_event);
- }
-}
-
-static void perf_event_task_event(struct perf_task_event *task_event)
-{
- struct perf_cpu_context *cpuctx;
- struct perf_event_context *ctx;
- struct pmu *pmu;
- int ctxn;
-
- rcu_read_lock();
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
- if (cpuctx->unique_pmu != pmu)
- goto next;
- perf_event_task_ctx(&cpuctx->ctx, task_event);
-
- ctx = task_event->task_ctx;
- if (!ctx) {
- ctxn = pmu->task_ctx_nr;
- if (ctxn < 0)
- goto next;
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- }
- if (ctx)
- perf_event_task_ctx(ctx, task_event);
-next:
- put_cpu_ptr(pmu->pmu_cpu_context);
- }
- rcu_read_unlock();
+ return event->attr.comm || event->attr.mmap ||
+ event->attr.mmap_data || event->attr.task;
}
static void perf_event_task(struct task_struct *task,
@@ -4474,7 +4617,10 @@ static void perf_event_task(struct task_struct *task,
},
};
- perf_event_task_event(&task_event);
+ perf_event_aux(perf_event_task_match,
+ perf_event_task_output,
+ &task_event,
+ task_ctx);
}
void perf_event_fork(struct task_struct *task)
@@ -4500,8 +4646,9 @@ struct perf_comm_event {
};
static void perf_event_comm_output(struct perf_event *event,
- struct perf_comm_event *comm_event)
+ void *data)
{
+ struct perf_comm_event *comm_event = data;
struct perf_output_handle handle;
struct perf_sample_data sample;
int size = comm_event->event_id.header.size;
@@ -4528,39 +4675,16 @@ out:
comm_event->event_id.header.size = size;
}
-static int perf_event_comm_match(struct perf_event *event)
+static int perf_event_comm_match(struct perf_event *event,
+ void *data __maybe_unused)
{
- if (event->state < PERF_EVENT_STATE_INACTIVE)
- return 0;
-
- if (!event_filter_match(event))
- return 0;
-
- if (event->attr.comm)
- return 1;
-
- return 0;
-}
-
-static void perf_event_comm_ctx(struct perf_event_context *ctx,
- struct perf_comm_event *comm_event)
-{
- struct perf_event *event;
-
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (perf_event_comm_match(event))
- perf_event_comm_output(event, comm_event);
- }
+ return event->attr.comm;
}
static void perf_event_comm_event(struct perf_comm_event *comm_event)
{
- struct perf_cpu_context *cpuctx;
- struct perf_event_context *ctx;
char comm[TASK_COMM_LEN];
unsigned int size;
- struct pmu *pmu;
- int ctxn;
memset(comm, 0, sizeof(comm));
strlcpy(comm, comm_event->task->comm, sizeof(comm));
@@ -4570,24 +4694,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
comm_event->comm_size = size;
comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
- rcu_read_lock();
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
- if (cpuctx->unique_pmu != pmu)
- goto next;
- perf_event_comm_ctx(&cpuctx->ctx, comm_event);
-
- ctxn = pmu->task_ctx_nr;
- if (ctxn < 0)
- goto next;
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (ctx)
- perf_event_comm_ctx(ctx, comm_event);
-next:
- put_cpu_ptr(pmu->pmu_cpu_context);
- }
- rcu_read_unlock();
+ perf_event_aux(perf_event_comm_match,
+ perf_event_comm_output,
+ comm_event,
+ NULL);
}
void perf_event_comm(struct task_struct *task)
@@ -4596,6 +4707,7 @@ void perf_event_comm(struct task_struct *task)
struct perf_event_context *ctx;
int ctxn;
+ rcu_read_lock();
for_each_task_context_nr(ctxn) {
ctx = task->perf_event_ctxp[ctxn];
if (!ctx)
@@ -4603,6 +4715,7 @@ void perf_event_comm(struct task_struct *task)
perf_event_enable_on_exec(ctx);
}
+ rcu_read_unlock();
if (!atomic_read(&nr_comm_events))
return;
@@ -4647,8 +4760,9 @@ struct perf_mmap_event {
};
static void perf_event_mmap_output(struct perf_event *event,
- struct perf_mmap_event *mmap_event)
+ void *data)
{
+ struct perf_mmap_event *mmap_event = data;
struct perf_output_handle handle;
struct perf_sample_data sample;
int size = mmap_event->event_id.header.size;
@@ -4675,46 +4789,24 @@ out:
}
static int perf_event_mmap_match(struct perf_event *event,
- struct perf_mmap_event *mmap_event,
- int executable)
-{
- if (event->state < PERF_EVENT_STATE_INACTIVE)
- return 0;
-
- if (!event_filter_match(event))
- return 0;
-
- if ((!executable && event->attr.mmap_data) ||
- (executable && event->attr.mmap))
- return 1;
-
- return 0;
-}
-
-static void perf_event_mmap_ctx(struct perf_event_context *ctx,
- struct perf_mmap_event *mmap_event,
- int executable)
+ void *data)
{
- struct perf_event *event;
+ struct perf_mmap_event *mmap_event = data;
+ struct vm_area_struct *vma = mmap_event->vma;
+ int executable = vma->vm_flags & VM_EXEC;
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (perf_event_mmap_match(event, mmap_event, executable))
- perf_event_mmap_output(event, mmap_event);
- }
+ return (!executable && event->attr.mmap_data) ||
+ (executable && event->attr.mmap);
}
static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
{
- struct perf_cpu_context *cpuctx;
- struct perf_event_context *ctx;
struct vm_area_struct *vma = mmap_event->vma;
struct file *file = vma->vm_file;
unsigned int size;
char tmp[16];
char *buf = NULL;
const char *name;
- struct pmu *pmu;
- int ctxn;
memset(tmp, 0, sizeof(tmp));
@@ -4737,7 +4829,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
} else {
if (arch_vma_name(mmap_event->vma)) {
name = strncpy(tmp, arch_vma_name(mmap_event->vma),
- sizeof(tmp));
+ sizeof(tmp) - 1);
+ tmp[sizeof(tmp) - 1] = '\0';
goto got_name;
}
@@ -4764,29 +4857,15 @@ got_name:
mmap_event->file_name = name;
mmap_event->file_size = size;
- mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
+ if (!(vma->vm_flags & VM_EXEC))
+ mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
- rcu_read_lock();
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
- if (cpuctx->unique_pmu != pmu)
- goto next;
- perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
- vma->vm_flags & VM_EXEC);
-
- ctxn = pmu->task_ctx_nr;
- if (ctxn < 0)
- goto next;
+ mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (ctx) {
- perf_event_mmap_ctx(ctx, mmap_event,
- vma->vm_flags & VM_EXEC);
- }
-next:
- put_cpu_ptr(pmu->pmu_cpu_context);
- }
- rcu_read_unlock();
+ perf_event_aux(perf_event_mmap_match,
+ perf_event_mmap_output,
+ mmap_event,
+ NULL);
kfree(buf);
}
@@ -5130,7 +5209,6 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
{
struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
struct perf_event *event;
- struct hlist_node *node;
struct hlist_head *head;
rcu_read_lock();
@@ -5138,7 +5216,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
if (!head)
goto end;
- hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
if (perf_swevent_match(event, type, event_id, data, regs))
perf_swevent_event(event, nr, data, regs);
}
@@ -5332,7 +5410,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
static int perf_swevent_init(struct perf_event *event)
{
- int event_id = event->attr.config;
+ u64 event_id = event->attr.config;
if (event->attr.type != PERF_TYPE_SOFTWARE)
return -ENOENT;
@@ -5423,7 +5501,6 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
{
struct perf_sample_data data;
struct perf_event *event;
- struct hlist_node *node;
struct perf_raw_record raw = {
.size = entry_size,
@@ -5433,7 +5510,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
perf_sample_data_init(&data, addr, 0);
data.raw = &raw;
- hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
if (perf_tp_event_match(event, &data, regs))
perf_swevent_event(event, count, &data, regs);
}
@@ -5653,6 +5730,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
event->attr.sample_period = NSEC_PER_SEC / freq;
hwc->sample_period = event->attr.sample_period;
local64_set(&hwc->period_left, hwc->sample_period);
+ hwc->last_period = hwc->sample_period;
event->attr.freq = 0;
}
}
@@ -5969,13 +6047,9 @@ int perf_pmu_register(struct pmu *pmu, char *name, int type)
pmu->name = name;
if (type < 0) {
- int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
- if (!err)
- goto free_pdc;
-
- err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
- if (err) {
- ret = err;
+ type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
+ if (type < 0) {
+ ret = type;
goto free_pdc;
}
}
@@ -5992,6 +6066,7 @@ skip_type:
if (pmu->pmu_cpu_context)
goto got_cpu_context;
+ ret = -ENOMEM;
pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
if (!pmu->pmu_cpu_context)
goto free_dev;
@@ -6177,11 +6252,14 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (task) {
event->attach_state = PERF_ATTACH_TASK;
+
+ if (attr->type == PERF_TYPE_TRACEPOINT)
+ event->hw.tp_target = task;
#ifdef CONFIG_HAVE_HW_BREAKPOINT
/*
* hw_breakpoint is a bit difficult here..
*/
- if (attr->type == PERF_TYPE_BREAKPOINT)
+ else if (attr->type == PERF_TYPE_BREAKPOINT)
event->hw.bp_target = task;
#endif
}
@@ -6415,6 +6493,8 @@ set:
if (atomic_read(&event->mmap_count))
goto unlock;
+ old_rb = event->rb;
+
if (output_event) {
/* get the rb we want to redirect to */
rb = ring_buffer_get(output_event);
@@ -6422,16 +6502,28 @@ set:
goto unlock;
}
- old_rb = event->rb;
- rcu_assign_pointer(event->rb, rb);
if (old_rb)
ring_buffer_detach(event, old_rb);
+
+ if (rb)
+ ring_buffer_attach(event, rb);
+
+ rcu_assign_pointer(event->rb, rb);
+
+ if (old_rb) {
+ ring_buffer_put(old_rb);
+ /*
+ * Since we detached before setting the new rb, so that we
+ * could attach the new rb, we could have missed a wakeup.
+ * Provide it now.
+ */
+ wake_up_all(&event->waitq);
+ }
+
ret = 0;
unlock:
mutex_unlock(&event->mmap_mutex);
- if (old_rb)
- ring_buffer_put(old_rb);
out:
return ret;
}
@@ -7518,12 +7610,5 @@ struct cgroup_subsys perf_subsys = {
.css_free = perf_cgroup_css_free,
.exit = perf_cgroup_exit,
.attach = perf_cgroup_attach,
-
- /*
- * perf_event cgroup doesn't handle nesting correctly.
- * ctx->nr_cgroups adjustments should be propagated through the
- * cgroup hierarchy. Fix it and remove the following.
- */
- .broken_hierarchy = true,
};
#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index fe8a916..20185ea 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -120,7 +120,7 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
if (iter->hw.bp_target == tsk &&
find_slot_idx(iter) == type &&
- cpu == iter->cpu)
+ (iter->cpu < 0 || cpu == iter->cpu))
count += hw_breakpoint_weight(iter);
}
@@ -149,7 +149,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
return;
}
- for_each_online_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
unsigned int nr;
nr = per_cpu(nr_cpu_bp_pinned[type], cpu);
@@ -235,7 +235,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
if (cpu >= 0) {
toggle_bp_task_slot(bp, cpu, enable, type, weight);
} else {
- for_each_online_cpu(cpu)
+ for_each_possible_cpu(cpu)
toggle_bp_task_slot(bp, cpu, enable, type, weight);
}
@@ -676,7 +676,7 @@ int __init init_hw_breakpoint(void)
err_alloc:
for_each_possible_cpu(err_cpu) {
for (i = 0; i < TYPE_MAX; i++)
- kfree(per_cpu(nr_task_bp_pinned[i], cpu));
+ kfree(per_cpu(nr_task_bp_pinned[i], err_cpu));
if (err_cpu == cpu)
break;
}
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index d56a64c..ca65997 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -16,7 +16,7 @@ struct ring_buffer {
int page_order; /* allocation order */
#endif
int nr_pages; /* nr of data pages */
- int writable; /* are we writable */
+ int overwrite; /* can overwrite itself */
atomic_t poll; /* POLL_ for wakeups */
@@ -31,6 +31,10 @@ struct ring_buffer {
spinlock_t event_lock;
struct list_head event_list;
+ atomic_t mmap_count;
+ unsigned long mmap_locked;
+ struct user_struct *mmap_user;
+
struct perf_event_mmap_page *user_page;
void *data_pages[0];
};
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 23cb34f..cd55144 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -18,12 +18,24 @@
static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
unsigned long offset, unsigned long head)
{
- unsigned long mask;
+ unsigned long sz = perf_data_size(rb);
+ unsigned long mask = sz - 1;
- if (!rb->writable)
+ /*
+ * check if user-writable
+ * overwrite : over-write its own tail
+ * !overwrite: buffer possibly drops events.
+ */
+ if (rb->overwrite)
return true;
- mask = perf_data_size(rb) - 1;
+ /*
+ * verify that payload is not bigger than buffer
+ * otherwise masking logic may fail to detect
+ * the "not enough space" condition
+ */
+ if ((head - offset) > sz)
+ return false;
offset = (offset - tail) & mask;
head = (head - tail) & mask;
@@ -212,7 +224,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
rb->watermark = max_size / 2;
if (flags & RING_BUFFER_WRITABLE)
- rb->writable = 1;
+ rb->overwrite = 0;
+ else
+ rb->overwrite = 1;
atomic_set(&rb->refcount, 1);
@@ -312,11 +326,16 @@ void rb_free(struct ring_buffer *rb)
}
#else
+static int data_page_nr(struct ring_buffer *rb)
+{
+ return rb->nr_pages << page_order(rb);
+}
struct page *
perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
{
- if (pgoff > (1UL << page_order(rb)))
+ /* The '>' counts in the user page. */
+ if (pgoff > data_page_nr(rb))
return NULL;
return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
@@ -336,10 +355,11 @@ static void rb_free_work(struct work_struct *work)
int i, nr;
rb = container_of(work, struct ring_buffer, work);
- nr = 1 << page_order(rb);
+ nr = data_page_nr(rb);
base = rb->user_page;
- for (i = 0; i < nr + 1; i++)
+ /* The '<=' counts in the user page. */
+ for (i = 0; i <= nr; i++)
perf_mmap_unmark_page(base + (i * PAGE_SIZE));
vfree(base);
@@ -373,7 +393,7 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
rb->user_page = all_buf;
rb->data_pages[0] = all_buf + PAGE_SIZE;
rb->page_order = ilog2(nr_pages);
- rb->nr_pages = 1;
+ rb->nr_pages = !!nr_pages;
ring_buffer_init(rb, watermark, flags);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index dea7acf..f356974 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -27,6 +27,7 @@
#include <linux/pagemap.h> /* read_mapping_page */
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/export.h>
#include <linux/rmap.h> /* anon_vma_prepare */
#include <linux/mmu_notifier.h> /* set_pte_at_notify */
#include <linux/swap.h> /* try_to_free_swap */
@@ -41,58 +42,31 @@
#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
static struct rb_root uprobes_tree = RB_ROOT;
-
-static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
-
-#define UPROBES_HASH_SZ 13
-
/*
- * We need separate register/unregister and mmap/munmap lock hashes because
- * of mmap_sem nesting.
- *
- * uprobe_register() needs to install probes on (potentially) all processes
- * and thus needs to acquire multiple mmap_sems (consequtively, not
- * concurrently), whereas uprobe_mmap() is called while holding mmap_sem
- * for the particular process doing the mmap.
- *
- * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem
- * because of lock order against i_mmap_mutex. This means there's a hole in
- * the register vma iteration where a mmap() can happen.
- *
- * Thus uprobe_register() can race with uprobe_mmap() and we can try and
- * install a probe where one is already installed.
+ * allows us to skip the uprobe_mmap if there are no uprobe events active
+ * at this time. Probably a fine grained per inode count is better?
*/
+#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
-/* serialize (un)register */
-static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
-
-#define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
+static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
+#define UPROBES_HASH_SZ 13
/* serialize uprobe->pending_list */
static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
static struct percpu_rw_semaphore dup_mmap_sem;
-/*
- * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe
- * events active at this time. Probably a fine grained per inode count is
- * better?
- */
-static atomic_t uprobe_events = ATOMIC_INIT(0);
-
/* Have a copy of original instruction */
#define UPROBE_COPY_INSN 0
-/* Dont run handlers when first register/ last unregister in progress*/
-#define UPROBE_RUN_HANDLER 1
/* Can skip singlestep */
-#define UPROBE_SKIP_SSTEP 2
+#define UPROBE_SKIP_SSTEP 1
struct uprobe {
struct rb_node rb_node; /* node in the rb tree */
atomic_t ref;
+ struct rw_semaphore register_rwsem;
struct rw_semaphore consumer_rwsem;
- struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */
struct list_head pending_list;
struct uprobe_consumer *consumers;
struct inode *inode; /* Also hold a ref to inode */
@@ -101,6 +75,15 @@ struct uprobe {
struct arch_uprobe arch;
};
+struct return_instance {
+ struct uprobe *uprobe;
+ unsigned long func;
+ unsigned long orig_ret_vaddr; /* original return address */
+ bool chained; /* true, if instance is nested */
+
+ struct return_instance *next; /* keep as stack */
+};
+
/*
* valid_vma: Verify if the specified vma is an executable vma
* Relax restrictions while unregistering: vm_flags might have
@@ -199,10 +182,31 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn)
return *insn == UPROBE_SWBP_INSN;
}
-static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode)
+/**
+ * is_trap_insn - check if instruction is breakpoint instruction.
+ * @insn: instruction to be checked.
+ * Default implementation of is_trap_insn
+ * Returns true if @insn is a breakpoint instruction.
+ *
+ * This function is needed for the case where an architecture has multiple
+ * trap instructions (like powerpc).
+ */
+bool __weak is_trap_insn(uprobe_opcode_t *insn)
+{
+ return is_swbp_insn(insn);
+}
+
+static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
{
void *kaddr = kmap_atomic(page);
- memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE);
+ memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
+ kunmap_atomic(kaddr);
+}
+
+static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
+{
+ void *kaddr = kmap_atomic(page);
+ memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
kunmap_atomic(kaddr);
}
@@ -211,7 +215,16 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
uprobe_opcode_t old_opcode;
bool is_swbp;
- copy_opcode(page, vaddr, &old_opcode);
+ /*
+ * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here.
+ * We do not check if it is any other 'trap variant' which could
+ * be conditional trap instruction such as the one powerpc supports.
+ *
+ * The logic is that we do not care if the underlying instruction
+ * is a trap variant; uprobes always wins over any other (gdb)
+ * breakpoint.
+ */
+ copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
is_swbp = is_swbp_insn(&old_opcode);
if (is_swbp_insn(new_opcode)) {
@@ -230,7 +243,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
* Expect the breakpoint instruction to be the smallest size instruction for
* the architecture. If an arch has variable length instruction and the
* breakpoint instruction is not of the smallest length instruction
- * supported by that architecture then we need to modify is_swbp_at_addr and
+ * supported by that architecture then we need to modify is_trap_at_addr and
* write_opcode accordingly. This would never be a problem for archs that
* have fixed length instructions.
*/
@@ -251,7 +264,6 @@ static int write_opcode(struct mm_struct *mm, unsigned long vaddr,
uprobe_opcode_t opcode)
{
struct page *old_page, *new_page;
- void *vaddr_old, *vaddr_new;
struct vm_area_struct *vma;
int ret;
@@ -272,15 +284,8 @@ retry:
__SetPageUptodate(new_page);
- /* copy the page now that we've got it stable */
- vaddr_old = kmap_atomic(old_page);
- vaddr_new = kmap_atomic(new_page);
-
- memcpy(vaddr_new, vaddr_old, PAGE_SIZE);
- memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE);
-
- kunmap_atomic(vaddr_new);
- kunmap_atomic(vaddr_old);
+ copy_highpage(new_page, old_page);
+ copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
ret = anon_vma_prepare(vma);
if (ret)
@@ -430,9 +435,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
u = __insert_uprobe(uprobe);
spin_unlock(&uprobes_treelock);
- /* For now assume that the instruction need not be single-stepped */
- __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
-
return u;
}
@@ -452,8 +454,10 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
uprobe->inode = igrab(inode);
uprobe->offset = offset;
+ init_rwsem(&uprobe->register_rwsem);
init_rwsem(&uprobe->consumer_rwsem);
- mutex_init(&uprobe->copy_mutex);
+ /* For now assume that the instruction need not be single-stepped */
+ __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
/* add to uprobes_tree, sorted on inode:offset */
cur_uprobe = insert_uprobe(uprobe);
@@ -463,38 +467,17 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
kfree(uprobe);
uprobe = cur_uprobe;
iput(inode);
- } else {
- atomic_inc(&uprobe_events);
}
return uprobe;
}
-static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
-{
- struct uprobe_consumer *uc;
-
- if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags))
- return;
-
- down_read(&uprobe->consumer_rwsem);
- for (uc = uprobe->consumers; uc; uc = uc->next) {
- if (!uc->filter || uc->filter(uc, current))
- uc->handler(uc, regs);
- }
- up_read(&uprobe->consumer_rwsem);
-}
-
-/* Returns the previous consumer */
-static struct uprobe_consumer *
-consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
+static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
down_write(&uprobe->consumer_rwsem);
uc->next = uprobe->consumers;
uprobe->consumers = uc;
up_write(&uprobe->consumer_rwsem);
-
- return uc->next;
}
/*
@@ -525,30 +508,18 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn,
unsigned long nbytes, loff_t offset)
{
struct page *page;
- void *vaddr;
- unsigned long off;
- pgoff_t idx;
-
- if (!filp)
- return -EINVAL;
if (!mapping->a_ops->readpage)
return -EIO;
-
- idx = offset >> PAGE_CACHE_SHIFT;
- off = offset & ~PAGE_MASK;
-
/*
* Ensure that the page that has the original instruction is
* populated and in page-cache.
*/
- page = read_mapping_page(mapping, idx, filp);
+ page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
if (IS_ERR(page))
return PTR_ERR(page);
- vaddr = kmap_atomic(page);
- memcpy(insn, vaddr + off, nbytes);
- kunmap_atomic(vaddr);
+ copy_from_page(page, offset, insn, nbytes);
page_cache_release(page);
return 0;
@@ -588,7 +559,8 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
return ret;
- mutex_lock(&uprobe->copy_mutex);
+ /* TODO: move this into _register, until then we abuse this sem. */
+ down_write(&uprobe->consumer_rwsem);
if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
goto out;
@@ -597,7 +569,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
goto out;
ret = -ENOTSUPP;
- if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
+ if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn))
goto out;
ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
@@ -612,7 +584,30 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
set_bit(UPROBE_COPY_INSN, &uprobe->flags);
out:
- mutex_unlock(&uprobe->copy_mutex);
+ up_write(&uprobe->consumer_rwsem);
+
+ return ret;
+}
+
+static inline bool consumer_filter(struct uprobe_consumer *uc,
+ enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+{
+ return !uc->filter || uc->filter(uc, ctx, mm);
+}
+
+static bool filter_chain(struct uprobe *uprobe,
+ enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+{
+ struct uprobe_consumer *uc;
+ bool ret = false;
+
+ down_read(&uprobe->consumer_rwsem);
+ for (uc = uprobe->consumers; uc; uc = uc->next) {
+ ret = consumer_filter(uc, ctx, mm);
+ if (ret)
+ break;
+ }
+ up_read(&uprobe->consumer_rwsem);
return ret;
}
@@ -624,16 +619,6 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
bool first_uprobe;
int ret;
- /*
- * If probe is being deleted, unregister thread could be done with
- * the vma-rmap-walk through. Adding a probe now can be fatal since
- * nobody will be able to cleanup. Also we could be from fork or
- * mremap path, where the probe might have already been inserted.
- * Hence behave as if probe already existed.
- */
- if (!uprobe->consumers)
- return 0;
-
ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
if (ret)
return ret;
@@ -658,14 +643,14 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
static int
remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
{
- /* can happen if uprobe_register() fails */
- if (!test_bit(MMF_HAS_UPROBES, &mm->flags))
- return 0;
-
set_bit(MMF_RECALC_UPROBES, &mm->flags);
return set_orig_insn(&uprobe->arch, mm, vaddr);
}
+static inline bool uprobe_is_active(struct uprobe *uprobe)
+{
+ return !RB_EMPTY_NODE(&uprobe->rb_node);
+}
/*
* There could be threads that have already hit the breakpoint. They
* will recheck the current insn and restart if find_uprobe() fails.
@@ -673,12 +658,15 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad
*/
static void delete_uprobe(struct uprobe *uprobe)
{
+ if (WARN_ON(!uprobe_is_active(uprobe)))
+ return;
+
spin_lock(&uprobes_treelock);
rb_erase(&uprobe->rb_node, &uprobes_tree);
spin_unlock(&uprobes_treelock);
+ RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
iput(uprobe->inode);
put_uprobe(uprobe);
- atomic_dec(&uprobe_events);
}
struct map_info {
@@ -764,8 +752,10 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
return curr;
}
-static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
+static int
+register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
{
+ bool is_register = !!new;
struct map_info *info;
int err = 0;
@@ -787,17 +777,23 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
down_write(&mm->mmap_sem);
vma = find_vma(mm, info->vaddr);
if (!vma || !valid_vma(vma, is_register) ||
- vma->vm_file->f_mapping->host != uprobe->inode)
+ file_inode(vma->vm_file) != uprobe->inode)
goto unlock;
if (vma->vm_start > info->vaddr ||
vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
goto unlock;
- if (is_register)
- err = install_breakpoint(uprobe, mm, vma, info->vaddr);
- else
- err |= remove_breakpoint(uprobe, mm, info->vaddr);
+ if (is_register) {
+ /* consult only the "caller", new consumer. */
+ if (consumer_filter(new,
+ UPROBE_FILTER_REGISTER, mm))
+ err = install_breakpoint(uprobe, mm, vma, info->vaddr);
+ } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
+ if (!filter_chain(uprobe,
+ UPROBE_FILTER_UNREGISTER, mm))
+ err |= remove_breakpoint(uprobe, mm, info->vaddr);
+ }
unlock:
up_write(&mm->mmap_sem);
@@ -810,17 +806,23 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
return err;
}
-static int __uprobe_register(struct uprobe *uprobe)
+static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
- return register_for_each_vma(uprobe, true);
+ consumer_add(uprobe, uc);
+ return register_for_each_vma(uprobe, uc);
}
-static void __uprobe_unregister(struct uprobe *uprobe)
+static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
- if (!register_for_each_vma(uprobe, false))
- delete_uprobe(uprobe);
+ int err;
+ if (!consumer_del(uprobe, uc)) /* WARN? */
+ return;
+
+ err = register_for_each_vma(uprobe, NULL);
/* TODO : cant unregister? schedule a worker thread */
+ if (!uprobe->consumers && !err)
+ delete_uprobe(uprobe);
}
/*
@@ -845,31 +847,63 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
struct uprobe *uprobe;
int ret;
- if (!inode || !uc || uc->next)
+ /* Uprobe must have at least one set consumer */
+ if (!uc->handler && !uc->ret_handler)
return -EINVAL;
+ /* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))
return -EINVAL;
- ret = 0;
- mutex_lock(uprobes_hash(inode));
+ retry:
uprobe = alloc_uprobe(inode, offset);
-
- if (!uprobe) {
- ret = -ENOMEM;
- } else if (!consumer_add(uprobe, uc)) {
- ret = __uprobe_register(uprobe);
- if (ret) {
- uprobe->consumers = NULL;
- __uprobe_unregister(uprobe);
- } else {
- set_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
- }
+ if (!uprobe)
+ return -ENOMEM;
+ /*
+ * We can race with uprobe_unregister()->delete_uprobe().
+ * Check uprobe_is_active() and retry if it is false.
+ */
+ down_write(&uprobe->register_rwsem);
+ ret = -EAGAIN;
+ if (likely(uprobe_is_active(uprobe))) {
+ ret = __uprobe_register(uprobe, uc);
+ if (ret)
+ __uprobe_unregister(uprobe, uc);
}
+ up_write(&uprobe->register_rwsem);
+ put_uprobe(uprobe);
- mutex_unlock(uprobes_hash(inode));
- if (uprobe)
- put_uprobe(uprobe);
+ if (unlikely(ret == -EAGAIN))
+ goto retry;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(uprobe_register);
+
+/*
+ * uprobe_apply - unregister a already registered probe.
+ * @inode: the file in which the probe has to be removed.
+ * @offset: offset from the start of the file.
+ * @uc: consumer which wants to add more or remove some breakpoints
+ * @add: add or remove the breakpoints
+ */
+int uprobe_apply(struct inode *inode, loff_t offset,
+ struct uprobe_consumer *uc, bool add)
+{
+ struct uprobe *uprobe;
+ struct uprobe_consumer *con;
+ int ret = -ENOENT;
+
+ uprobe = find_uprobe(inode, offset);
+ if (!uprobe)
+ return ret;
+
+ down_write(&uprobe->register_rwsem);
+ for (con = uprobe->consumers; con && con != uc ; con = con->next)
+ ;
+ if (con)
+ ret = register_for_each_vma(uprobe, add ? uc : NULL);
+ up_write(&uprobe->register_rwsem);
+ put_uprobe(uprobe);
return ret;
}
@@ -884,25 +918,42 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
{
struct uprobe *uprobe;
- if (!inode || !uc)
- return;
-
uprobe = find_uprobe(inode, offset);
if (!uprobe)
return;
- mutex_lock(uprobes_hash(inode));
+ down_write(&uprobe->register_rwsem);
+ __uprobe_unregister(uprobe, uc);
+ up_write(&uprobe->register_rwsem);
+ put_uprobe(uprobe);
+}
+EXPORT_SYMBOL_GPL(uprobe_unregister);
- if (consumer_del(uprobe, uc)) {
- if (!uprobe->consumers) {
- __uprobe_unregister(uprobe);
- clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
- }
+static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ int err = 0;
+
+ down_read(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ unsigned long vaddr;
+ loff_t offset;
+
+ if (!valid_vma(vma, false) ||
+ file_inode(vma->vm_file) != uprobe->inode)
+ continue;
+
+ offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
+ if (uprobe->offset < offset ||
+ uprobe->offset >= offset + vma->vm_end - vma->vm_start)
+ continue;
+
+ vaddr = offset_to_vaddr(vma, uprobe->offset);
+ err |= remove_breakpoint(uprobe, mm, vaddr);
}
+ up_read(&mm->mmap_sem);
- mutex_unlock(uprobes_hash(inode));
- if (uprobe)
- put_uprobe(uprobe);
+ return err;
}
static struct rb_node *
@@ -979,18 +1030,23 @@ int uprobe_mmap(struct vm_area_struct *vma)
struct uprobe *uprobe, *u;
struct inode *inode;
- if (!atomic_read(&uprobe_events) || !valid_vma(vma, true))
+ if (no_uprobe_events() || !valid_vma(vma, true))
return 0;
- inode = vma->vm_file->f_mapping->host;
+ inode = file_inode(vma->vm_file);
if (!inode)
return 0;
mutex_lock(uprobes_mmap_hash(inode));
build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
-
+ /*
+ * We can race with uprobe_unregister(), this uprobe can be already
+ * removed. But in this case filter_chain() must return false, all
+ * consumers have gone away.
+ */
list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
- if (!fatal_signal_pending(current)) {
+ if (!fatal_signal_pending(current) &&
+ filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
}
@@ -1008,7 +1064,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
struct inode *inode;
struct rb_node *n;
- inode = vma->vm_file->f_mapping->host;
+ inode = file_inode(vma->vm_file);
min = vaddr_to_offset(vma, start);
max = min + (end - start) - 1;
@@ -1025,7 +1081,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
*/
void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
{
- if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
+ if (no_uprobe_events() || !valid_vma(vma, false))
return;
if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
@@ -1042,22 +1098,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
/* Slot allocation for XOL */
static int xol_add_vma(struct xol_area *area)
{
- struct mm_struct *mm;
- int ret;
-
- area->page = alloc_page(GFP_HIGHUSER);
- if (!area->page)
- return -ENOMEM;
-
- ret = -EALREADY;
- mm = current->mm;
+ struct mm_struct *mm = current->mm;
+ int ret = -EALREADY;
down_write(&mm->mmap_sem);
if (mm->uprobes_state.xol_area)
goto fail;
ret = -ENOMEM;
-
/* Try to map as high as possible, this is only a hint. */
area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
if (area->vaddr & ~PAGE_MASK) {
@@ -1073,54 +1121,59 @@ static int xol_add_vma(struct xol_area *area)
smp_wmb(); /* pairs with get_xol_area() */
mm->uprobes_state.xol_area = area;
ret = 0;
-
-fail:
+ fail:
up_write(&mm->mmap_sem);
- if (ret)
- __free_page(area->page);
return ret;
}
-static struct xol_area *get_xol_area(struct mm_struct *mm)
-{
- struct xol_area *area;
-
- area = mm->uprobes_state.xol_area;
- smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
-
- return area;
-}
-
/*
- * xol_alloc_area - Allocate process's xol_area.
- * This area will be used for storing instructions for execution out of
- * line.
+ * get_xol_area - Allocate process's xol_area if necessary.
+ * This area will be used for storing instructions for execution out of line.
*
* Returns the allocated area or NULL.
*/
-static struct xol_area *xol_alloc_area(void)
+static struct xol_area *get_xol_area(void)
{
+ struct mm_struct *mm = current->mm;
struct xol_area *area;
+ uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+
+ area = mm->uprobes_state.xol_area;
+ if (area)
+ goto ret;
area = kzalloc(sizeof(*area), GFP_KERNEL);
if (unlikely(!area))
- return NULL;
+ goto out;
area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL);
-
if (!area->bitmap)
- goto fail;
+ goto free_area;
+
+ area->page = alloc_page(GFP_HIGHUSER);
+ if (!area->page)
+ goto free_bitmap;
+ /* allocate first slot of task's xol_area for the return probes */
+ set_bit(0, area->bitmap);
+ copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
+ atomic_set(&area->slot_count, 1);
init_waitqueue_head(&area->wq);
+
if (!xol_add_vma(area))
return area;
-fail:
+ __free_page(area->page);
+ free_bitmap:
kfree(area->bitmap);
+ free_area:
kfree(area);
-
- return get_xol_area(current->mm);
+ out:
+ area = mm->uprobes_state.xol_area;
+ ret:
+ smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
+ return area;
}
/*
@@ -1186,43 +1239,31 @@ static unsigned long xol_take_insn_slot(struct xol_area *area)
}
/*
- * xol_get_insn_slot - If was not allocated a slot, then
- * allocate a slot.
+ * xol_get_insn_slot - allocate a slot for xol.
* Returns the allocated slot address or 0.
*/
-static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr)
+static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
{
struct xol_area *area;
- unsigned long offset;
- void *vaddr;
+ unsigned long xol_vaddr;
- area = get_xol_area(current->mm);
- if (!area) {
- area = xol_alloc_area();
- if (!area)
- return 0;
- }
- current->utask->xol_vaddr = xol_take_insn_slot(area);
+ area = get_xol_area();
+ if (!area)
+ return 0;
- /*
- * Initialize the slot if xol_vaddr points to valid
- * instruction slot.
- */
- if (unlikely(!current->utask->xol_vaddr))
+ xol_vaddr = xol_take_insn_slot(area);
+ if (unlikely(!xol_vaddr))
return 0;
- current->utask->vaddr = slot_addr;
- offset = current->utask->xol_vaddr & ~PAGE_MASK;
- vaddr = kmap_atomic(area->page);
- memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
- kunmap_atomic(vaddr);
+ /* Initialize the slot */
+ copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES);
/*
* We probably need flush_icache_user_range() but it needs vma.
* This should work on supported architectures too.
*/
flush_dcache_page(area->page);
- return current->utask->xol_vaddr;
+ return xol_vaddr;
}
/*
@@ -1240,8 +1281,7 @@ static void xol_free_insn_slot(struct task_struct *tsk)
return;
slot_addr = tsk->utask->xol_vaddr;
-
- if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr)))
+ if (unlikely(!slot_addr))
return;
area = tsk->mm->uprobes_state.xol_area;
@@ -1282,6 +1322,7 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
void uprobe_free_utask(struct task_struct *t)
{
struct uprobe_task *utask = t->utask;
+ struct return_instance *ri, *tmp;
if (!utask)
return;
@@ -1289,6 +1330,15 @@ void uprobe_free_utask(struct task_struct *t)
if (utask->active_uprobe)
put_uprobe(utask->active_uprobe);
+ ri = utask->return_instances;
+ while (ri) {
+ tmp = ri;
+ ri = ri->next;
+
+ put_uprobe(tmp->uprobe);
+ kfree(tmp);
+ }
+
xol_free_insn_slot(t);
kfree(utask);
t->utask = NULL;
@@ -1303,33 +1353,135 @@ void uprobe_copy_process(struct task_struct *t)
}
/*
- * Allocate a uprobe_task object for the task.
- * Called when the thread hits a breakpoint for the first time.
+ * Allocate a uprobe_task object for the task if if necessary.
+ * Called when the thread hits a breakpoint.
*
* Returns:
* - pointer to new uprobe_task on success
* - NULL otherwise
*/
-static struct uprobe_task *add_utask(void)
+static struct uprobe_task *get_utask(void)
{
+ if (!current->utask)
+ current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+ return current->utask;
+}
+
+/*
+ * Current area->vaddr notion assume the trampoline address is always
+ * equal area->vaddr.
+ *
+ * Returns -1 in case the xol_area is not allocated.
+ */
+static unsigned long get_trampoline_vaddr(void)
+{
+ struct xol_area *area;
+ unsigned long trampoline_vaddr = -1;
+
+ area = current->mm->uprobes_state.xol_area;
+ smp_read_barrier_depends();
+ if (area)
+ trampoline_vaddr = area->vaddr;
+
+ return trampoline_vaddr;
+}
+
+static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
+{
+ struct return_instance *ri;
struct uprobe_task *utask;
+ unsigned long orig_ret_vaddr, trampoline_vaddr;
+ bool chained = false;
- utask = kzalloc(sizeof *utask, GFP_KERNEL);
- if (unlikely(!utask))
- return NULL;
+ if (!get_xol_area())
+ return;
+
+ utask = get_utask();
+ if (!utask)
+ return;
- current->utask = utask;
- return utask;
+ if (utask->depth >= MAX_URETPROBE_DEPTH) {
+ printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
+ " nestedness limit pid/tgid=%d/%d\n",
+ current->pid, current->tgid);
+ return;
+ }
+
+ ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL);
+ if (!ri)
+ goto fail;
+
+ trampoline_vaddr = get_trampoline_vaddr();
+ orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
+ if (orig_ret_vaddr == -1)
+ goto fail;
+
+ /*
+ * We don't want to keep trampoline address in stack, rather keep the
+ * original return address of first caller thru all the consequent
+ * instances. This also makes breakpoint unwrapping easier.
+ */
+ if (orig_ret_vaddr == trampoline_vaddr) {
+ if (!utask->return_instances) {
+ /*
+ * This situation is not possible. Likely we have an
+ * attack from user-space.
+ */
+ pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n",
+ current->pid, current->tgid);
+ goto fail;
+ }
+
+ chained = true;
+ orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
+ }
+
+ atomic_inc(&uprobe->ref);
+ ri->uprobe = uprobe;
+ ri->func = instruction_pointer(regs);
+ ri->orig_ret_vaddr = orig_ret_vaddr;
+ ri->chained = chained;
+
+ utask->depth++;
+
+ /* add instance to the stack */
+ ri->next = utask->return_instances;
+ utask->return_instances = ri;
+
+ return;
+
+ fail:
+ kfree(ri);
}
/* Prepare to single-step probed instruction out of line. */
static int
-pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr)
+pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
{
- if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs))
- return 0;
+ struct uprobe_task *utask;
+ unsigned long xol_vaddr;
+ int err;
- return -EFAULT;
+ utask = get_utask();
+ if (!utask)
+ return -ENOMEM;
+
+ xol_vaddr = xol_get_insn_slot(uprobe);
+ if (!xol_vaddr)
+ return -ENOMEM;
+
+ utask->xol_vaddr = xol_vaddr;
+ utask->vaddr = bp_vaddr;
+
+ err = arch_uprobe_pre_xol(&uprobe->arch, regs);
+ if (unlikely(err)) {
+ xol_free_insn_slot(current);
+ return err;
+ }
+
+ utask->active_uprobe = uprobe;
+ utask->state = UTASK_SSTEP;
+ return 0;
}
/*
@@ -1391,6 +1543,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
* This is not strictly accurate, we can race with
* uprobe_unregister() and see the already removed
* uprobe if delete_uprobe() was not yet called.
+ * Or this uprobe can be filtered out.
*/
if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
return;
@@ -1399,7 +1552,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
clear_bit(MMF_HAS_UPROBES, &mm->flags);
}
-static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
+static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
{
struct page *page;
uprobe_opcode_t opcode;
@@ -1417,10 +1570,11 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
if (result < 0)
return result;
- copy_opcode(page, vaddr, &opcode);
+ copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
put_page(page);
out:
- return is_swbp_insn(&opcode);
+ /* This needs to return true for any variant of the trap insn */
+ return is_trap_insn(&opcode);
}
static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
@@ -1433,14 +1587,14 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
vma = find_vma(mm, bp_vaddr);
if (vma && vma->vm_start <= bp_vaddr) {
if (valid_vma(vma, false)) {
- struct inode *inode = vma->vm_file->f_mapping->host;
+ struct inode *inode = file_inode(vma->vm_file);
loff_t offset = vaddr_to_offset(vma, bp_vaddr);
uprobe = find_uprobe(inode, offset);
}
if (!uprobe)
- *is_swbp = is_swbp_at_addr(mm, bp_vaddr);
+ *is_swbp = is_trap_at_addr(mm, bp_vaddr);
} else {
*is_swbp = -EFAULT;
}
@@ -1452,20 +1606,116 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
return uprobe;
}
+static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
+{
+ struct uprobe_consumer *uc;
+ int remove = UPROBE_HANDLER_REMOVE;
+ bool need_prep = false; /* prepare return uprobe, when needed */
+
+ down_read(&uprobe->register_rwsem);
+ for (uc = uprobe->consumers; uc; uc = uc->next) {
+ int rc = 0;
+
+ if (uc->handler) {
+ rc = uc->handler(uc, regs);
+ WARN(rc & ~UPROBE_HANDLER_MASK,
+ "bad rc=0x%x from %pf()\n", rc, uc->handler);
+ }
+
+ if (uc->ret_handler)
+ need_prep = true;
+
+ remove &= rc;
+ }
+
+ if (need_prep && !remove)
+ prepare_uretprobe(uprobe, regs); /* put bp at return */
+
+ if (remove && uprobe->consumers) {
+ WARN_ON(!uprobe_is_active(uprobe));
+ unapply_uprobe(uprobe, current->mm);
+ }
+ up_read(&uprobe->register_rwsem);
+}
+
+static void
+handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
+{
+ struct uprobe *uprobe = ri->uprobe;
+ struct uprobe_consumer *uc;
+
+ down_read(&uprobe->register_rwsem);
+ for (uc = uprobe->consumers; uc; uc = uc->next) {
+ if (uc->ret_handler)
+ uc->ret_handler(uc, ri->func, regs);
+ }
+ up_read(&uprobe->register_rwsem);
+}
+
+static bool handle_trampoline(struct pt_regs *regs)
+{
+ struct uprobe_task *utask;
+ struct return_instance *ri, *tmp;
+ bool chained;
+
+ utask = current->utask;
+ if (!utask)
+ return false;
+
+ ri = utask->return_instances;
+ if (!ri)
+ return false;
+
+ /*
+ * TODO: we should throw out return_instance's invalidated by
+ * longjmp(), currently we assume that the probed function always
+ * returns.
+ */
+ instruction_pointer_set(regs, ri->orig_ret_vaddr);
+
+ for (;;) {
+ handle_uretprobe_chain(ri, regs);
+
+ chained = ri->chained;
+ put_uprobe(ri->uprobe);
+
+ tmp = ri;
+ ri = ri->next;
+ kfree(tmp);
+
+ if (!chained)
+ break;
+
+ utask->depth--;
+
+ BUG_ON(!ri);
+ }
+
+ utask->return_instances = ri;
+
+ return true;
+}
+
/*
* Run handler and ask thread to singlestep.
* Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
*/
static void handle_swbp(struct pt_regs *regs)
{
- struct uprobe_task *utask;
struct uprobe *uprobe;
unsigned long bp_vaddr;
int uninitialized_var(is_swbp);
bp_vaddr = uprobe_get_swbp_addr(regs);
- uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
+ if (bp_vaddr == get_trampoline_vaddr()) {
+ if (handle_trampoline(regs))
+ return;
+
+ pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n",
+ current->pid, current->tgid);
+ }
+ uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
if (!uprobe) {
if (is_swbp > 0) {
/* No matching uprobe; signal SIGTRAP. */
@@ -1483,6 +1733,10 @@ static void handle_swbp(struct pt_regs *regs)
}
return;
}
+
+ /* change it in advance for ->handler() and restart */
+ instruction_pointer_set(regs, bp_vaddr);
+
/*
* TODO: move copy_insn/etc into _register and remove this hack.
* After we hit the bp, _unregister + _register can install the
@@ -1490,32 +1744,16 @@ static void handle_swbp(struct pt_regs *regs)
*/
smp_rmb(); /* pairs with wmb() in install_breakpoint() */
if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
- goto restart;
-
- utask = current->utask;
- if (!utask) {
- utask = add_utask();
- /* Cannot allocate; re-execute the instruction. */
- if (!utask)
- goto restart;
- }
+ goto out;
handler_chain(uprobe, regs);
if (can_skip_sstep(uprobe, regs))
goto out;
- if (!pre_ssout(uprobe, regs, bp_vaddr)) {
- utask->active_uprobe = uprobe;
- utask->state = UTASK_SSTEP;
+ if (!pre_ssout(uprobe, regs, bp_vaddr))
return;
- }
-restart:
- /*
- * cannot singlestep; cannot skip instruction;
- * re-execute the instruction.
- */
- instruction_pointer_set(regs, bp_vaddr);
+ /* can_skip_sstep() succeeded, or restart if can't singlestep */
out:
put_uprobe(uprobe);
}
@@ -1576,7 +1814,11 @@ void uprobe_notify_resume(struct pt_regs *regs)
*/
int uprobe_pre_sstep_notifier(struct pt_regs *regs)
{
- if (!current->mm || !test_bit(MMF_HAS_UPROBES, &current->mm->flags))
+ if (!current->mm)
+ return 0;
+
+ if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
+ (!current->utask || !current->utask->return_instances))
return 0;
set_thread_flag(TIF_UPROBE);
@@ -1609,10 +1851,8 @@ static int __init init_uprobes(void)
{
int i;
- for (i = 0; i < UPROBES_HASH_SZ; i++) {
- mutex_init(&uprobes_mutex[i]);
+ for (i = 0; i < UPROBES_HASH_SZ; i++)
mutex_init(&uprobes_mmap_mutex[i]);
- }
if (percpu_init_rwsem(&dup_mmap_sem))
return -ENOMEM;