From bbc33d05930f870ea049eae5ed980f8b827d0813 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 21 Nov 2012 16:55:38 +0100 Subject: uprobes: Move __set_bit(UPROBE_SKIP_SSTEP) into alloc_uprobe() Cosmetic. __set_bit(UPROBE_SKIP_SSTEP) is the part of initialization, it is not clear why it is set in insert_uprobe(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 30ea9a4..afbab2c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -430,9 +430,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) u = __insert_uprobe(uprobe); spin_unlock(&uprobes_treelock); - /* For now assume that the instruction need not be single-stepped */ - __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); - return u; } @@ -454,6 +451,8 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->offset = offset; init_rwsem(&uprobe->consumer_rwsem); mutex_init(&uprobe->copy_mutex); + /* For now assume that the instruction need not be single-stepped */ + __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); -- cgit v0.10.2 From f0744af7d0fde190674064c54e2ff60b34ac71fe Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 21 Nov 2012 18:01:43 +0100 Subject: uprobes: Kill the pointless inode/uc checks in register/unregister register/unregister verifies that inode/uc != NULL. For what? This really looks like "hide the potential problem", the caller should pass the valid data. register() also checks uc->next == NULL, probably to prevent the double-register but the caller can do other stupid/wrong things. If we do this check, then we should document that uc->next should be cleared before register() and add BUG_ON(). Also add the small comment about the i_size_read() check. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index afbab2c..a39d816 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -844,9 +844,7 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * struct uprobe *uprobe; int ret; - if (!inode || !uc || uc->next) - return -EINVAL; - + /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) return -EINVAL; @@ -883,9 +881,6 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume { struct uprobe *uprobe; - if (!inode || !uc) - return; - uprobe = find_uprobe(inode, offset); if (!uprobe) return; -- cgit v0.10.2 From fe20d71f25400cccc8bffef865f79250be7dbc81 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 21 Nov 2012 17:32:30 +0100 Subject: uprobes: Kill uprobe_consumer->filter() uprobe_consumer->filter() is pointless in its current form, kill it. We will add it back, but with the different signature/semantics. Perhaps we will even re-introduce the callsite in handler_chain(), but not to just skip uc->handler(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 4f628a6..83742b9 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -37,11 +37,6 @@ struct inode; struct uprobe_consumer { int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs); - /* - * filter is optional; If a filter exists, handler is run - * if and only if filter returns true. 
- */ - bool (*filter)(struct uprobe_consumer *self, struct task_struct *task); struct uprobe_consumer *next; }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a39d816..5cbebac 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -477,10 +477,8 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) return; down_read(&uprobe->consumer_rwsem); - for (uc = uprobe->consumers; uc; uc = uc->next) { - if (!uc->filter || uc->filter(uc, current)) - uc->handler(uc, regs); - } + for (uc = uprobe->consumers; uc; uc = uc->next) + uc->handler(uc, regs); up_read(&uprobe->consumer_rwsem); } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 87b6db4..e668024 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -550,7 +550,6 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) return -EINTR; utc->cons.handler = uprobe_dispatcher; - utc->cons.filter = NULL; ret = uprobe_register(tu->inode, tu->offset, &utc->cons); if (ret) { kfree(utc); -- cgit v0.10.2 From 63633cbf82840d972248f11d2122b261d0d4779a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 22 Nov 2012 18:30:15 +0100 Subject: uprobes: Introduce filter_chain() Add the new helper filter_chain(). Currently it is only placeholder, the comment explains what is should do. We will change it later to consult every consumer to decide whether we need to install the swbp. Until then it works as if any consumer returns true, this matches the current behavior. Change install_breakpoint() to call filter_chain() instead of checking uprobe->consumers != NULL. We obviously need this, and this equally closes the race with _unregister(). Change remove_breakpoint() to call this helper too. Currently this is pointless because remove_breakpoint() is only called when the last consumer goes away, but we will change this. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 5cbebac..c38bf37 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -614,6 +614,18 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, return ret; } +static bool filter_chain(struct uprobe *uprobe) +{ + /* + * TODO: + * for_each_consumer(uc) + * if (uc->filter(...)) + * return true; + * return false; + */ + return uprobe->consumers != NULL; +} + static int install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long vaddr) @@ -624,11 +636,10 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, /* * If probe is being deleted, unregister thread could be done with * the vma-rmap-walk through. Adding a probe now can be fatal since - * nobody will be able to cleanup. Also we could be from fork or - * mremap path, where the probe might have already been inserted. - * Hence behave as if probe already existed. + * nobody will be able to cleanup. But in this case filter_chain() + * must return false, all consumers have gone away. 
*/ - if (!uprobe->consumers) + if (!filter_chain(uprobe)) return 0; ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); @@ -655,10 +666,12 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, static int remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) { - /* can happen if uprobe_register() fails */ if (!test_bit(MMF_HAS_UPROBES, &mm->flags)) return 0; + if (filter_chain(uprobe)) + return 0; + set_bit(MMF_RECALC_UPROBES, &mm->flags); return set_orig_insn(&uprobe->arch, mm, vaddr); } @@ -1382,6 +1395,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm) * This is not strictly accurate, we can race with * uprobe_unregister() and see the already removed * uprobe if delete_uprobe() was not yet called. + * Or this uprobe can be filtered out. */ if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end)) return; -- cgit v0.10.2 From 04aab9b2006bbdeff78dc162f206fdfebeca97d9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Nov 2012 19:43:50 +0100 Subject: uprobes: _unregister() should always do register_for_each_vma(false) uprobe_unregister() removes the breakpoints only if the last consumer goes away. To support the filtering it should do this every time, we want to remove the breakpoints which nobody else want to keep. Note: given that filter_chain() is not actually implemented, this patch itself doesn't change the behaviour yet, register_for_each_vma(false) is a heavy "nop" unless there are no more consumers. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c38bf37..9401990 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -825,12 +825,20 @@ static int __uprobe_register(struct uprobe *uprobe) return register_for_each_vma(uprobe, true); } -static void __uprobe_unregister(struct uprobe *uprobe) +static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) { - if (!register_for_each_vma(uprobe, false)) - delete_uprobe(uprobe); + int err; + + if (!consumer_del(uprobe, uc)) /* WARN? */ + return; - /* TODO : cant unregister? schedule a worker thread */ + err = register_for_each_vma(uprobe, false); + if (!uprobe->consumers) { + clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags); + /* TODO : cant unregister? schedule a worker thread */ + if (!err) + delete_uprobe(uprobe); + } } /* @@ -868,8 +876,7 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * } else if (!consumer_add(uprobe, uc)) { ret = __uprobe_register(uprobe); if (ret) { - uprobe->consumers = NULL; - __uprobe_unregister(uprobe); + __uprobe_unregister(uprobe, uc); } else { set_bit(UPROBE_RUN_HANDLER, &uprobe->flags); } @@ -897,14 +904,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume return; mutex_lock(uprobes_hash(inode)); - - if (consumer_del(uprobe, uc)) { - if (!uprobe->consumers) { - __uprobe_unregister(uprobe); - clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags); - } - } - + __uprobe_unregister(uprobe, uc); mutex_unlock(uprobes_hash(inode)); put_uprobe(uprobe); } -- cgit v0.10.2 From 9a98e03cc145c994da824dac7602334f50feb670 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Nov 2012 20:15:17 +0100 Subject: uprobes: _register() should always do register_for_each_vma(true) To support the filtering uprobe_register() should do register_for_each_vma(true) every time the new consumer comes, we need to install the previously nacked breakpoints. 
Note: - uprobes_mutex[] should die, what it actually protects is alloc_uprobe(). - UPROBE_RUN_HANDLER should die too, obviously it can't work unless uprobe has a single consumer. The consumer should serialize with _register/_unregister itself. Or this flag should live in uprobe_consumer->state. - Perhaps we can do some optimizations later. For example, if filter_chain() never returns false uprobe can record this fact and avoid the unnecessary register_for_each_vma(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 9401990..d1d1394 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -482,16 +482,12 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) up_read(&uprobe->consumer_rwsem); } -/* Returns the previous consumer */ -static struct uprobe_consumer * -consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) +static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) { down_write(&uprobe->consumer_rwsem); uc->next = uprobe->consumers; uprobe->consumers = uc; up_write(&uprobe->consumer_rwsem); - - return uc->next; } /* @@ -820,9 +816,15 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) return err; } -static int __uprobe_register(struct uprobe *uprobe) +static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc) { - return register_for_each_vma(uprobe, true); + int err; + + consumer_add(uprobe, uc); + err = register_for_each_vma(uprobe, true); + if (!err) /* TODO: pointless unless the first consumer */ + set_bit(UPROBE_RUN_HANDLER, &uprobe->flags); + return err; } static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) @@ -867,21 +869,14 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * if (offset > i_size_read(inode)) return -EINVAL; - ret = 0; + ret = -ENOMEM; mutex_lock(uprobes_hash(inode)); uprobe = alloc_uprobe(inode, offset); - - if (!uprobe) { - ret = -ENOMEM; - } else if (!consumer_add(uprobe, uc)) { - ret = __uprobe_register(uprobe); - if (ret) { + if (uprobe) { + ret = __uprobe_register(uprobe, uc); + if (ret) __uprobe_unregister(uprobe, uc); - } else { - set_bit(UPROBE_RUN_HANDLER, &uprobe->flags); - } } - mutex_unlock(uprobes_hash(inode)); if (uprobe) put_uprobe(uprobe); -- cgit v0.10.2 From e591c8d78e49e6206935cf31c4d2b603bbb29166 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 24 Nov 2012 17:29:40 +0100 Subject: uprobes: Introduce uprobe->register_rwsem Introduce uprobe->register_rwsem. It is taken for writing around __uprobe_register/unregister. Change handler_chain() to use this sem rather than consumer_rwsem. The main reason for this change is that we have the nasty problem with mmap_sem/consumer_rwsem dependency. filter_chain() needs to protect uprobe->consumers like handler_chain(), but they can not use the same lock. filter_chain() can be called under ->mmap_sem (currently this is always true), but we want to allow ->handler() to play with the probed task's memory, and this needs ->mmap_sem. Alternatively we could use srcu, but synchronize_srcu() is very slow and ->register_rwsem allows us to do more. In particular, we can teach handler_chain() to do remove_breakpoint() if this bp is "nacked" by all consumers, we know that we can't race with the new consumer which does uprobe_register(). See also the next patches. uprobes_mutex[] is almost ready to die. 
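For illustration, this is the kind of consumer the new locking is meant to allow: a handler that reads the probed task's memory. A minimal sketch of a hypothetical out-of-tree consumer (my_uaddr and my_peek_handler are made-up names, not part of this patch); the point is that the user-memory access can fault and take current->mm->mmap_sem, which is why handler_chain() cannot keep using consumer_rwsem (consumer_rwsem already nests inside mmap_sem via filter_chain()):

static unsigned long my_uaddr;	/* user address the tool tracks, set elsewhere */

static int my_peek_handler(struct uprobe_consumer *self, struct pt_regs *regs)
{
	u32 val;

	/* can fault: the fault path takes current->mm->mmap_sem */
	if (!copy_from_user(&val, (void __user *)my_uaddr, sizeof(val)))
		pr_info("uprobe hit, user word = %u\n", val);

	return 0;
}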
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d1d1394..61d0fa6 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -91,6 +91,7 @@ static atomic_t uprobe_events = ATOMIC_INIT(0); struct uprobe { struct rb_node rb_node; /* node in the rb tree */ atomic_t ref; + struct rw_semaphore register_rwsem; struct rw_semaphore consumer_rwsem; struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */ struct list_head pending_list; @@ -449,6 +450,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->inode = igrab(inode); uprobe->offset = offset; + init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); mutex_init(&uprobe->copy_mutex); /* For now assume that the instruction need not be single-stepped */ @@ -476,10 +478,10 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags)) return; - down_read(&uprobe->consumer_rwsem); + down_read(&uprobe->register_rwsem); for (uc = uprobe->consumers; uc; uc = uc->next) uc->handler(uc, regs); - up_read(&uprobe->consumer_rwsem); + up_read(&uprobe->register_rwsem); } static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) @@ -873,9 +875,11 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * mutex_lock(uprobes_hash(inode)); uprobe = alloc_uprobe(inode, offset); if (uprobe) { + down_write(&uprobe->register_rwsem); ret = __uprobe_register(uprobe, uc); if (ret) __uprobe_unregister(uprobe, uc); + up_write(&uprobe->register_rwsem); } mutex_unlock(uprobes_hash(inode)); if (uprobe) @@ -899,7 +903,9 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume return; mutex_lock(uprobes_hash(inode)); + down_write(&uprobe->register_rwsem); __uprobe_unregister(uprobe, uc); + up_write(&uprobe->register_rwsem); mutex_unlock(uprobes_hash(inode)); put_uprobe(uprobe); } -- cgit v0.10.2 From 1ff6fee5e62c57d5923b805bb4206acb7953f16e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 24 Nov 2012 18:15:46 +0100 Subject: uprobes: Change filter_chain() to iterate ->consumers list Now that it safe to use ->consumer_rwsem under ->mmap_sem we can almost finish the implementation of filter_chain(). It still lacks the actual uc->filter(...) call but othewrwise it is ready, just it pretends that ->filter() always returns true. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 61d0fa6..4d04523 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -614,14 +614,19 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, static bool filter_chain(struct uprobe *uprobe) { - /* - * TODO: - * for_each_consumer(uc) - * if (uc->filter(...)) - * return true; - * return false; - */ - return uprobe->consumers != NULL; + struct uprobe_consumer *uc; + bool ret = false; + + down_read(&uprobe->consumer_rwsem); + for (uc = uprobe->consumers; uc; uc = uc->next) { + /* TODO: ret = uc->filter(...) */ + ret = true; + if (ret) + break; + } + up_read(&uprobe->consumer_rwsem); + + return ret; } static int -- cgit v0.10.2 From bb929284be40cbbdb347690742557d708fd504a9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 24 Nov 2012 18:27:08 +0100 Subject: uprobes: Kill UPROBE_RUN_HANDLER flag Simply remove UPROBE_RUN_HANDLER and the corresponding code. 
It can only help if uprobe has a single consumer, and in fact it is no longer needed after handler_chain() was changed to use ->register_rwsem, we simply can not race with uprobe_register(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4d04523..7ec2eb2 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -83,10 +83,8 @@ static atomic_t uprobe_events = ATOMIC_INIT(0); /* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0 -/* Dont run handlers when first register/ last unregister in progress*/ -#define UPROBE_RUN_HANDLER 1 /* Can skip singlestep */ -#define UPROBE_SKIP_SSTEP 2 +#define UPROBE_SKIP_SSTEP 1 struct uprobe { struct rb_node rb_node; /* node in the rb tree */ @@ -475,9 +473,6 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) { struct uprobe_consumer *uc; - if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags)) - return; - down_read(&uprobe->register_rwsem); for (uc = uprobe->consumers; uc; uc = uc->next) uc->handler(uc, regs); @@ -825,13 +820,8 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc) { - int err; - consumer_add(uprobe, uc); - err = register_for_each_vma(uprobe, true); - if (!err) /* TODO: pointless unless the first consumer */ - set_bit(UPROBE_RUN_HANDLER, &uprobe->flags); - return err; + return register_for_each_vma(uprobe, true); } static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) @@ -842,12 +832,9 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u return; err = register_for_each_vma(uprobe, false); - if (!uprobe->consumers) { - clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags); - /* TODO : cant unregister? schedule a worker thread */ - if (!err) - delete_uprobe(uprobe); - } + /* TODO : cant unregister? schedule a worker thread */ + if (!uprobe->consumers && !err) + delete_uprobe(uprobe); } /* -- cgit v0.10.2 From d4d3ccc6d1eb74bd315d49a3829c5ad6c48d21b0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 24 Nov 2012 18:51:34 +0100 Subject: uprobes: Kill uprobe->copy_mutex Now that ->register_rwsem is safe under ->mmap_sem we can kill ->copy_mutex and abuse down_write(&uprobe->consumer_rwsem). This makes prepare_uprobe() even more ugly, but we should kill it anyway. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7ec2eb2..40ced98 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -91,7 +91,6 @@ struct uprobe { atomic_t ref; struct rw_semaphore register_rwsem; struct rw_semaphore consumer_rwsem; - struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */ struct list_head pending_list; struct uprobe_consumer *consumers; struct inode *inode; /* Also hold a ref to inode */ @@ -450,7 +449,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->offset = offset; init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); - mutex_init(&uprobe->copy_mutex); /* For now assume that the instruction need not be single-stepped */ __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); @@ -578,7 +576,8 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) return ret; - mutex_lock(&uprobe->copy_mutex); + /* TODO: move this into _register, until then we abuse this sem. 
*/ + down_write(&uprobe->consumer_rwsem); if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) goto out; @@ -602,7 +601,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, set_bit(UPROBE_COPY_INSN, &uprobe->flags); out: - mutex_unlock(&uprobe->copy_mutex); + up_write(&uprobe->consumer_rwsem); return ret; } -- cgit v0.10.2 From 441f1eb7db8babe2b6b4bc805f023739dbb70e33 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 25 Nov 2012 19:54:29 +0100 Subject: uprobes: Kill uprobe_events, use RB_EMPTY_ROOT() instead uprobe_events counts the number of uprobes in uprobes_tree but it is used as a boolean. We can use RB_EMPTY_ROOT() instead. Probably no_uprobe_events() added by this patch can have more callers, say, mmf_recalc_uprobes(). Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Acked-by: Srikar Dronamraju diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 40ced98..5d38b40 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -41,6 +41,11 @@ #define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE static struct rb_root uprobes_tree = RB_ROOT; +/* + * allows us to skip the uprobe_mmap if there are no uprobe events active + * at this time. Probably a fine grained per inode count is better? + */ +#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree) static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ @@ -74,13 +79,6 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; static struct percpu_rw_semaphore dup_mmap_sem; -/* - * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe - * events active at this time. Probably a fine grained per inode count is - * better? - */ -static atomic_t uprobe_events = ATOMIC_INIT(0); - /* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0 /* Can skip singlestep */ @@ -460,8 +458,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) kfree(uprobe); uprobe = cur_uprobe; iput(inode); - } else { - atomic_inc(&uprobe_events); } return uprobe; @@ -685,7 +681,6 @@ static void delete_uprobe(struct uprobe *uprobe) spin_unlock(&uprobes_treelock); iput(uprobe->inode); put_uprobe(uprobe); - atomic_dec(&uprobe_events); } struct map_info { @@ -975,7 +970,7 @@ int uprobe_mmap(struct vm_area_struct *vma) struct uprobe *uprobe, *u; struct inode *inode; - if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) + if (no_uprobe_events() || !valid_vma(vma, true)) return 0; inode = vma->vm_file->f_mapping->host; @@ -1021,7 +1016,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e */ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) + if (no_uprobe_events() || !valid_vma(vma, false)) return; if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ -- cgit v0.10.2 From 06b7bcd8cbd7eb1af331e437ec3d8f5182ae1b7e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 25 Nov 2012 22:01:42 +0100 Subject: uprobes: Introduce uprobe_is_active() The lifetime of uprobe->rb_node and uprobe->inode is not refcounted, delete_uprobe() is called when we detect that uprobe has no consumers, and it would be deadly wrong to do this twice. Change delete_uprobe() to WARN() if it was already called. We use RB_CLEAR_NODE() to mark uprobe "inactive", then RB_EMPTY_NODE() can be used to detect this case. RB_EMPTY_NODE() is not used directly, we add the trivial helper for the next change. 
Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Acked-by: Srikar Dronamraju diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 5d38b40..358badd 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -669,6 +669,10 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad return set_orig_insn(&uprobe->arch, mm, vaddr); } +static inline bool uprobe_is_active(struct uprobe *uprobe) +{ + return !RB_EMPTY_NODE(&uprobe->rb_node); +} /* * There could be threads that have already hit the breakpoint. They * will recheck the current insn and restart if find_uprobe() fails. @@ -676,9 +680,13 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad */ static void delete_uprobe(struct uprobe *uprobe) { + if (WARN_ON(!uprobe_is_active(uprobe))) + return; + spin_lock(&uprobes_treelock); rb_erase(&uprobe->rb_node, &uprobes_tree); spin_unlock(&uprobes_treelock); + RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */ iput(uprobe->inode); put_uprobe(uprobe); } -- cgit v0.10.2 From 66d06dffa5ef6f3544997440af63a91ef36a2171 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 25 Nov 2012 22:48:37 +0100 Subject: uprobes: Kill uprobes_mutex[], separate alloc_uprobe() and __uprobe_register() uprobe_register() and uprobe_unregister() are the only users of mutex_lock(uprobes_hash(inode)), and the only reason why we can't simply remove it is that we need to ensure that delete_uprobe() is not possible after alloc_uprobe() and before consumer_add(). IOW, we need to ensure that when we take uprobe->register_rwsem this uprobe is still valid and we didn't race with _unregister() which called delete_uprobe() in between. With this patch uprobe_register() simply checks uprobe_is_active() and retries if it hits this very unlikely race. uprobes_mutex[] is no longer needed and can be removed. There is another reason for this change, prepare_uprobe() should be folded into alloc_uprobe() and we do not want to hold the extra locks around read_mapping_page/etc. Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Acked-by: Srikar Dronamraju diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 358badd..c3b65d1 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -50,29 +50,6 @@ static struct rb_root uprobes_tree = RB_ROOT; static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ #define UPROBES_HASH_SZ 13 - -/* - * We need separate register/unregister and mmap/munmap lock hashes because - * of mmap_sem nesting. - * - * uprobe_register() needs to install probes on (potentially) all processes - * and thus needs to acquire multiple mmap_sems (consequtively, not - * concurrently), whereas uprobe_mmap() is called while holding mmap_sem - * for the particular process doing the mmap. - * - * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem - * because of lock order against i_mmap_mutex. This means there's a hole in - * the register vma iteration where a mmap() can happen. - * - * Thus uprobe_register() can race with uprobe_mmap() and we can try and - * install a probe where one is already installed. 
- */ - -/* serialize (un)register */ -static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; - -#define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) - /* serialize uprobe->pending_list */ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) @@ -865,20 +842,26 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * if (offset > i_size_read(inode)) return -EINVAL; - ret = -ENOMEM; - mutex_lock(uprobes_hash(inode)); + retry: uprobe = alloc_uprobe(inode, offset); - if (uprobe) { - down_write(&uprobe->register_rwsem); + if (!uprobe) + return -ENOMEM; + /* + * We can race with uprobe_unregister()->delete_uprobe(). + * Check uprobe_is_active() and retry if it is false. + */ + down_write(&uprobe->register_rwsem); + ret = -EAGAIN; + if (likely(uprobe_is_active(uprobe))) { ret = __uprobe_register(uprobe, uc); if (ret) __uprobe_unregister(uprobe, uc); - up_write(&uprobe->register_rwsem); } - mutex_unlock(uprobes_hash(inode)); - if (uprobe) - put_uprobe(uprobe); + up_write(&uprobe->register_rwsem); + put_uprobe(uprobe); + if (unlikely(ret == -EAGAIN)) + goto retry; return ret; } @@ -896,11 +879,9 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume if (!uprobe) return; - mutex_lock(uprobes_hash(inode)); down_write(&uprobe->register_rwsem); __uprobe_unregister(uprobe, uc); up_write(&uprobe->register_rwsem); - mutex_unlock(uprobes_hash(inode)); put_uprobe(uprobe); } @@ -1609,10 +1590,8 @@ static int __init init_uprobes(void) { int i; - for (i = 0; i < UPROBES_HASH_SZ; i++) { - mutex_init(&uprobes_mutex[i]); + for (i = 0; i < UPROBES_HASH_SZ; i++) mutex_init(&uprobes_mmap_mutex[i]); - } if (percpu_init_rwsem(&dup_mmap_sem)) return -ENOMEM; -- cgit v0.10.2 From 806a98bdf2a862fef0fc880399d677b35ba525ff Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 27 Dec 2012 18:21:11 +0100 Subject: uprobes: Rationalize the usage of filter_chain() filter_chain() was added into install_breakpoint/remove_breakpoint to simplify the initial changes but this is sub-optimal. This patch shifts the callsite to the callers, register_for_each_vma() and uprobe_mmap(). This way: - It will be easier to add the new arguments. This is the main reason, we can do more optimizations later. - register_for_each_vma(is_register => true) can be optimized, we only need to consult the new consumer. The previous consumers were already asked when they called uprobe_register(). This patch also moves the MMF_HAS_UPROBES check from remove_breakpoint(), this allows to avoid the potentionally costly filter_chain(). Note that register_for_each_vma(is_register => false) doesn't really need to take ->consumer_rwsem, but I don't think it makes sense to optimize this and introduce filter_chain_lockless(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c3b65d1..3391208 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -579,6 +579,11 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, return ret; } +static inline bool consumer_filter(struct uprobe_consumer *uc) +{ + return true; /* TODO: !uc->filter || uc->filter(...) 
*/ +} + static bool filter_chain(struct uprobe *uprobe) { struct uprobe_consumer *uc; @@ -586,8 +591,7 @@ static bool filter_chain(struct uprobe *uprobe) down_read(&uprobe->consumer_rwsem); for (uc = uprobe->consumers; uc; uc = uc->next) { - /* TODO: ret = uc->filter(...) */ - ret = true; + ret = consumer_filter(uc); if (ret) break; } @@ -603,15 +607,6 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, bool first_uprobe; int ret; - /* - * If probe is being deleted, unregister thread could be done with - * the vma-rmap-walk through. Adding a probe now can be fatal since - * nobody will be able to cleanup. But in this case filter_chain() - * must return false, all consumers have gone away. - */ - if (!filter_chain(uprobe)) - return 0; - ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); if (ret) return ret; @@ -636,12 +631,6 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, static int remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) { - if (!test_bit(MMF_HAS_UPROBES, &mm->flags)) - return 0; - - if (filter_chain(uprobe)) - return 0; - set_bit(MMF_RECALC_UPROBES, &mm->flags); return set_orig_insn(&uprobe->arch, mm, vaddr); } @@ -781,10 +770,14 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) vaddr_to_offset(vma, info->vaddr) != uprobe->offset) goto unlock; - if (is_register) - err = install_breakpoint(uprobe, mm, vma, info->vaddr); - else - err |= remove_breakpoint(uprobe, mm, info->vaddr); + if (is_register) { + /* consult only the "caller", new consumer. */ + if (consumer_filter(uprobe->consumers)) + err = install_breakpoint(uprobe, mm, vma, info->vaddr); + } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { + if (!filter_chain(uprobe)) + err |= remove_breakpoint(uprobe, mm, info->vaddr); + } unlock: up_write(&mm->mmap_sem); @@ -968,9 +961,14 @@ int uprobe_mmap(struct vm_area_struct *vma) mutex_lock(uprobes_mmap_hash(inode)); build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); - + /* + * We can race with uprobe_unregister(), this uprobe can be already + * removed. But in this case filter_chain() must return false, all + * consumers have gone away. + */ list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { - if (!fatal_signal_pending(current)) { + if (!fatal_signal_pending(current) && + filter_chain(uprobe)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); } -- cgit v0.10.2 From 8a7f2fa0dea3b019500961b86d765e6fdd4bffb2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 28 Dec 2012 17:58:38 +0100 Subject: uprobes: Reintroduce uprobe_consumer->filter() Finally add uprobe_consumer->filter() and change consumer_filter() to actually call this method. Note that ->filter() accepts mm_struct, not task_struct. Because: 1. We do not have for_each_mm_user(mm, task). 2. Even if we implement for_each_mm_user(), ->filter() can use it itself. 3. It is not clear who will actually need this interface to do the "nontrivial" filtering. Another argument is "enum uprobe_filter_ctx", consumer->filter() can use it to figure out why/where it was called. For example, perhaps we can add UPROBE_FILTER_PRE_REGISTER used by build_map_info() to quickly "nack" the unwanted mm's. In this case consumer should know that it is called under ->i_mmap_mutex. See the previous discussion at http://marc.info/?t=135214229700002 Perhaps we should pass more arguments, vma/vaddr? 
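As an illustration of the intended use, a hypothetical out-of-tree consumer could look like the sketch below (my_target_mm, my_filter, my_handler, my_consumer are made-up names; only the ->filter() prototype and uprobe_register() come from this series):

static struct mm_struct *my_target_mm;	/* set by the tool elsewhere */

static bool my_filter(struct uprobe_consumer *self,
		      enum uprobe_filter_ctx ctx, struct mm_struct *mm)
{
	/* install/keep the breakpoint only in the mm we care about */
	return mm == my_target_mm;
}

static int my_handler(struct uprobe_consumer *self, struct pt_regs *regs)
{
	pr_info("probe hit\n");
	return 0;
}

static struct uprobe_consumer my_consumer = {
	.handler = my_handler,
	.filter	 = my_filter,
};

Registration does not change: uprobe_register(inode, offset, &my_consumer). And since consumer_filter() treats a NULL ->filter as "always true", existing consumers need no update.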
Note: this patch obviously can't help to filter out the child created by fork(), this will be addressed later. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 83742b9..c2df693 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -35,8 +35,17 @@ struct inode; # include #endif +enum uprobe_filter_ctx { + UPROBE_FILTER_REGISTER, + UPROBE_FILTER_UNREGISTER, + UPROBE_FILTER_MMAP, +}; + struct uprobe_consumer { int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs); + bool (*filter)(struct uprobe_consumer *self, + enum uprobe_filter_ctx ctx, + struct mm_struct *mm); struct uprobe_consumer *next; }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 3391208..c2737be 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -579,19 +579,21 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, return ret; } -static inline bool consumer_filter(struct uprobe_consumer *uc) +static inline bool consumer_filter(struct uprobe_consumer *uc, + enum uprobe_filter_ctx ctx, struct mm_struct *mm) { - return true; /* TODO: !uc->filter || uc->filter(...) */ + return !uc->filter || uc->filter(uc, ctx, mm); } -static bool filter_chain(struct uprobe *uprobe) +static bool filter_chain(struct uprobe *uprobe, + enum uprobe_filter_ctx ctx, struct mm_struct *mm) { struct uprobe_consumer *uc; bool ret = false; down_read(&uprobe->consumer_rwsem); for (uc = uprobe->consumers; uc; uc = uc->next) { - ret = consumer_filter(uc); + ret = consumer_filter(uc, ctx, mm); if (ret) break; } @@ -772,10 +774,12 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) if (is_register) { /* consult only the "caller", new consumer. */ - if (consumer_filter(uprobe->consumers)) + if (consumer_filter(uprobe->consumers, + UPROBE_FILTER_REGISTER, mm)) err = install_breakpoint(uprobe, mm, vma, info->vaddr); } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { - if (!filter_chain(uprobe)) + if (!filter_chain(uprobe, + UPROBE_FILTER_UNREGISTER, mm)) err |= remove_breakpoint(uprobe, mm, info->vaddr); } @@ -968,7 +972,7 @@ int uprobe_mmap(struct vm_area_struct *vma) */ list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { if (!fatal_signal_pending(current) && - filter_chain(uprobe)) { + filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); } -- cgit v0.10.2 From da1816b1caeccdff04531e763bb35d7caa3ed19f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 29 Dec 2012 17:49:11 +0100 Subject: uprobes: Teach handler_chain() to filter out the probed task Currrently the are 2 problems with pre-filtering: 1. It is not possible to add/remove a task (mm) after uprobe_register() 2. A forked child inherits all breakpoints and uprobe_consumer can not control this. This patch does the first step to improve the filtering. handler_chain() removes the breakpoints installed by this uprobe from current->mm if all handlers return UPROBE_HANDLER_REMOVE. Note that handler_chain() relies on ->register_rwsem to avoid the race with uprobe_register/unregister which can add/del a consumer, or even remove and then insert the new uprobe at the same address. Perhaps we will add uprobe_apply_mm(uprobe, mm, is_register) and teach copy_mm() to do filter(UPROBE_FILTER_FORK), but I think this change makes sense anyway. 
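For example, a hypothetical consumer could stop tracing once it has seen enough hits (a sketch; my_hits_left is made up, and the breakpoints are unapplied from current->mm only if every consumer returns UPROBE_HANDLER_REMOVE):

static atomic_t my_hits_left = ATOMIC_INIT(16);

static int my_handler(struct uprobe_consumer *self, struct pt_regs *regs)
{
	/* after enough hits, ask to remove our breakpoints from this mm */
	if (atomic_dec_return(&my_hits_left) <= 0)
		return UPROBE_HANDLER_REMOVE;

	return 0;
}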
Note: instead of checking the retcode from uc->handler, we could add uc->filter(UPROBE_FILTER_BPHIT). But I think this is not optimal to call 2 hooks in a row. This buys nothing, and if handler/filter do something nontrivial they will probably do the same work twice. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index c2df693..95d0002 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -35,6 +35,9 @@ struct inode; # include #endif +#define UPROBE_HANDLER_REMOVE 1 +#define UPROBE_HANDLER_MASK 1 + enum uprobe_filter_ctx { UPROBE_FILTER_REGISTER, UPROBE_FILTER_UNREGISTER, diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c2737be..04c104a 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -440,16 +440,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) return uprobe; } -static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) -{ - struct uprobe_consumer *uc; - - down_read(&uprobe->register_rwsem); - for (uc = uprobe->consumers; uc; uc = uc->next) - uc->handler(uc, regs); - up_read(&uprobe->register_rwsem); -} - static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) { down_write(&uprobe->consumer_rwsem); @@ -882,6 +872,33 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume put_uprobe(uprobe); } +static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) +{ + struct vm_area_struct *vma; + int err = 0; + + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + unsigned long vaddr; + loff_t offset; + + if (!valid_vma(vma, false) || + vma->vm_file->f_mapping->host != uprobe->inode) + continue; + + offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; + if (uprobe->offset < offset || + uprobe->offset >= offset + vma->vm_end - vma->vm_start) + continue; + + vaddr = offset_to_vaddr(vma, uprobe->offset); + err |= remove_breakpoint(uprobe, mm, vaddr); + } + up_read(&mm->mmap_sem); + + return err; +} + static struct rb_node * find_node_in_range(struct inode *inode, loff_t min, loff_t max) { @@ -1435,6 +1452,27 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) return uprobe; } +static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) +{ + struct uprobe_consumer *uc; + int remove = UPROBE_HANDLER_REMOVE; + + down_read(&uprobe->register_rwsem); + for (uc = uprobe->consumers; uc; uc = uc->next) { + int rc = uc->handler(uc, regs); + + WARN(rc & ~UPROBE_HANDLER_MASK, + "bad rc=0x%x from %pf()\n", rc, uc->handler); + remove &= rc; + } + + if (remove && uprobe->consumers) { + WARN_ON(!uprobe_is_active(uprobe)); + unapply_uprobe(uprobe, current->mm); + } + up_read(&uprobe->register_rwsem); +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. -- cgit v0.10.2 From cf31ec3f7fece93f3fce3ee5964e27857141ea47 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Dec 2012 15:21:22 +0100 Subject: uprobes/x86: Change __skip_sstep() to actually skip the whole insn __skip_sstep() doesn't update regs->ip. Currently this is correct but only "by accident" and it doesn't skip the whole insn. Change it to advance ->ip by the length of the detected 0x66*0x90 sequence. 
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index c71025b..4e33a35d 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -680,8 +680,11 @@ static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) if (auprobe->insn[i] == 0x66) continue; - if (auprobe->insn[i] == 0x90) + if (auprobe->insn[i] == 0x90) { + regs->ip = uprobe_get_swbp_addr(regs); + regs->ip += i + 1; return true; + } break; } -- cgit v0.10.2 From 74e59dfc6b19e3472a7c16ad57bc831e6e647895 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Dec 2012 15:54:08 +0100 Subject: uprobes: Change handle_swbp() to expose bp_vaddr to handler_chain() Change handle_swbp() to set regs->ip = bp_vaddr in advance, this is what consumer->handler() needs but uprobe_get_swbp_addr() is not exported. This also simplifies the code and makes it more consistent across the supported architectures. handle_swbp() becomes the only caller of uprobe_get_swbp_addr(). Signed-off-by: Oleg Nesterov Acked-by: Ananth N Mavinakayanahalli diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 4e33a35d..0ba4cfb 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -681,7 +681,6 @@ static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) continue; if (auprobe->insn[i] == 0x90) { - regs->ip = uprobe_get_swbp_addr(regs); regs->ip += i + 1; return true; } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 04c104a..f1b8078 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1504,6 +1504,10 @@ static void handle_swbp(struct pt_regs *regs) } return; } + + /* change it in advance for ->handler() and restart */ + instruction_pointer_set(regs, bp_vaddr); + /* * TODO: move copy_insn/etc into _register and remove this hack. * After we hit the bp, _unregister + _register can install the @@ -1511,14 +1515,14 @@ static void handle_swbp(struct pt_regs *regs) */ smp_rmb(); /* pairs with wmb() in install_breakpoint() */ if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) - goto restart; + goto out; utask = current->utask; if (!utask) { utask = add_utask(); /* Cannot allocate; re-execute the instruction. */ if (!utask) - goto restart; + goto out; } handler_chain(uprobe, regs); @@ -1531,12 +1535,7 @@ static void handle_swbp(struct pt_regs *regs) return; } -restart: - /* - * cannot singlestep; cannot skip instruction; - * re-execute the instruction. 
- */ - instruction_pointer_set(regs, bp_vaddr); + /* can_skip_sstep() succeeded, or restart if can't singlestep */ out: put_uprobe(uprobe); } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index e668024..17d9b2b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -492,7 +492,7 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) return; entry = ring_buffer_event_data(event); - entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); + entry->ip = instruction_pointer(task_pt_regs(current)); data = (u8 *)&entry[1]; for (i = 0; i < tu->nr_args; i++) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); @@ -667,7 +667,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) if (!entry) goto out; - entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); + entry->ip = instruction_pointer(task_pt_regs(current)); data = (u8 *)&entry[1]; for (i = 0; i < tu->nr_args; i++) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); -- cgit v0.10.2 From c8a82538001e1a68f4a319d5a75de90d1f284731 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Dec 2012 17:40:39 +0100 Subject: uprobes: Move alloc_page() from xol_add_vma() to xol_alloc_area() Move alloc_page() from xol_add_vma() to xol_alloc_area() to cleanup the code. This separates the memory allocations and consolidates the -EALREADY cleanups and the error handling. Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Acked-by: Srikar Dronamraju diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f1b8078..ea2e2a8 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1041,22 +1041,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon /* Slot allocation for XOL */ static int xol_add_vma(struct xol_area *area) { - struct mm_struct *mm; - int ret; - - area->page = alloc_page(GFP_HIGHUSER); - if (!area->page) - return -ENOMEM; - - ret = -EALREADY; - mm = current->mm; + struct mm_struct *mm = current->mm; + int ret = -EALREADY; down_write(&mm->mmap_sem); if (mm->uprobes_state.xol_area) goto fail; ret = -ENOMEM; - /* Try to map as high as possible, this is only a hint. */ area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); if (area->vaddr & ~PAGE_MASK) { @@ -1072,11 +1064,8 @@ static int xol_add_vma(struct xol_area *area) smp_wmb(); /* pairs with get_xol_area() */ mm->uprobes_state.xol_area = area; ret = 0; - -fail: + fail: up_write(&mm->mmap_sem); - if (ret) - __free_page(area->page); return ret; } @@ -1104,21 +1093,26 @@ static struct xol_area *xol_alloc_area(void) area = kzalloc(sizeof(*area), GFP_KERNEL); if (unlikely(!area)) - return NULL; + goto out; area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL); - if (!area->bitmap) - goto fail; + goto free_area; + + area->page = alloc_page(GFP_HIGHUSER); + if (!area->page) + goto free_bitmap; init_waitqueue_head(&area->wq); if (!xol_add_vma(area)) return area; -fail: + __free_page(area->page); + free_bitmap: kfree(area->bitmap); + free_area: kfree(area); - + out: return get_xol_area(current->mm); } -- cgit v0.10.2 From 9b545df809644912552360054c7bbe8b8a9e01fa Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Dec 2012 16:39:49 +0100 Subject: uprobes: Fold xol_alloc_area() into get_xol_area() Currently only xol_get_insn_slot() does get_xol_area() + xol_alloc_area(), but this will have more users and we do not want to copy-and-paste this code. 
This patch simply moves xol_alloc_area() into get_xol_area() to simplify the current and future code. Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Acked-by: Srikar Dronamraju diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ea2e2a8..7e3e5c5 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1070,27 +1070,21 @@ static int xol_add_vma(struct xol_area *area) return ret; } -static struct xol_area *get_xol_area(struct mm_struct *mm) -{ - struct xol_area *area; - - area = mm->uprobes_state.xol_area; - smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ - - return area; -} - /* - * xol_alloc_area - Allocate process's xol_area. - * This area will be used for storing instructions for execution out of - * line. + * get_xol_area - Allocate process's xol_area if necessary. + * This area will be used for storing instructions for execution out of line. * * Returns the allocated area or NULL. */ -static struct xol_area *xol_alloc_area(void) +static struct xol_area *get_xol_area(void) { + struct mm_struct *mm = current->mm; struct xol_area *area; + area = mm->uprobes_state.xol_area; + if (area) + goto ret; + area = kzalloc(sizeof(*area), GFP_KERNEL); if (unlikely(!area)) goto out; @@ -1113,7 +1107,10 @@ static struct xol_area *xol_alloc_area(void) free_area: kfree(area); out: - return get_xol_area(current->mm); + area = mm->uprobes_state.xol_area; + ret: + smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ + return area; } /* @@ -1189,14 +1186,11 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot unsigned long offset; void *vaddr; - area = get_xol_area(current->mm); - if (!area) { - area = xol_alloc_area(); - if (!area) - return 0; - } - current->utask->xol_vaddr = xol_take_insn_slot(area); + area = get_xol_area(); + if (!area) + return 0; + current->utask->xol_vaddr = xol_take_insn_slot(area); /* * Initialize the slot if xol_vaddr points to valid * instruction slot. -- cgit v0.10.2 From 5a2df662aafdabffb2cf3adb780a5adf66dfb3bc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Dec 2012 17:03:32 +0100 Subject: uprobes: Turn add_utask() into get_utask() Rename add_utask() into get_utask() and change it to allocate on demand to simplify the caller. Like get_xol_area() it will have more users. Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Acked-by: Srikar Dronamraju diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7e3e5c5..16e54d6 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1290,23 +1290,18 @@ void uprobe_copy_process(struct task_struct *t) } /* - * Allocate a uprobe_task object for the task. - * Called when the thread hits a breakpoint for the first time. + * Allocate a uprobe_task object for the task if if necessary. + * Called when the thread hits a breakpoint. * * Returns: * - pointer to new uprobe_task on success * - NULL otherwise */ -static struct uprobe_task *add_utask(void) +static struct uprobe_task *get_utask(void) { - struct uprobe_task *utask; - - utask = kzalloc(sizeof *utask, GFP_KERNEL); - if (unlikely(!utask)) - return NULL; - - current->utask = utask; - return utask; + if (!current->utask) + current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); + return current->utask; } /* Prepare to single-step probed instruction out of line. 
*/ @@ -1505,13 +1500,9 @@ static void handle_swbp(struct pt_regs *regs) if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) goto out; - utask = current->utask; - if (!utask) { - utask = add_utask(); - /* Cannot allocate; re-execute the instruction. */ - if (!utask) - goto out; - } + utask = get_utask(); + if (!utask) + goto out; /* re-execute the instruction. */ handler_chain(uprobe, regs); if (can_skip_sstep(uprobe, regs)) -- cgit v0.10.2 From a6cb3f6d51253e9cf21a38b17c025018117809d7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Dec 2012 18:00:06 +0100 Subject: uprobes: Do not play with utask in xol_get_insn_slot() pre_ssout()->xol_get_insn_slot() path is confusing and buggy. This patch cleanups the code, the next one fixes the bug. Change xol_get_insn_slot() to only allocate the slot and do nothing more, move the initialization of utask->xol_vaddr/vaddr into pre_ssout(). Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Acked-by: Srikar Dronamraju diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 16e54d6..8d9c5bc 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1176,30 +1176,26 @@ static unsigned long xol_take_insn_slot(struct xol_area *area) } /* - * xol_get_insn_slot - If was not allocated a slot, then - * allocate a slot. + * xol_get_insn_slot - allocate a slot for xol. * Returns the allocated slot address or 0. */ -static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr) +static unsigned long xol_get_insn_slot(struct uprobe *uprobe) { struct xol_area *area; unsigned long offset; + unsigned long xol_vaddr; void *vaddr; area = get_xol_area(); if (!area) return 0; - current->utask->xol_vaddr = xol_take_insn_slot(area); - /* - * Initialize the slot if xol_vaddr points to valid - * instruction slot. - */ - if (unlikely(!current->utask->xol_vaddr)) + xol_vaddr = xol_take_insn_slot(area); + if (unlikely(!xol_vaddr)) return 0; - current->utask->vaddr = slot_addr; - offset = current->utask->xol_vaddr & ~PAGE_MASK; + /* Initialize the slot */ + offset = xol_vaddr & ~PAGE_MASK; vaddr = kmap_atomic(area->page); memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); kunmap_atomic(vaddr); @@ -1209,7 +1205,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot */ flush_dcache_page(area->page); - return current->utask->xol_vaddr; + return xol_vaddr; } /* @@ -1306,12 +1302,21 @@ static struct uprobe_task *get_utask(void) /* Prepare to single-step probed instruction out of line. */ static int -pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr) +pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) { - if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs)) - return 0; + struct uprobe_task *utask; + unsigned long xol_vaddr; + + utask = current->utask; + + xol_vaddr = xol_get_insn_slot(uprobe); + if (!xol_vaddr) + return -ENOMEM; + + utask->xol_vaddr = xol_vaddr; + utask->vaddr = bp_vaddr; - return -EFAULT; + return arch_uprobe_pre_xol(&uprobe->arch, regs); } /* -- cgit v0.10.2 From aba51024e7159c93914557caaa2b8cda26331091 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Dec 2012 18:12:48 +0100 Subject: uprobes: Fix utask->xol_vaddr leak in pre_ssout() pre_ssout() should do xol_free_insn_slot() if arch_uprobe_pre_xol() fails, otherwise nobody will free the allocated slot. 
Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Acked-by: Srikar Dronamraju diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8d9c5bc..0527379 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1306,6 +1306,7 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) { struct uprobe_task *utask; unsigned long xol_vaddr; + int err; utask = current->utask; @@ -1316,7 +1317,13 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) utask->xol_vaddr = xol_vaddr; utask->vaddr = bp_vaddr; - return arch_uprobe_pre_xol(&uprobe->arch, regs); + err = arch_uprobe_pre_xol(&uprobe->arch, regs); + if (unlikely(err)) { + xol_free_insn_slot(current); + return err; + } + + return 0; } /* -- cgit v0.10.2 From 608e7427c0a06de0d70374a9fd7defc8eb228b7e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Dec 2012 18:20:42 +0100 Subject: uprobes: Do not allocate current->utask unnecessary handle_swbp() does get_utask() before can_skip_sstep() for no reason, we do not need ->utask if can_skip_sstep() succeeds. Move get_utask() to pre_ssout() who actually starts to use it. Move the initialization of utask->active_uprobe/state as well. This way the whole initialization is consolidated in pre_ssout(). Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0527379..071edcb 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1308,7 +1308,9 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) unsigned long xol_vaddr; int err; - utask = current->utask; + utask = get_utask(); + if (!utask) + return -ENOMEM; xol_vaddr = xol_get_insn_slot(uprobe); if (!xol_vaddr) @@ -1323,6 +1325,8 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) return err; } + utask->active_uprobe = uprobe; + utask->state = UTASK_SSTEP; return 0; } @@ -1474,7 +1478,6 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) */ static void handle_swbp(struct pt_regs *regs) { - struct uprobe_task *utask; struct uprobe *uprobe; unsigned long bp_vaddr; int uninitialized_var(is_swbp); @@ -1512,19 +1515,12 @@ static void handle_swbp(struct pt_regs *regs) if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) goto out; - utask = get_utask(); - if (!utask) - goto out; /* re-execute the instruction. */ - handler_chain(uprobe, regs); if (can_skip_sstep(uprobe, regs)) goto out; - if (!pre_ssout(uprobe, regs, bp_vaddr)) { - utask->active_uprobe = uprobe; - utask->state = UTASK_SSTEP; + if (!pre_ssout(uprobe, regs, bp_vaddr)) return; - } /* can_skip_sstep() succeeded, or restart if can't singlestep */ out: -- cgit v0.10.2 From af4355e91f15812df8608925738c91be57c580dd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Dec 2012 18:37:11 +0100 Subject: uprobes: Kill the bogus IS_ERR_VALUE(xol_vaddr) check utask->xol_vaddr is either zero or valid, remove the bogus IS_ERR_VALUE() check in xol_free_insn_slot(). 
Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Acked-by: Srikar Dronamraju diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 071edcb..f6c7062 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1223,8 +1223,7 @@ static void xol_free_insn_slot(struct task_struct *tsk) return; slot_addr = tsk->utask->xol_vaddr; - - if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr))) + if (unlikely(!slot_addr)) return; area = tsk->mm->uprobes_state.xol_area; -- cgit v0.10.2 From e8440c1458ba571bc3fac8a6beb53ff604199f5b Mon Sep 17 00:00:00 2001 From: Josh Stone Date: Sun, 13 Jan 2013 19:03:34 +0100 Subject: uprobes: Add exports for module use The original pull message for uprobes (commit 654443e2) noted: This tree includes uprobes support in 'perf probe' - but SystemTap (and other tools) can take advantage of user probe points as well. In order to actually be usable in module-based tools like SystemTap, the interface needs to be exported. This patch first adds the obvious exports for uprobe_register and uprobe_unregister. Then it also adds one for task_user_regset_view, which is necessary to get the correct state of userspace registers. Signed-off-by: Josh Stone Signed-off-by: Oleg Nesterov diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f6c7062..221fc58 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -27,6 +27,7 @@ #include /* read_mapping_page */ #include #include +#include #include /* anon_vma_prepare */ #include /* set_pte_at_notify */ #include /* try_to_free_swap */ @@ -851,6 +852,7 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * goto retry; return ret; } +EXPORT_SYMBOL_GPL(uprobe_register); /* * uprobe_unregister - unregister a already registered probe. @@ -871,6 +873,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume up_write(&uprobe->register_rwsem); put_uprobe(uprobe); } +EXPORT_SYMBOL_GPL(uprobe_unregister); static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) { diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 6cbeaae..acbd284 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -712,6 +712,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type, kiov->iov_len, kiov->iov_base); } +/* + * This is declared in linux/regset.h and defined in machine-dependent + * code. We put the export here, near the primary machine-neutral use, + * to ensure no machine forgets it. + */ +EXPORT_SYMBOL_GPL(task_user_regset_view); #endif int ptrace_request(struct task_struct *child, long request, -- cgit v0.10.2 From 84d7ed799fd6c1366547d88ddb8188c65de3b94f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 27 Jan 2013 18:20:45 +0100 Subject: uprobes/tracing: Fix dentry/mount leak in create_trace_uprobe() create_trace_uprobe() does kern_path() to find ->d_inode, but forgets to do path_put(). We can do this right after igrab(). 
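Roughly, the resulting pattern is the following (a hypothetical helper only to show the ordering; the real change is the small reshuffle in the diff below):

static struct inode *get_target_inode(const char *filename)
{
	struct path path;
	struct inode *inode;

	if (kern_path(filename, LOOKUP_FOLLOW, &path))
		return NULL;

	inode = igrab(path.dentry->d_inode);	/* pin the inode */
	path_put(&path);			/* drop dentry/mount refs right away */

	return inode;				/* may be NULL, callers must check */
}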
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 17d9b2b..06c22ba 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -253,16 +253,18 @@ static int create_trace_uprobe(int argc, char **argv) if (ret) goto fail_address_parse; - ret = kstrtoul(arg, 0, &offset); - if (ret) - goto fail_address_parse; - inode = igrab(path.dentry->d_inode); + path_put(&path); + if (!S_ISREG(inode->i_mode)) { ret = -EINVAL; goto fail_address_parse; } + ret = kstrtoul(arg, 0, &offset); + if (ret) + goto fail_address_parse; + argc -= 2; argv += 2; -- cgit v0.10.2 From 4161824f18ff4f56f46595a4016c7315dd0d24f1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 27 Jan 2013 18:36:24 +0100 Subject: uprobes/tracing: Fully initialize uprobe_trace_consumer before uprobe_register() probe_event_enable() does uprobe_register() and only after that sets utc->tu and tu->consumer/flags. This can race with uprobe_dispatcher() which can miss these assignments or see them out of order. Nothing really bad can happen, but this doesn't look clean/safe. And this does not allow to use uprobe_consumer->filter() we are going to add, it is called by uprobe_register() and it needs utc->tu. Change this code to initialize everything before uprobe_register(), and reset tu->consumer/flags if it fails. We can't race with event_disable(), the caller holds event_mutex, and if we could the code would be wrong anyway. In fact I think uprobe_trace_consumer should die, it buys nothing but complicates the code. We can simply add uprobe_consumer into trace_uprobe. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 06c22ba..15b8ece 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -552,17 +552,18 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) return -EINTR; utc->cons.handler = uprobe_dispatcher; + utc->tu = tu; + tu->consumer = utc; + tu->flags |= flag; + ret = uprobe_register(tu->inode, tu->offset, &utc->cons); if (ret) { + tu->consumer = NULL; + tu->flags &= ~flag; kfree(utc); - return ret; } - tu->flags |= flag; - utc->tu = tu; - tu->consumer = utc; - - return 0; + return ret; } static void probe_event_disable(struct trace_uprobe *tu, int flag) -- cgit v0.10.2 From 7e4e28c53963e6cfa94d8109bb8f5233c5659048 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 28 Jan 2013 17:08:47 +0100 Subject: uprobes/tracing: Ensure inode != NULL in create_trace_uprobe() probe_event_enable/disable() check tu->inode != NULL at the start. This is ugly, if igrab() can fail create_trace_uprobe() should not succeed and "postpone" the failure. And S_ISREG(inode->i_mode) check added by d24d7dbf is not safe. Note: alloc_uprobe() should probably check igrab() != NULL as well. 
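After the path_put() fix above and the igrab()/S_ISREG() check this patch adds, the relevant part of create_trace_uprobe() comes out roughly as follows. This is a sketch pieced together from the hunks; the kern_path() call site itself is not shown in the patches and is assumed.

        ret = kern_path(filename, LOOKUP_FOLLOW, &path);       /* assumed call site */
        if (ret)
                goto fail_address_parse;

        inode = igrab(path.dentry->d_inode);
        path_put(&path);        /* the dentry/mnt refs are not needed once the inode is pinned */

        if (!inode || !S_ISREG(inode->i_mode)) {
                ret = -EINVAL;
                goto fail_address_parse;
        }

        ret = kstrtoul(arg, 0, &offset);
        if (ret)
                goto fail_address_parse;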
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 15b8ece..f7838cf 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -256,7 +256,7 @@ static int create_trace_uprobe(int argc, char **argv) inode = igrab(path.dentry->d_inode); path_put(&path); - if (!S_ISREG(inode->i_mode)) { + if (!inode || !S_ISREG(inode->i_mode)) { ret = -EINVAL; goto fail_address_parse; } @@ -544,7 +544,7 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) struct uprobe_trace_consumer *utc; int ret = 0; - if (!tu->inode || tu->consumer) + if (tu->consumer) return -EINTR; utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); @@ -568,7 +568,7 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) static void probe_event_disable(struct trace_uprobe *tu, int flag) { - if (!tu->inode || !tu->consumer) + if (!tu->consumer) return; uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); -- cgit v0.10.2 From b64b007797c1e6d6b745c93c296ba1d5f4d72d86 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 31 Jan 2013 19:15:30 +0100 Subject: uprobes/tracing: Introduce is_trace_uprobe_enabled() probe_event_enable/disable() check tu->consumer != NULL to avoid the wrong uprobe_register/unregister(). We are going to kill this pointer and "struct uprobe_trace_consumer", so we add the new helper, is_trace_uprobe_enabled(), which can rely on TP_FLAG_TRACE/TP_FLAG_PROFILE instead. Note: the current logic doesn't look optimal, it is not clear why TP_FLAG_TRACE/TP_FLAG_PROFILE are mutually exclusive, we will probably change this later. Also kill the unused TP_FLAG_UPROBE. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 9337086..5c7e09d 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -66,7 +66,6 @@ #define TP_FLAG_TRACE 1 #define TP_FLAG_PROFILE 2 #define TP_FLAG_REGISTERED 4 -#define TP_FLAG_UPROBE 8 /* data_rloc: data relative location, compatible with u32 */ diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index f7838cf..d6c6e2a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -539,12 +539,17 @@ partial: return TRACE_TYPE_PARTIAL_LINE; } +static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu) +{ + return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE); +} + static int probe_event_enable(struct trace_uprobe *tu, int flag) { struct uprobe_trace_consumer *utc; int ret = 0; - if (tu->consumer) + if (is_trace_uprobe_enabled(tu)) return -EINTR; utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); @@ -568,7 +573,7 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) static void probe_event_disable(struct trace_uprobe *tu, int flag) { - if (!tu->consumer) + if (!is_trace_uprobe_enabled(tu)) return; uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); -- cgit v0.10.2 From a932b7381f81235530c3d0acbd3ba2c7537d78e5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 31 Jan 2013 19:47:23 +0100 Subject: uprobes/tracing: Kill uprobe_trace_consumer, embed uprobe_consumer into trace_uprobe trace_uprobe->consumer and "struct uprobe_trace_consumer" add the unnecessary indirection and complicate the code for no reason. This patch simply embeds uprobe_consumer into "struct trace_uprobe", all other changes only fix the compilation errors. 
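Embedding buys more than one fewer allocation: uprobe_dispatcher() can now recover its trace_uprobe with a plain container_of(), and the defensive "tu->consumer != utc" check disappears together with the back-pointer it was guarding. In outline (fields beyond those shown in the hunks are elided):

struct trace_uprobe {
        struct list_head        list;
        struct ftrace_event_class class;
        struct ftrace_event_call call;
        struct uprobe_consumer  consumer;       /* embedded; handler set once in alloc_trace_uprobe() */
        struct inode            *inode;
        char                    *filename;
        unsigned long           offset;
        /* ... */
};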
Signed-off-by: Oleg Nesterov diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d6c6e2a..9c8babb 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -31,17 +31,11 @@ /* * uprobe event core functions */ -struct trace_uprobe; -struct uprobe_trace_consumer { - struct uprobe_consumer cons; - struct trace_uprobe *tu; -}; - struct trace_uprobe { struct list_head list; struct ftrace_event_class class; struct ftrace_event_call call; - struct uprobe_trace_consumer *consumer; + struct uprobe_consumer consumer; struct inode *inode; char *filename; unsigned long offset; @@ -92,6 +86,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs) goto error; INIT_LIST_HEAD(&tu->list); + tu->consumer.handler = uprobe_dispatcher; return tu; error: @@ -546,27 +541,15 @@ static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu) static int probe_event_enable(struct trace_uprobe *tu, int flag) { - struct uprobe_trace_consumer *utc; int ret = 0; if (is_trace_uprobe_enabled(tu)) return -EINTR; - utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); - if (!utc) - return -EINTR; - - utc->cons.handler = uprobe_dispatcher; - utc->tu = tu; - tu->consumer = utc; tu->flags |= flag; - - ret = uprobe_register(tu->inode, tu->offset, &utc->cons); - if (ret) { - tu->consumer = NULL; + ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); + if (ret) tu->flags &= ~flag; - kfree(utc); - } return ret; } @@ -576,10 +559,8 @@ static void probe_event_disable(struct trace_uprobe *tu, int flag) if (!is_trace_uprobe_enabled(tu)) return; - uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); + uprobe_unregister(tu->inode, tu->offset, &tu->consumer); tu->flags &= ~flag; - kfree(tu->consumer); - tu->consumer = NULL; } static int uprobe_event_define_fields(struct ftrace_event_call *event_call) @@ -717,13 +698,9 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) { - struct uprobe_trace_consumer *utc; struct trace_uprobe *tu; - utc = container_of(con, struct uprobe_trace_consumer, cons); - tu = utc->tu; - if (!tu || tu->consumer != utc) - return 0; + tu = container_of(con, struct trace_uprobe, consumer); if (tu->flags & TP_FLAG_TRACE) uprobe_trace_func(tu, regs); -- cgit v0.10.2 From 1b47aefd9b6bd439a4be43c47acd22987ac22db8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 31 Jan 2013 19:55:27 +0100 Subject: uprobes/perf: Always increment trace_uprobe->nhit Move tu->nhit++ from uprobe_trace_func() to uprobe_dispatcher(). ->nhit counts how many time we hit the breakpoint inserted by this uprobe, we do not want to loose this info if uprobe was enabled by sys_perf_event_open(). 
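With the counter moved, the dispatcher at this point in the series reads roughly as below (reassembled from the hunks, before the later UPROBE_HANDLER_REMOVE patch starts propagating return values):

static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
{
        struct trace_uprobe *tu;

        tu = container_of(con, struct trace_uprobe, consumer);
        tu->nhit++;     /* counted even if only perf (sys_perf_event_open) enabled the probe */

        if (tu->flags & TP_FLAG_TRACE)
                uprobe_trace_func(tu, regs);
#ifdef CONFIG_PERF_EVENTS
        if (tu->flags & TP_FLAG_PROFILE)
                uprobe_perf_func(tu, regs);
#endif
        return 0;
}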
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 9c8babb..c4e29e1 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -476,8 +476,6 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) unsigned long irq_flags; struct ftrace_event_call *call = &tu->call; - tu->nhit++; - local_save_flags(irq_flags); pc = preempt_count(); @@ -701,6 +699,7 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) struct trace_uprobe *tu; tu = container_of(con, struct trace_uprobe, consumer); + tu->nhit++; if (tu->flags & TP_FLAG_TRACE) uprobe_trace_func(tu, regs); -- cgit v0.10.2 From f22c1bb6b4706be3502b378cb14564449b15f983 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 2 Feb 2013 16:27:52 +0100 Subject: perf: Introduce hw_perf_event->tp_target and ->tp_list sys_perf_event_open()->perf_init_event(event) is called before find_get_context(event), this means that event->ctx == NULL when class->reg(TRACE_REG_PERF_REGISTER/OPEN) is called and thus it can't know if this event is per-task or system-wide. This patch adds hw_perf_event->tp_target for PERF_TYPE_TRACEPOINT, this is analogous to PERF_TYPE_BREAKPOINT/bp_target we already have. The patch also moves ->bp_target up so that it can overlap with the new member, this can help the compiler to generate the better code. trace_uprobe_register() will use it for prefiltering to avoid the unnecessary breakpoints in mm's we do not want to trace. ->tp_target doesn't have its own reference, but we can rely on the fact that either sys_perf_event_open() holds a reference, or it is equal to event->ctx->task. So this pointer is always valid until free_event(). Also add the "struct list_head tp_list" into this union. It is not strictly necessary, but it can simplify the next changes and we can add it for free. Signed-off-by: Oleg Nesterov diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 42adf01..e47ee46 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -135,16 +135,21 @@ struct hw_perf_event { struct { /* software */ struct hrtimer hrtimer; }; + struct { /* tracepoint */ + struct task_struct *tp_target; + /* for tp_event->class */ + struct list_head tp_list; + }; #ifdef CONFIG_HAVE_HW_BREAKPOINT struct { /* breakpoint */ - struct arch_hw_breakpoint info; - struct list_head bp_list; /* * Crufty hack to avoid the chicken and egg * problem hw_breakpoint has with context * creation and event initalization. */ struct task_struct *bp_target; + struct arch_hw_breakpoint info; + struct list_head bp_list; }; #endif }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 301079d..e2d4323 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6162,11 +6162,14 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (task) { event->attach_state = PERF_ATTACH_TASK; + + if (attr->type == PERF_TYPE_TRACEPOINT) + event->hw.tp_target = task; #ifdef CONFIG_HAVE_HW_BREAKPOINT /* * hw_breakpoint is a bit difficult here.. 
*/ - if (attr->type == PERF_TYPE_BREAKPOINT) + else if (attr->type == PERF_TYPE_BREAKPOINT) event->hw.bp_target = task; #endif } -- cgit v0.10.2 From bdf8647c44766590ed02f9a84a450a796558b753 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 3 Feb 2013 19:21:12 +0100 Subject: uprobes: Introduce uprobe_apply() Currently it is not possible to change the filtering constraints after uprobe_register(), so a consumer can not, say, start to trace a task/mm which was previously filtered out, or remove the no longer needed bp's. Introduce uprobe_apply() which simply does register_for_each_vma() again to consult uprobe_consumer->filter() and install/remove the breakpoints. The only complication is that register_for_each_vma() can no longer assume that uprobe->consumers should be consulter if is_register == T, so we change it to accept "struct uprobe_consumer *new" instead. Unlike uprobe_register(), uprobe_apply(true) doesn't do "unregister" if register_for_each_vma() fails, it is up to caller to handle the error. Note: we probably need to cleanup the current interface, it is strange that uprobe_apply/unregister need inode/offset. We should either change uprobe_register() to return "struct uprobe *", or add a private ->uprobe member in uprobe_consumer. And in the long term uprobe_apply() should take a single argument, uprobe or consumer, even "bool add" should go away. Signed-off-by: Oleg Nesterov diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 95d0002..02b83db 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -101,6 +101,7 @@ extern int __weak set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsign extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); extern bool __weak is_swbp_insn(uprobe_opcode_t *insn); extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); +extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern int uprobe_mmap(struct vm_area_struct *vma); extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end); @@ -124,6 +125,11 @@ uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) { return -ENOSYS; } +static inline int +uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool add) +{ + return -ENOSYS; +} static inline void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 221fc58..a567c8c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -733,8 +733,10 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) return curr; } -static int register_for_each_vma(struct uprobe *uprobe, bool is_register) +static int +register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) { + bool is_register = !!new; struct map_info *info; int err = 0; @@ -765,7 +767,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) if (is_register) { /* consult only the "caller", new consumer. 
*/ - if (consumer_filter(uprobe->consumers, + if (consumer_filter(new, UPROBE_FILTER_REGISTER, mm)) err = install_breakpoint(uprobe, mm, vma, info->vaddr); } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { @@ -788,7 +790,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc) { consumer_add(uprobe, uc); - return register_for_each_vma(uprobe, true); + return register_for_each_vma(uprobe, uc); } static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) @@ -798,7 +800,7 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u if (!consumer_del(uprobe, uc)) /* WARN? */ return; - err = register_for_each_vma(uprobe, false); + err = register_for_each_vma(uprobe, NULL); /* TODO : cant unregister? schedule a worker thread */ if (!uprobe->consumers && !err) delete_uprobe(uprobe); @@ -855,6 +857,35 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * EXPORT_SYMBOL_GPL(uprobe_register); /* + * uprobe_apply - unregister a already registered probe. + * @inode: the file in which the probe has to be removed. + * @offset: offset from the start of the file. + * @uc: consumer which wants to add more or remove some breakpoints + * @add: add or remove the breakpoints + */ +int uprobe_apply(struct inode *inode, loff_t offset, + struct uprobe_consumer *uc, bool add) +{ + struct uprobe *uprobe; + struct uprobe_consumer *con; + int ret = -ENOENT; + + uprobe = find_uprobe(inode, offset); + if (!uprobe) + return ret; + + down_write(&uprobe->register_rwsem); + for (con = uprobe->consumers; con && con != uc ; con = con->next) + ; + if (con) + ret = register_for_each_vma(uprobe, add ? uc : NULL); + up_write(&uprobe->register_rwsem); + put_uprobe(uprobe); + + return ret; +} + +/* * uprobe_unregister - unregister a already registered probe. * @inode: the file in which the probe has to be removed. * @offset: offset from the start of the file. -- cgit v0.10.2 From 736288ba5016e255869c26296014eeff649971c2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 3 Feb 2013 20:58:35 +0100 Subject: uprobes/perf: Teach trace_uprobe/perf code to track the active perf_event's Introduce "struct trace_uprobe_filter" which records the "active" perf_event's attached to ftrace_event_call. For the start we simply use list_head, we can optimize this later if needed. For example, we do not really need to record an event with ->parent != NULL, we can rely on parent->child_list. And we can certainly do some optimizations for the case when 2 events have the same ->tp_target or tp_target->mm. Change trace_uprobe_register() to process TRACE_REG_PERF_OPEN/CLOSE and add/del this perf_event to the list. We can probably avoid any locking, but lets start with the "obvioulsy correct" trace_uprobe_filter->rwlock which protects everything. 
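uprobe_apply(), added above, is what makes dynamic filtering useful: a consumer whose ->filter() answer has changed can ask the uprobe core to walk the vmas again and install or remove breakpoints accordingly. A hypothetical in-kernel consumer might drive it roughly like this; all demo_* names are illustrative, only uprobe_apply(), uprobe_consumer and enum uprobe_filter_ctx come from this series.

static struct mm_struct *demo_mm;       /* the single mm we currently want to trace */

static bool demo_filter(struct uprobe_consumer *uc, enum uprobe_filter_ctx ctx,
                        struct mm_struct *mm)
{
        return mm == demo_mm;           /* locking/ordering omitted for brevity */
}

static int demo_handler(struct uprobe_consumer *uc, struct pt_regs *regs)
{
        return 0;
}

static struct uprobe_consumer demo_consumer = {
        .handler = demo_handler,
        .filter  = demo_filter,
};

/*
 * Called when the set of interesting tasks changes.  inode/offset must be
 * the same values demo_consumer was registered with.  With add=true,
 * uprobe_apply() re-runs register_for_each_vma() and installs breakpoints
 * wherever demo_filter() now says yes; with add=false it removes the
 * breakpoints that no remaining consumer's filter still wants.
 */
static int demo_retarget(struct inode *inode, loff_t offset, struct mm_struct *mm)
{
        bool add = (mm != NULL);

        demo_mm = mm;
        return uprobe_apply(inode, offset, &demo_consumer, add);
}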
Signed-off-by: Oleg Nesterov diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c4e29e1..2a74a93 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -28,6 +28,12 @@ #define UPROBE_EVENT_SYSTEM "uprobes" +struct trace_uprobe_filter { + rwlock_t rwlock; + int nr_systemwide; + struct list_head perf_events; +}; + /* * uprobe event core functions */ @@ -35,6 +41,7 @@ struct trace_uprobe { struct list_head list; struct ftrace_event_class class; struct ftrace_event_call call; + struct trace_uprobe_filter filter; struct uprobe_consumer consumer; struct inode *inode; char *filename; @@ -58,6 +65,18 @@ static LIST_HEAD(uprobe_list); static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); +static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) +{ + rwlock_init(&filter->rwlock); + filter->nr_systemwide = 0; + INIT_LIST_HEAD(&filter->perf_events); +} + +static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter) +{ + return !filter->nr_systemwide && list_empty(&filter->perf_events); +} + /* * Allocate new trace_uprobe and initialize it (including uprobes). */ @@ -87,6 +106,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs) INIT_LIST_HEAD(&tu->list); tu->consumer.handler = uprobe_dispatcher; + init_trace_uprobe_filter(&tu->filter); return tu; error: @@ -544,6 +564,8 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) if (is_trace_uprobe_enabled(tu)) return -EINTR; + WARN_ON(!uprobe_filter_is_empty(&tu->filter)); + tu->flags |= flag; ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); if (ret) @@ -557,6 +579,8 @@ static void probe_event_disable(struct trace_uprobe *tu, int flag) if (!is_trace_uprobe_enabled(tu)) return; + WARN_ON(!uprobe_filter_is_empty(&tu->filter)); + uprobe_unregister(tu->inode, tu->offset, &tu->consumer); tu->flags &= ~flag; } @@ -632,6 +656,30 @@ static int set_print_fmt(struct trace_uprobe *tu) } #ifdef CONFIG_PERF_EVENTS +static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) +{ + write_lock(&tu->filter.rwlock); + if (event->hw.tp_target) + list_add(&event->hw.tp_list, &tu->filter.perf_events); + else + tu->filter.nr_systemwide++; + write_unlock(&tu->filter.rwlock); + + return 0; +} + +static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) +{ + write_lock(&tu->filter.rwlock); + if (event->hw.tp_target) + list_del(&event->hw.tp_list); + else + tu->filter.nr_systemwide--; + write_unlock(&tu->filter.rwlock); + + return 0; +} + /* uprobe profile handler */ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) { @@ -687,6 +735,13 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, case TRACE_REG_PERF_UNREGISTER: probe_event_disable(tu, TP_FLAG_PROFILE); return 0; + + case TRACE_REG_PERF_OPEN: + return uprobe_perf_open(tu, data); + + case TRACE_REG_PERF_CLOSE: + return uprobe_perf_close(tu, data); + #endif default: return 0; -- cgit v0.10.2 From 31ba334836c0ac0039084859f14a5b96858493dc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 4 Feb 2013 17:11:58 +0100 Subject: uprobes/perf: Teach trace_uprobe/perf code to pre-filter Finally implement uprobe_perf_filter() which checks ->nr_systemwide or ->perf_events to figure out whether we need to insert the breakpoint. uprobe_perf_open/close are changed to do uprobe_apply(true/false) when the new perf event comes or goes away. 
Note that currently this is very suboptimal: - uprobe_register() called by TRACE_REG_PERF_REGISTER becomes a heavy nop, consumer->filter() always returns F at this stage. As it was already discussed we need uprobe_register_only() to avoid the costly register_for_each_vma() when possible. - uprobe_apply() is oftenly overkill. Unless "nr_systemwide != 0" changes we need uprobe_apply_mm(), unapply_uprobe() is almost what we need. - uprobe_apply() can be simply avoided sometimes, see the next changes. Testing: # perf probe -x /lib/libc.so.6 syscall # perl -e 'syscall -1 while 1' & [1] 530 # perf record -e probe_libc:syscall perl -e 'syscall -1 for 1..10; sleep 1' # perf report --show-total-period 100.00% 10 perl libc-2.8.so [.] syscall Before this patch: # cat /sys/kernel/debug/tracing/uprobe_profile /lib/libc.so.6 syscall 79291 A huge ->nrhit == 79291 reflects the fact that the background process 530 constantly hits this breakpoint too, even if doesn't contribute to the output. After the patch: # cat /sys/kernel/debug/tracing/uprobe_profile /lib/libc.so.6 syscall 10 This shows that only the target process was punished by int3. Signed-off-by: Oleg Nesterov diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2a74a93..b7850f5 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -557,7 +557,12 @@ static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu) return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE); } -static int probe_event_enable(struct trace_uprobe *tu, int flag) +typedef bool (*filter_func_t)(struct uprobe_consumer *self, + enum uprobe_filter_ctx ctx, + struct mm_struct *mm); + +static int +probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter) { int ret = 0; @@ -567,6 +572,7 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) WARN_ON(!uprobe_filter_is_empty(&tu->filter)); tu->flags |= flag; + tu->consumer.filter = filter; ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); if (ret) tu->flags &= ~flag; @@ -656,6 +662,22 @@ static int set_print_fmt(struct trace_uprobe *tu) } #ifdef CONFIG_PERF_EVENTS +static bool +__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) +{ + struct perf_event *event; + + if (filter->nr_systemwide) + return true; + + list_for_each_entry(event, &filter->perf_events, hw.tp_list) { + if (event->hw.tp_target->mm == mm) + return true; + } + + return false; +} + static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) { write_lock(&tu->filter.rwlock); @@ -665,6 +687,8 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) tu->filter.nr_systemwide++; write_unlock(&tu->filter.rwlock); + uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + return 0; } @@ -677,9 +701,25 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) tu->filter.nr_systemwide--; write_unlock(&tu->filter.rwlock); + uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + return 0; } +static bool uprobe_perf_filter(struct uprobe_consumer *uc, + enum uprobe_filter_ctx ctx, struct mm_struct *mm) +{ + struct trace_uprobe *tu; + int ret; + + tu = container_of(uc, struct trace_uprobe, consumer); + read_lock(&tu->filter.rwlock); + ret = __uprobe_perf_filter(&tu->filter, mm); + read_unlock(&tu->filter.rwlock); + + return ret; +} + /* uprobe profile handler */ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) { @@ -722,7 +762,7 @@ int trace_uprobe_register(struct 
ftrace_event_call *event, enum trace_reg type, switch (type) { case TRACE_REG_REGISTER: - return probe_event_enable(tu, TP_FLAG_TRACE); + return probe_event_enable(tu, TP_FLAG_TRACE, NULL); case TRACE_REG_UNREGISTER: probe_event_disable(tu, TP_FLAG_TRACE); @@ -730,7 +770,7 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return probe_event_enable(tu, TP_FLAG_PROFILE); + return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter); case TRACE_REG_PERF_UNREGISTER: probe_event_disable(tu, TP_FLAG_PROFILE); -- cgit v0.10.2 From f42d24a1d20d2e72d1e5d48930f18b138dfad117 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 4 Feb 2013 17:48:34 +0100 Subject: uprobes/perf: Teach trace_uprobe/perf code to use UPROBE_HANDLER_REMOVE Change uprobe_trace_func() and uprobe_perf_func() to return "int". Change uprobe_dispatcher() to return "trace_ret | perf_ret" although this is not needed, currently TP_FLAG_TRACE/TP_FLAG_PROFILE are mutually exclusive. The only functional change is that uprobe_perf_func() checks the filtering too and returns UPROBE_HANDLER_REMOVE if nobody wants to trace current. Testing: # perf probe -x /lib/libc.so.6 syscall # perf record -e probe_libc:syscall -i perl -e 'fork; syscall -1 for 1..10; wait' # perf report --show-total-period 100.00% 10 perl libc-2.8.so [.] syscall Before this patch: # cat /sys/kernel/debug/tracing/uprobe_profile /lib/libc.so.6 syscall 20 A child process doesn't have a counter, but still it hits this breakoint "copied" by dup_mmap(). After the patch: # cat /sys/kernel/debug/tracing/uprobe_profile /lib/libc.so.6 syscall 11 The child process hits this int3 only once and does unapply_uprobe(). Signed-off-by: Oleg Nesterov diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index b7850f5..2399f14 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -486,7 +486,7 @@ static const struct file_operations uprobe_profile_ops = { }; /* uprobe handler */ -static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) +static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) { struct uprobe_trace_entry_head *entry; struct ring_buffer_event *event; @@ -504,7 +504,7 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) event = trace_current_buffer_lock_reserve(&buffer, call->event.type, size, irq_flags, pc); if (!event) - return; + return 0; entry = ring_buffer_event_data(event); entry->ip = instruction_pointer(task_pt_regs(current)); @@ -514,6 +514,8 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) if (!filter_current_check_discard(buffer, call, entry, event)) trace_buffer_unlock_commit(buffer, event, irq_flags, pc); + + return 0; } /* Event entry printers */ @@ -721,7 +723,7 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, } /* uprobe profile handler */ -static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) +static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) { struct ftrace_event_call *call = &tu->call; struct uprobe_trace_entry_head *entry; @@ -730,11 +732,14 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) int size, __size, i; int rctx; + if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) + return UPROBE_HANDLER_REMOVE; + __size = sizeof(*entry) + tu->size; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); if (WARN_ONCE(size 
> PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) - return; + return 0; preempt_disable(); @@ -752,6 +757,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) out: preempt_enable(); + return 0; } #endif /* CONFIG_PERF_EVENTS */ @@ -792,18 +798,19 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) { struct trace_uprobe *tu; + int ret = 0; tu = container_of(con, struct trace_uprobe, consumer); tu->nhit++; if (tu->flags & TP_FLAG_TRACE) - uprobe_trace_func(tu, regs); + ret |= uprobe_trace_func(tu, regs); #ifdef CONFIG_PERF_EVENTS if (tu->flags & TP_FLAG_PROFILE) - uprobe_perf_func(tu, regs); + ret |= uprobe_perf_func(tu, regs); #endif - return 0; + return ret; } static struct trace_event_functions uprobe_funcs = { -- cgit v0.10.2 From b2fe8ba674e8acbb9e8e63510b802c6d054d88a3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 4 Feb 2013 19:05:43 +0100 Subject: uprobes/perf: Avoid uprobe_apply() whenever possible uprobe_perf_open/close call the costly uprobe_apply() every time, we can avoid it if: - "nr_systemwide != 0" is not changed. - There is another process/thread with the same ->mm. - copy_proccess() does inherit_event(). dup_mmap() preserves the inserted breakpoints. - event->attr.enable_on_exec == T, we can rely on uprobe_mmap() called by exec/mmap paths. - tp_target is exiting. Only _close() checks PF_EXITING, I don't think TRACE_REG_PERF_OPEN can hit the dying task too often. Signed-off-by: Oleg Nesterov diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2399f14..8dad2a9 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -680,30 +680,60 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) return false; } +static inline bool +uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) +{ + return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); +} + static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) { + bool done; + write_lock(&tu->filter.rwlock); - if (event->hw.tp_target) + if (event->hw.tp_target) { + /* + * event->parent != NULL means copy_process(), we can avoid + * uprobe_apply(). current->mm must be probed and we can rely + * on dup_mmap() which preserves the already installed bp's. + * + * attr.enable_on_exec means that exec/mmap will install the + * breakpoints we need. 
+ */ + done = tu->filter.nr_systemwide || + event->parent || event->attr.enable_on_exec || + uprobe_filter_event(tu, event); list_add(&event->hw.tp_list, &tu->filter.perf_events); - else + } else { + done = tu->filter.nr_systemwide; tu->filter.nr_systemwide++; + } write_unlock(&tu->filter.rwlock); - uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + if (!done) + uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); return 0; } static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) { + bool done; + write_lock(&tu->filter.rwlock); - if (event->hw.tp_target) + if (event->hw.tp_target) { list_del(&event->hw.tp_list); - else + done = tu->filter.nr_systemwide || + (event->hw.tp_target->flags & PF_EXITING) || + uprobe_filter_event(tu, event); + } else { tu->filter.nr_systemwide--; + done = tu->filter.nr_systemwide; + } write_unlock(&tu->filter.rwlock); - uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + if (!done) + uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); return 0; } -- cgit v0.10.2
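The uprobe_perf_open() hunks end up split across the last two patches, so here is the resulting function pieced together for readability (a reconstruction, not a verbatim copy of the tree):

static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
{
        bool done;

        write_lock(&tu->filter.rwlock);
        if (event->hw.tp_target) {
                /*
                 * event->parent != NULL means copy_process(), we can avoid
                 * uprobe_apply(). current->mm must be probed and we can rely
                 * on dup_mmap() which preserves the already installed bp's.
                 *
                 * attr.enable_on_exec means that exec/mmap will install the
                 * breakpoints we need.
                 */
                done = tu->filter.nr_systemwide ||
                        event->parent || event->attr.enable_on_exec ||
                        uprobe_filter_event(tu, event);
                list_add(&event->hw.tp_list, &tu->filter.perf_events);
        } else {
                /* only the first system-wide event has to install breakpoints */
                done = tu->filter.nr_systemwide;
                tu->filter.nr_systemwide++;
        }
        write_unlock(&tu->filter.rwlock);

        if (!done)
                uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);

        return 0;
}

uprobe_perf_close() mirrors this: it skips uprobe_apply(..., false) when the probe is still system-wide, when the target task is already exiting (PF_EXITING), or when another tracked event keeps the same mm covered.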