From faa9560ae76ef50a3cbfb1a6afc0343fd8172374 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 18 Aug 2010 12:25:49 -0400 Subject: fanotify: do not dereference inode_mark when it is unset The fanotify code is supposed to get the group from the mark. It accidentally only used the inode_mark. If the vfsmount_mark was set but not the inode_mark it would deref the NULL inode_mark. Get the group from the correct place. Reported-by: Tvrtko Ursulin Signed-off-by: Eric Paris diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 3970392..f3e3b35 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -148,13 +148,14 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt, const unsigned char *file_name, struct fsnotify_event **event) { - struct fsnotify_group *group = inode_mark->group; + struct fsnotify_group *group = NULL; __u32 inode_test_mask = (mask & ~FS_EVENT_ON_CHILD); __u32 vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD); - pr_debug("%s: group=%p to_tell=%p mnt=%p mark=%p mask=%x data=%p" - " data_is=%d cookie=%d event=%p\n", __func__, group, to_tell, - mnt, inode_mark, mask, data, data_is, cookie, *event); + if (unlikely(!inode_mark && !vfsmount_mark)) { + BUG(); + return 0; + } /* clear ignored on inode modification */ if (mask & FS_MODIFY) { @@ -168,18 +169,24 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt, /* does the inode mark tell us to do something? */ if (inode_mark) { + group = inode_mark->group; inode_test_mask &= inode_mark->mask; inode_test_mask &= ~inode_mark->ignored_mask; } /* does the vfsmount_mark tell us to do something? 
*/ if (vfsmount_mark) { + group = vfsmount_mark->group; vfsmount_test_mask &= vfsmount_mark->mask; vfsmount_test_mask &= ~vfsmount_mark->ignored_mask; if (inode_mark) vfsmount_test_mask &= ~inode_mark->ignored_mask; } + pr_debug("%s: group=%p to_tell=%p mnt=%p mark=%p mask=%x data=%p" + " data_is=%d cookie=%d event=%p\n", __func__, group, to_tell, + mnt, inode_mark, mask, data, data_is, cookie, *event); + if (!inode_test_mask && !vfsmount_test_mask) return 0; -- cgit v0.10.2 From 5f3f259fa8f1d7969360acfad5307d03c2f53d63 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 18 Aug 2010 12:25:49 -0400 Subject: fsnotify: reset used_inode and used_vfsmount on each pass The fsnotify main loop has 2 booleans which tell if a particular mark was sent to the listeners or if it should be processed in the next pass. The problem is that the booleans were not reset on each traversal of the loop. So marks could get skipped even when they were not sent to the notifiers. Reported-by: Tvrtko Ursulin Signed-off-by: Eric Paris diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index f3e3b35..59dc7a0 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -220,7 +220,7 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, struct fsnotify_event *event = NULL; struct vfsmount *mnt; int idx, ret = 0; - bool used_inode = false, used_vfsmount = false; + bool used_inode, used_vfsmount; /* global tests shouldn't care about events on child only the specific event */ __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); @@ -261,6 +261,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, } while (inode_node || vfsmount_node) { + used_inode = used_vfsmount = false; + if (inode_node) { inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu), struct fsnotify_mark, i.i_list); -- cgit v0.10.2 From 88b2dbdbed551e4e21fdc8c56a15e198c52274e2 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 18 Aug 2010 12:25:50 -0400 Subject: 
fanotify: add MAINTAINERS entry add myself as the maintainer. Reported-by: Andy Gospodarek Signed-off-by: Eric Paris +S: Maintained +F: fs/notify/fanotify/ +F: include/linux/fanotify.h + FARSYNC SYNCHRONOUS DRIVER M: Kevin Curtis W: http://www.farsite.co.uk/ -- cgit v0.10.2 From 84e1ab4d875922c034db7f4f814ac445a20a14bd Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 18 Aug 2010 12:25:50 -0400 Subject: fsnotify: fix ignored mask handling between inode and vfsmount marks The interesting 2 list lockstep walking didn't quite work out if the inode marks only had ignores and the vfsmount list requested events. The code to shortcut list traversal would not run the inode list since it didn't have real event requests. This code forces inode list traversal when a vfsmount mark matches the event type. Maybe we could add an i_fsnotify_ignored_mask field to struct inode to get the shortcut back, but it doesn't seem worth it to grow struct inode again. I bet with the recent changes to lock the way we do now it would actually not be a major perf hit to just drop i_fsnotify_mark_mask altogether. But that is for another day. Signed-off-by: Eric Paris diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 59dc7a0..6f2777c 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -149,8 +149,8 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt, struct fsnotify_event **event) { struct fsnotify_group *group = NULL; - __u32 inode_test_mask = (mask & ~FS_EVENT_ON_CHILD); - __u32 vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD); + __u32 inode_test_mask = 0; + __u32 vfsmount_test_mask = 0; if (unlikely(!inode_mark && !vfsmount_mark)) { BUG(); @@ -170,12 +170,14 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt, /* does the inode mark tell us to do something? 
*/ if (inode_mark) { group = inode_mark->group; + inode_test_mask = (mask & ~FS_EVENT_ON_CHILD); inode_test_mask &= inode_mark->mask; inode_test_mask &= ~inode_mark->ignored_mask; } /* does the vfsmount_mark tell us to do something? */ if (vfsmount_mark) { + vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD); group = vfsmount_mark->group; vfsmount_test_mask &= vfsmount_mark->mask; vfsmount_test_mask &= ~vfsmount_mark->ignored_mask; @@ -183,9 +185,12 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt, vfsmount_test_mask &= ~inode_mark->ignored_mask; } - pr_debug("%s: group=%p to_tell=%p mnt=%p mark=%p mask=%x data=%p" - " data_is=%d cookie=%d event=%p\n", __func__, group, to_tell, - mnt, inode_mark, mask, data, data_is, cookie, *event); + pr_debug("%s: group=%p to_tell=%p mnt=%p mask=%x inode_mark=%p" + " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x" + " data=%p data_is=%d cookie=%d event=%p\n", + __func__, group, to_tell, mnt, mask, inode_mark, + inode_test_mask, vfsmount_mark, vfsmount_test_mask, data, + data_is, cookie, *event); if (!inode_test_mask && !vfsmount_test_mask) return 0; @@ -214,7 +219,7 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt, int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const unsigned char *file_name, u32 cookie) { - struct hlist_node *inode_node, *vfsmount_node; + struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; struct fsnotify_group *inode_group, *vfsmount_group; struct fsnotify_event *event = NULL; @@ -245,19 +250,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, (test_mask & to_tell->i_fsnotify_mask)) inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first, &fsnotify_mark_srcu); - else - inode_node = NULL; - if (mnt) { - if ((mask & FS_MODIFY) || - (test_mask & mnt->mnt_fsnotify_mask)) - vfsmount_node = 
srcu_dereference(mnt->mnt_fsnotify_marks.first, - &fsnotify_mark_srcu); - else - vfsmount_node = NULL; - } else { - mnt = NULL; - vfsmount_node = NULL; + if (mnt && ((mask & FS_MODIFY) || + (test_mask & mnt->mnt_fsnotify_mask))) { + vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first, + &fsnotify_mark_srcu); + inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first, + &fsnotify_mark_srcu); } while (inode_node || vfsmount_node) { -- cgit v0.10.2 From 2eebf582c9b3106abb9c33f4fc0a347fb9391037 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 18 Aug 2010 12:25:50 -0400 Subject: fanotify: flush outstanding perm requests on group destroy When an fanotify listener is closing it may cause a deadlock between the listener and the original task doing an fs operation. If the original task is waiting for a permissions response it will be holding the srcu lock. The listener cannot clean up and exit until after that srcu lock is synchronized. Thus deadlock. The fix introduced here is to stop accepting new permissions events when a listener is shutting down and to grant permission for all outstanding events. Thus the original task will eventually release the srcu lock and the listener can complete shutdown.
Reported-by: Andreas Gruenbacher Cc: Andreas Gruenbacher Signed-off-by: Eric Paris diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 032b837..b966b72 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -195,6 +195,14 @@ static int prepare_for_access_response(struct fsnotify_group *group, re->fd = fd; mutex_lock(&group->fanotify_data.access_mutex); + + if (group->fanotify_data.bypass_perm) { + mutex_unlock(&group->fanotify_data.access_mutex); + kmem_cache_free(fanotify_response_event_cache, re); + event->response = FAN_ALLOW; + return 0; + } + list_add_tail(&re->list, &group->fanotify_data.access_list); mutex_unlock(&group->fanotify_data.access_mutex); @@ -364,9 +372,28 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t static int fanotify_release(struct inode *ignored, struct file *file) { struct fsnotify_group *group = file->private_data; + struct fanotify_response_event *re, *lre; pr_debug("%s: file=%p group=%p\n", __func__, file, group); +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + mutex_lock(&group->fanotify_data.access_mutex); + + group->fanotify_data.bypass_perm = true; + + list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) { + pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group, + re, re->event); + + list_del_init(&re->list); + re->event->response = FAN_ALLOW; + + kmem_cache_free(fanotify_response_event_cache, re); + } + mutex_unlock(&group->fanotify_data.access_mutex); + + wake_up(&group->fanotify_data.access_waitq); +#endif /* matches the fanotify_init->fsnotify_alloc_group */ fsnotify_put_group(group); diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index f0949a5..9854356 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -95,11 +95,4 @@ struct fanotify_response { (long)(meta)->event_len >= (long)FAN_EVENT_METADATA_LEN && \ (long)(meta)->event_len <= (long)(len)) -#ifdef 
__KERNEL__ - -struct fanotify_wait { - struct fsnotify_event *event; - __s32 fd; -}; -#endif /* __KERNEL__ */ #endif /* _LINUX_FANOTIFY_H */ diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index ed36fb5..e40190d 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -156,6 +156,7 @@ struct fsnotify_group { struct mutex access_mutex; struct list_head access_list; wait_queue_head_t access_waitq; + bool bypass_perm; /* protected by access_mutex */ #endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */ int f_flags; } fanotify_data; -- cgit v0.10.2 From ff8d6e983185ce19fa92bb836eb52b589957be65 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Fri, 20 Aug 2010 10:24:18 +0100 Subject: fanotify: drop duplicate pr_debug statement This reminded me... you have two pr_debugs in fanotify_should_send_event which output redundant information. Maybe you intended it like that so it is selectable how much log spam you want, or if not you may want to apply this patch. 
Signed-off-by: Tvrtko Ursulin Signed-off-by: Eric Paris diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 756566f..85366c7 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -165,9 +165,6 @@ static bool fanotify_should_send_event(struct fsnotify_group *group, "mask=%x data=%p data_type=%d\n", __func__, group, to_tell, inode_mark, vfsmnt_mark, event_mask, data, data_type); - pr_debug("%s: group=%p vfsmount_mark=%p inode_mark=%p mask=%x\n", - __func__, group, vfsmnt_mark, inode_mark, event_mask); - /* sorry, fanotify only gives a damn about files and dirs */ if (!S_ISREG(to_tell->i_mode) && !S_ISDIR(to_tell->i_mode)) -- cgit v0.10.2 From 0fb85621df4f9f7c663c6c77c302e821a832c95e Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Fri, 20 Aug 2010 10:02:15 +0100 Subject: fanotify: resize pid and reorder structure resize pid and reorder the fanotify_event_metadata so it is naturally aligned and we can work towards dropping the packed attribute Signed-off-by: Tvrtko Ursulin Cc: Andreas Dilger Signed-off-by: Eric Paris diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 9854356..63531a6 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -65,14 +65,14 @@ FAN_ALL_PERM_EVENTS |\ FAN_Q_OVERFLOW) -#define FANOTIFY_METADATA_VERSION 1 +#define FANOTIFY_METADATA_VERSION 2 struct fanotify_event_metadata { __u32 event_len; __u32 vers; - __s32 fd; __u64 mask; - __s64 pid; + __s32 fd; + __s32 pid; } __attribute__ ((packed)); struct fanotify_response { -- cgit v0.10.2 From a2f13ad0ba5d94b9768c28469b45ca1e81a2b895 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 24 Aug 2010 12:58:54 +0200 Subject: fanotify: Return EPERM when a process is not privileged The appropriate error code when privileged operations are denied is EPERM, not EACCES.
Signed-off-by: Andreas Gruenbacher Signed-off-by: Eric Paris diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index b966b72..5ed8e58 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -641,7 +641,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) __func__, flags, event_f_flags); if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + return -EPERM; if (flags & ~FAN_ALL_INIT_FLAGS) return -EINVAL; -- cgit v0.10.2 From f72adfd540bacc4f6ff57a7d708b1a6c8906bdb4 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 27 Aug 2010 21:24:24 -0400 Subject: fsnotify: fix list walk order Marks were stored on the inode and vfsmount mark lists in order from highest memory address to lowest memory address. The code to walk those lists thought they were in order from lowest to highest with unpredictable results when trying to match up marks from each. It was possible that extra events would be sent to userspace when inode marks ignoring events wouldn't get matched with the vfsmount marks. This problem only affected fanotify when using both vfsmount and inode marks simultaneously.
Signed-off-by: Eric Paris diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 6f2777c..2169aa5 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -261,27 +261,26 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, while (inode_node || vfsmount_node) { used_inode = used_vfsmount = false; + inode_group = vfsmount_group = NULL; if (inode_node) { inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu), struct fsnotify_mark, i.i_list); inode_group = inode_mark->group; - } else - inode_group = (void *)-1; + } if (vfsmount_node) { vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu), struct fsnotify_mark, m.m_list); vfsmount_group = vfsmount_mark->group; - } else - vfsmount_group = (void *)-1; + } - if (inode_group < vfsmount_group) { + if (inode_group > vfsmount_group) { /* handle inode */ send_to_group(to_tell, NULL, inode_mark, NULL, mask, data, data_is, cookie, file_name, &event); used_inode = true; - } else if (vfsmount_group < inode_group) { + } else if (vfsmount_group > inode_group) { send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data, data_is, cookie, file_name, &event); used_vfsmount = true; -- cgit v0.10.2 From 92b4678efa8ce0de9b1e01a74e3d13c4002a4136 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 27 Aug 2010 21:42:11 -0400 Subject: fsnotify: drop two useless bools in the fsnotify main loop The fsnotify main loop has 2 bools which indicated if we processed the inode or vfsmount mark in that particular pass through the loop. These bools can be replaced with the inode_group and vfsmount_group variables and actually make the code a little easier to understand.
Signed-off-by: Eric Paris diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 2169aa5..3680242 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -225,7 +225,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, struct fsnotify_event *event = NULL; struct vfsmount *mnt; int idx, ret = 0; - bool used_inode, used_vfsmount; /* global tests shouldn't care about events on child only the specific event */ __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); @@ -260,7 +259,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, } while (inode_node || vfsmount_node) { - used_inode = used_vfsmount = false; inode_group = vfsmount_group = NULL; if (inode_node) { @@ -279,23 +277,22 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, /* handle inode */ send_to_group(to_tell, NULL, inode_mark, NULL, mask, data, data_is, cookie, file_name, &event); - used_inode = true; + /* we didn't use the vfsmount_mark */ + vfsmount_group = NULL; } else if (vfsmount_group > inode_group) { send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data, data_is, cookie, file_name, &event); - used_vfsmount = true; + inode_group = NULL; } else { send_to_group(to_tell, mnt, inode_mark, vfsmount_mark, mask, data, data_is, cookie, file_name, &event); - used_vfsmount = true; - used_inode = true; } - if (used_inode) + if (inode_group) inode_node = srcu_dereference(inode_node->next, &fsnotify_mark_srcu); - if (used_vfsmount) + if (vfsmount_group) vfsmount_node = srcu_dereference(vfsmount_node->next, &fsnotify_mark_srcu); } -- cgit v0.10.2