From e71b9dff0634edb127f449e076e883ef24a8c76c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 16 Sep 2016 12:44:20 +0200 Subject: ima: use file_dentry() Ima tries to call ->setxattr() on overlayfs dentry after having locked underlying inode, which results in a deadlock. Reported-by: Krisztian Litkey Fixes: 4bacc9c9234c ("overlayfs: Make f_path always point to the overlay and f_inode to the underlay") Signed-off-by: Miklos Szeredi Cc: # v4.2 Cc: Mimi Zohar diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c index 4b9b4a4..ef1e4e7 100644 --- a/security/integrity/ima/ima_appraise.c +++ b/security/integrity/ima/ima_appraise.c @@ -190,7 +190,7 @@ int ima_appraise_measurement(enum ima_hooks func, { static const char op[] = "appraise_data"; char *cause = "unknown"; - struct dentry *dentry = file->f_path.dentry; + struct dentry *dentry = file_dentry(file); struct inode *inode = d_backing_inode(dentry); enum integrity_status status = INTEGRITY_UNKNOWN; int rc = xattr_len, hash_start = 0; @@ -295,7 +295,7 @@ out: */ void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file) { - struct dentry *dentry = file->f_path.dentry; + struct dentry *dentry = file_dentry(file); int rc = 0; /* do not collect and update hash for digital signatures */ diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 596ef61..423d111 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -228,7 +228,7 @@ static int process_measurement(struct file *file, char *buf, loff_t size, if ((action & IMA_APPRAISE_SUBMASK) || strcmp(template_desc->name, IMA_TEMPLATE_IMA_NAME) != 0) /* read 'security.ima' */ - xattr_len = ima_read_xattr(file->f_path.dentry, &xattr_value); + xattr_len = ima_read_xattr(file_dentry(file), &xattr_value); hash_algo = ima_get_hash_algo(xattr_value, xattr_len); -- cgit v0.10.2 From f2b20f6ee842313a0d681dbbf7f87b70291a6a3b Mon Sep 17 00:00:00 2001 From: 
Miklos Szeredi Date: Fri, 16 Sep 2016 12:44:20 +0200 Subject: vfs: move permission checking into notify_change() for utimes(NULL) This fixes a bug where the permission was not properly checked in overlayfs. The testcase is ltp/utimensat01. It is also cleaner and safer to do the permission checking in the vfs helper instead of the caller. This patch introduces an additional ia_valid flag ATTR_TOUCH (since touch(1) is the most obvious user of utimes(NULL)) that is passed into notify_change whenever the conditions for this special permission checking mode are met. Reported-by: Aihua Zhang Signed-off-by: Miklos Szeredi Tested-by: Aihua Zhang Cc: # v3.18+ diff --git a/fs/attr.c b/fs/attr.c index 42bb42b..3c42cab 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -202,6 +202,21 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de return -EPERM; } + /* + * If utimes(2) and friends are called with times == NULL (or both + * times are UTIME_NOW), then we need to check for write permission + */ + if (ia_valid & ATTR_TOUCH) { + if (IS_IMMUTABLE(inode)) + return -EPERM; + + if (!inode_owner_or_capable(inode)) { + error = inode_permission(inode, MAY_WRITE); + if (error) + return error; + } + } + if ((ia_valid & ATTR_MODE)) { umode_t amode = attr->ia_mode; /* Flag setting protected by i_mutex */ diff --git a/fs/utimes.c b/fs/utimes.c index 794f5f5..ba54b9e 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -87,21 +87,7 @@ static int utimes_common(struct path *path, struct timespec *times) */ newattrs.ia_valid |= ATTR_TIMES_SET; } else { - /* - * If times is NULL (or both times are UTIME_NOW), - * then we need to check permissions, because - * inode_change_ok() won't do it. 
- */ - error = -EPERM; - if (IS_IMMUTABLE(inode)) - goto mnt_drop_write_and_out; - - error = -EACCES; - if (!inode_owner_or_capable(inode)) { - error = inode_permission(inode, MAY_WRITE); - if (error) - goto mnt_drop_write_and_out; - } + newattrs.ia_valid |= ATTR_TOUCH; } retry_deleg: inode_lock(inode); @@ -113,7 +99,6 @@ retry_deleg: goto retry_deleg; } -mnt_drop_write_and_out: mnt_drop_write(path->mnt); out: return error; diff --git a/include/linux/fs.h b/include/linux/fs.h index 901e25d..7c39136 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -224,6 +224,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define ATTR_KILL_PRIV (1 << 14) #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ #define ATTR_TIMES_SET (1 << 16) +#define ATTR_TOUCH (1 << 17) /* * Whiteout is represented by a char device. The following constants define the -- cgit v0.10.2 From 598e3c8f72f5b77c84d2cb26cfd936ffb3cfdbaa Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 16 Sep 2016 12:44:20 +0200 Subject: vfs: update ovl inode before relatime check On overlayfs relatime_need_update() needs inode times to be correct on overlay inode. But i_mtime and i_ctime are updated by filesystem code on underlying inode only, so they will be out-of-date on the overlay inode. This patch copies the times from the underlying inode if needed. This can't be done if called from RCU lookup (link following) but link m/ctime are not updated by fs, so this is all right. This patch doesn't change functionality for anything but overlayfs. 
Signed-off-by: Miklos Szeredi diff --git a/fs/inode.c b/fs/inode.c index 7e3ef3a..4a1fc16 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1536,16 +1536,36 @@ sector_t bmap(struct inode *inode, sector_t block) EXPORT_SYMBOL(bmap); /* + * Update times in overlayed inode from underlying real inode + */ +static void update_ovl_inode_times(struct dentry *dentry, struct inode *inode, + bool rcu) +{ + if (!rcu) { + struct inode *realinode = d_real_inode(dentry); + + if (unlikely(inode != realinode) && + (!timespec_equal(&inode->i_mtime, &realinode->i_mtime) || + !timespec_equal(&inode->i_ctime, &realinode->i_ctime))) { + inode->i_mtime = realinode->i_mtime; + inode->i_ctime = realinode->i_ctime; + } + } +} + +/* * With relative atime, only update atime if the previous atime is * earlier than either the ctime or mtime or if at least a day has * passed since the last atime update. */ -static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, - struct timespec now) +static int relatime_need_update(const struct path *path, struct inode *inode, + struct timespec now, bool rcu) { - if (!(mnt->mnt_flags & MNT_RELATIME)) + if (!(path->mnt->mnt_flags & MNT_RELATIME)) return 1; + + update_ovl_inode_times(path->dentry, inode, rcu); /* * Is mtime younger than atime? If yes, update atime: */ @@ -1612,7 +1632,8 @@ static int update_time(struct inode *inode, struct timespec *time, int flags) * This function automatically handles read only file systems and media, * as well as the "noatime" flag and inode specific "noatime" markers. 
*/ -bool atime_needs_update(const struct path *path, struct inode *inode) +bool __atime_needs_update(const struct path *path, struct inode *inode, + bool rcu) { struct vfsmount *mnt = path->mnt; struct timespec now; @@ -1638,7 +1659,7 @@ bool atime_needs_update(const struct path *path, struct inode *inode) now = current_fs_time(inode->i_sb); - if (!relatime_need_update(mnt, inode, now)) + if (!relatime_need_update(path, inode, now, rcu)) return false; if (timespec_equal(&inode->i_atime, &now)) @@ -1653,7 +1674,7 @@ void touch_atime(const struct path *path) struct inode *inode = d_inode(path->dentry); struct timespec now; - if (!atime_needs_update(path, inode)) + if (!__atime_needs_update(path, inode, false)) return; if (!sb_start_write_trylock(inode->i_sb)) diff --git a/fs/internal.h b/fs/internal.h index ba07376..a63da5e 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -120,6 +120,15 @@ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); extern void inode_add_lru(struct inode *inode); extern int dentry_needs_remove_privs(struct dentry *dentry); +extern bool __atime_needs_update(const struct path *, struct inode *, bool); +static inline bool atime_needs_update_rcu(const struct path *path, + struct inode *inode) +{ + return __atime_needs_update(path, inode, true); +} + +extern bool atime_needs_update_rcu(const struct path *, struct inode *); + /* * fs-writeback.c */ diff --git a/fs/namei.c b/fs/namei.c index adb0414..4bbcae1 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1015,7 +1015,7 @@ const char *get_link(struct nameidata *nd) if (!(nd->flags & LOOKUP_RCU)) { touch_atime(&last->link); cond_resched(); - } else if (atime_needs_update(&last->link, inode)) { + } else if (atime_needs_update_rcu(&last->link, inode)) { if (unlikely(unlazy_walk(nd, NULL, 0))) return ERR_PTR(-ECHILD); touch_atime(&last->link); diff --git a/include/linux/fs.h b/include/linux/fs.h index 7c39136..7db097d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ 
-2007,7 +2007,6 @@ enum file_time_flags { S_VERSION = 8, }; -extern bool atime_needs_update(const struct path *, struct inode *); extern void touch_atime(const struct path *); static inline void file_accessed(struct file *file) { -- cgit v0.10.2 From f3fbbb079263bd29ae592478de6808db7e708267 Mon Sep 17 00:00:00 2001 From: Aihua Zhang Date: Thu, 7 Jul 2016 15:37:53 +0800 Subject: fsnotify: support overlayfs When an event occurs direct it to the overlay inode instead of the real underlying inode. This will work even if the file was first on the lower layer and then copied up, while the watch is there. This is because the watch is on the overlay inode, which stays the same through the copy-up. For filesystems other than overlayfs this is a no-op, except for the performance impact of an extra pointer dereference. Verified to work correctly with the inotify/fanotify tests in LTP. Signed-off-by: Aihua Zhang Signed-off-by: Miklos Szeredi Cc: Jan Kara Cc: Eric Paris diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index eed9e85..b8bcc05 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -29,7 +29,11 @@ static inline int fsnotify_parent(struct path *path, struct dentry *dentry, __u3 static inline int fsnotify_perm(struct file *file, int mask) { struct path *path = &file->f_path; - struct inode *inode = file_inode(file); + /* + * Do not use file_inode() here or anywhere in this file to get the + * inode. That would break *notify on overlayfs. 
+ */ + struct inode *inode = path->dentry->d_inode; __u32 fsnotify_mask = 0; int ret; @@ -173,7 +177,7 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry) static inline void fsnotify_access(struct file *file) { struct path *path = &file->f_path; - struct inode *inode = file_inode(file); + struct inode *inode = path->dentry->d_inode; __u32 mask = FS_ACCESS; if (S_ISDIR(inode->i_mode)) @@ -191,7 +195,7 @@ static inline void fsnotify_access(struct file *file) static inline void fsnotify_modify(struct file *file) { struct path *path = &file->f_path; - struct inode *inode = file_inode(file); + struct inode *inode = path->dentry->d_inode; __u32 mask = FS_MODIFY; if (S_ISDIR(inode->i_mode)) @@ -209,7 +213,7 @@ static inline void fsnotify_modify(struct file *file) static inline void fsnotify_open(struct file *file) { struct path *path = &file->f_path; - struct inode *inode = file_inode(file); + struct inode *inode = path->dentry->d_inode; __u32 mask = FS_OPEN; if (S_ISDIR(inode->i_mode)) @@ -225,7 +229,7 @@ static inline void fsnotify_open(struct file *file) static inline void fsnotify_close(struct file *file) { struct path *path = &file->f_path; - struct inode *inode = file_inode(file); + struct inode *inode = path->dentry->d_inode; fmode_t mode = file->f_mode; __u32 mask = (mode & FMODE_WRITE) ? FS_CLOSE_WRITE : FS_CLOSE_NOWRITE; -- cgit v0.10.2 From c568d68341be7030f5647def68851e469b21ca11 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 16 Sep 2016 12:44:20 +0200 Subject: locks: fix file locking on overlayfs This patch allows flock, posix locks, ofd locks and leases to work correctly on overlayfs. Instead of using the underlying inode for storing lock context use the overlay inode. This allows locks to be persistent across copy-up. This is done by introducing locks_inode() helper and using it instead of file_inode() to get the inode in locking code. 
For non-overlayfs the two are equivalent, except for an extra pointer dereference in locks_inode(). Since lock operations are in "struct file_operations" we must also make sure not to call underlying filesystem's lock operations. Introduce a super block flag MS_NOREMOTELOCK to this effect. Signed-off-by: Miklos Szeredi Acked-by: Jeff Layton Cc: "J. Bruce Fields" diff --git a/fs/locks.c b/fs/locks.c index ee1b15f..c1656cf 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -139,6 +139,11 @@ #define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) #define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) +static inline bool is_remote_lock(struct file *filp) +{ + return likely(!(filp->f_path.dentry->d_sb->s_flags & MS_NOREMOTELOCK)); +} + static bool lease_breaking(struct file_lock *fl) { return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING); @@ -791,7 +796,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) { struct file_lock *cfl; struct file_lock_context *ctx; - struct inode *inode = file_inode(filp); + struct inode *inode = locks_inode(filp); ctx = smp_load_acquire(&inode->i_flctx); if (!ctx || list_empty_careful(&ctx->flc_posix)) { @@ -1192,7 +1197,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, int posix_lock_file(struct file *filp, struct file_lock *fl, struct file_lock *conflock) { - return posix_lock_inode(file_inode(filp), fl, conflock); + return posix_lock_inode(locks_inode(filp), fl, conflock); } EXPORT_SYMBOL(posix_lock_file); @@ -1232,7 +1237,7 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) int locks_mandatory_locked(struct file *file) { int ret; - struct inode *inode = file_inode(file); + struct inode *inode = locks_inode(file); struct file_lock_context *ctx; struct file_lock *fl; @@ -1572,7 +1577,7 @@ EXPORT_SYMBOL(lease_get_mtime); int fcntl_getlease(struct file *filp) { struct file_lock *fl; - struct inode *inode = file_inode(filp); + struct inode *inode = 
locks_inode(filp); struct file_lock_context *ctx; int type = F_UNLCK; LIST_HEAD(dispose); @@ -1580,7 +1585,7 @@ int fcntl_getlease(struct file *filp) ctx = smp_load_acquire(&inode->i_flctx); if (ctx && !list_empty_careful(&ctx->flc_lease)) { spin_lock(&ctx->flc_lock); - time_out_leases(file_inode(filp), &dispose); + time_out_leases(inode, &dispose); list_for_each_entry(fl, &ctx->flc_lease, fl_list) { if (fl->fl_file != filp) continue; @@ -1628,7 +1633,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr { struct file_lock *fl, *my_fl = NULL, *lease; struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = file_inode(filp); + struct inode *inode = dentry->d_inode; struct file_lock_context *ctx; bool is_deleg = (*flp)->fl_flags & FL_DELEG; int error; @@ -1742,7 +1747,7 @@ static int generic_delete_lease(struct file *filp, void *owner) { int error = -EAGAIN; struct file_lock *fl, *victim = NULL; - struct inode *inode = file_inode(filp); + struct inode *inode = locks_inode(filp); struct file_lock_context *ctx; LIST_HEAD(dispose); @@ -1782,7 +1787,7 @@ static int generic_delete_lease(struct file *filp, void *owner) int generic_setlease(struct file *filp, long arg, struct file_lock **flp, void **priv) { - struct inode *inode = file_inode(filp); + struct inode *inode = locks_inode(filp); int error; if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE)) @@ -1830,7 +1835,7 @@ EXPORT_SYMBOL(generic_setlease); int vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv) { - if (filp->f_op->setlease) + if (filp->f_op->setlease && is_remote_lock(filp)) return filp->f_op->setlease(filp, arg, lease, priv); else return generic_setlease(filp, arg, lease, priv); @@ -1979,7 +1984,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) if (error) goto out_free; - if (f.file->f_op->flock) + if (f.file->f_op->flock && is_remote_lock(f.file)) error = f.file->f_op->flock(f.file, (can_sleep) ? 
F_SETLKW : F_SETLK, lock); @@ -2005,7 +2010,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) */ int vfs_test_lock(struct file *filp, struct file_lock *fl) { - if (filp->f_op->lock) + if (filp->f_op->lock && is_remote_lock(filp)) return filp->f_op->lock(filp, F_GETLK, fl); posix_test_lock(filp, fl); return 0; @@ -2129,7 +2134,7 @@ out: */ int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) { - if (filp->f_op->lock) + if (filp->f_op->lock && is_remote_lock(filp)) return filp->f_op->lock(filp, cmd, fl); else return posix_lock_file(filp, fl, conf); @@ -2191,7 +2196,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, if (file_lock == NULL) return -ENOLCK; - inode = file_inode(filp); + inode = locks_inode(filp); /* * This might block, so we do it before checking the inode. @@ -2343,7 +2348,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, if (copy_from_user(&flock, l, sizeof(flock))) goto out; - inode = file_inode(filp); + inode = locks_inode(filp); /* Don't allow mandatory locks on files that may be memory mapped * and shared. @@ -2426,6 +2431,7 @@ out: void locks_remove_posix(struct file *filp, fl_owner_t owner) { int error; + struct inode *inode = locks_inode(filp); struct file_lock lock; struct file_lock_context *ctx; @@ -2434,7 +2440,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) * posix_lock_file(). Another process could be setting a lock on this * file at the same time, but we wouldn't remove that lock anyway. 
*/ - ctx = smp_load_acquire(&file_inode(filp)->i_flctx); + ctx = smp_load_acquire(&inode->i_flctx); if (!ctx || list_empty(&ctx->flc_posix)) return; @@ -2452,7 +2458,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) if (lock.fl_ops && lock.fl_ops->fl_release_private) lock.fl_ops->fl_release_private(&lock); - trace_locks_remove_posix(file_inode(filp), &lock, error); + trace_locks_remove_posix(inode, &lock, error); } EXPORT_SYMBOL(locks_remove_posix); @@ -2469,12 +2475,12 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx) .fl_type = F_UNLCK, .fl_end = OFFSET_MAX, }; - struct inode *inode = file_inode(filp); + struct inode *inode = locks_inode(filp); if (list_empty(&flctx->flc_flock)) return; - if (filp->f_op->flock) + if (filp->f_op->flock && is_remote_lock(filp)) filp->f_op->flock(filp, F_SETLKW, &fl); else flock_lock_inode(inode, &fl); @@ -2508,7 +2514,7 @@ void locks_remove_file(struct file *filp) { struct file_lock_context *ctx; - ctx = smp_load_acquire(&file_inode(filp)->i_flctx); + ctx = smp_load_acquire(&locks_inode(filp)->i_flctx); if (!ctx) return; @@ -2552,7 +2558,7 @@ EXPORT_SYMBOL(posix_unblock_lock); */ int vfs_cancel_lock(struct file *filp, struct file_lock *fl) { - if (filp->f_op->lock) + if (filp->f_op->lock && is_remote_lock(filp)) return filp->f_op->lock(filp, F_CANCELLK, fl); return 0; } @@ -2580,7 +2586,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, fl_pid = fl->fl_pid; if (fl->fl_file != NULL) - inode = file_inode(fl->fl_file); + inode = locks_inode(fl->fl_file); seq_printf(f, "%lld:%s ", id, pfx); if (IS_POSIX(fl)) { @@ -2682,7 +2688,7 @@ static void __show_fd_locks(struct seq_file *f, void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files) { - struct inode *inode = file_inode(filp); + struct inode *inode = locks_inode(filp); struct file_lock_context *ctx; int id = 0; diff --git a/fs/namespace.c b/fs/namespace.c index 7bb2cda..dcd9afe 100644 --- 
a/fs/namespace.c +++ b/fs/namespace.c @@ -2700,7 +2700,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | - MS_STRICTATIME); + MS_STRICTATIME | MS_NOREMOTELOCK); if (flags & MS_REMOUNT) retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, diff --git a/fs/open.c b/fs/open.c index 4fd6e25..648fb9d 100644 --- a/fs/open.c +++ b/fs/open.c @@ -726,7 +726,7 @@ static int do_dentry_open(struct file *f, if (error) goto cleanup_all; - error = break_lease(inode, f->f_flags); + error = break_lease(locks_inode(f), f->f_flags); if (error) goto cleanup_all; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index e2a94a2..3d0b9de 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1320,7 +1320,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_xattr = ovl_xattr_handlers; sb->s_root = root_dentry; sb->s_fs_info = ufs; - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= MS_POSIXACL | MS_NOREMOTELOCK; return 0; diff --git a/include/linux/fs.h b/include/linux/fs.h index 7db097d..8ee0f01 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1065,6 +1065,18 @@ struct file_lock_context { extern void send_sigio(struct fown_struct *fown, int fd, int band); +/* + * Return the inode to use for locking + * + * For overlayfs this should be the overlay inode, not the real inode returned + * by file_inode(). For any other fs file_inode(filp) and locks_inode(filp) are + * equal. 
+ */ +static inline struct inode *locks_inode(const struct file *f) +{ + return f->f_path.dentry->d_inode; +} + #ifdef CONFIG_FILE_LOCKING extern int fcntl_getlk(struct file *, unsigned int, struct flock __user *); extern int fcntl_setlk(unsigned int, struct file *, unsigned int, @@ -1252,7 +1264,7 @@ static inline struct dentry *file_dentry(const struct file *file) static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl) { - return locks_lock_inode_wait(file_inode(filp), fl); + return locks_lock_inode_wait(locks_inode(filp), fl); } struct fasync_struct { @@ -2155,7 +2167,7 @@ static inline int mandatory_lock(struct inode *ino) static inline int locks_verify_locked(struct file *file) { - if (mandatory_lock(file_inode(file))) + if (mandatory_lock(locks_inode(file))) return locks_mandatory_locked(file); return 0; } diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 3b00f7c..2473272 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -132,6 +132,7 @@ struct inodes_stat_t { #define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */ /* These sb flags are internal to the kernel */ +#define MS_NOREMOTELOCK (1<<27) #define MS_NOSEC (1<<28) #define MS_BORN (1<<29) #define MS_ACTIVE (1<<30) -- cgit v0.10.2 From 7b1742eb06ead6d02a6cf3c44587088e5392d1aa Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 16 Sep 2016 12:44:20 +0200 Subject: vfs: make argument of d_real_inode() const d_op->d_real() leaves the dentry alone except if the third argument is non-zero. Unfortunately very difficult to explain to the compiler without a cast. Signed-off-by: Miklos Szeredi Acked-by: Jeff Layton diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 5ff3e9a..5beed7b 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -584,9 +584,10 @@ static inline struct dentry *d_real(struct dentry *dentry, * If dentry is on an union/overlay, then return the underlying, real inode. 
* Otherwise return d_inode(). */ -static inline struct inode *d_real_inode(struct dentry *dentry) +static inline struct inode *d_real_inode(const struct dentry *dentry) { - return d_backing_inode(d_real(dentry, NULL, 0)); + /* This usage of d_real() results in const dentry */ + return d_backing_inode(d_real((struct dentry *) dentry, NULL, 0)); } -- cgit v0.10.2 From 4d0c5ba2ff79ef9f5188998b29fd28fcb05f3667 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 16 Sep 2016 12:44:21 +0200 Subject: vfs: do get_write_access() on upper layer of overlayfs The problem with writecount is: we want consistent handling of it for underlying filesystems as well as overlayfs. Making sure i_writecount is correct on all layers is difficult. Instead this patch makes sure that when write access is acquired, it's always done on the underlying writable layer (called the upper layer). We must also make sure to look at the writecount on this layer when checking for conflicting leases. Open for write already updates the upper layer's writecount. Leaving only truncate. For truncate copy up must happen before get_write_access() so that the writecount is updated on the upper layer. Problem with this is if something fails after that, then copy-up was done needlessly. E.g. if break_lease() was interrupted. Probably not a big deal in practice. Another interesting case is if there's a denywrite on a lower file that is then opened for write or truncated. With this patch these will succeed, which is somewhat counterintuitive. But I think it's still acceptable, considering that the copy-up does actually create a different file, so the old, denywrite mapping won't be touched. On non-overlayfs d_real() is an identity function and d_real_inode() is equivalent to d_inode() so this patch doesn't change behavior in that case. Signed-off-by: Miklos Szeredi Acked-by: Jeff Layton Cc: "J. 
Bruce Fields" diff --git a/fs/locks.c b/fs/locks.c index c1656cf..b242d5b 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1618,7 +1618,8 @@ check_conflicting_open(const struct dentry *dentry, const long arg, int flags) if (flags & FL_LAYOUT) return 0; - if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) + if ((arg == F_RDLCK) && + (atomic_read(&d_real_inode(dentry)->i_writecount) > 0)) return -EAGAIN; if ((arg == F_WRLCK) && ((d_count(dentry) > 1) || diff --git a/fs/open.c b/fs/open.c index 648fb9d..8aeb08b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -68,6 +68,7 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, long vfs_truncate(const struct path *path, loff_t length) { struct inode *inode; + struct dentry *upperdentry; long error; inode = path->dentry->d_inode; @@ -90,7 +91,17 @@ long vfs_truncate(const struct path *path, loff_t length) if (IS_APPEND(inode)) goto mnt_drop_write_and_out; - error = get_write_access(inode); + /* + * If this is an overlayfs then do as if opening the file so we get + * write access on the upper inode, not on the overlay inode. For + * non-overlay filesystems d_real() is an identity function. 
+ */ + upperdentry = d_real(path->dentry, NULL, O_WRONLY); + error = PTR_ERR(upperdentry); + if (IS_ERR(upperdentry)) + goto mnt_drop_write_and_out; + + error = get_write_access(upperdentry->d_inode); if (error) goto mnt_drop_write_and_out; @@ -109,7 +120,7 @@ long vfs_truncate(const struct path *path, loff_t length) error = do_truncate(path->dentry, length, 0, NULL); put_write_and_out: - put_write_access(inode); + put_write_access(upperdentry->d_inode); mnt_drop_write_and_out: mnt_drop_write(path->mnt); out: -- cgit v0.10.2 From f0312210010bf063c29efe112b0d9accbc9191b3 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 16 Sep 2016 12:44:21 +0200 Subject: btrfs: use filemap_check_errors() Signed-off-by: Miklos Szeredi Reviewed-by: Omar Sandoval Cc: Chris Mason diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 33fe035..e62fd50 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3161,7 +3161,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); -int btrfs_inode_check_errors(struct inode *inode); extern const struct dentry_operations btrfs_dentry_operations; #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_inode_set_ops(struct inode *inode); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index fea31a4..4843cb9 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2040,7 +2040,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * flags for any errors that might have happened while doing * writeback of file data. 
*/ - ret = btrfs_inode_check_errors(inode); + ret = filemap_check_errors(inode->i_mapping); inode_unlock(inode); goto out; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e6811c4..0207622 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10543,21 +10543,6 @@ out_inode: } -/* Inspired by filemap_check_errors() */ -int btrfs_inode_check_errors(struct inode *inode) -{ - int ret = 0; - - if (test_bit(AS_ENOSPC, &inode->i_mapping->flags) && - test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags)) - ret = -ENOSPC; - if (test_bit(AS_EIO, &inode->i_mapping->flags) && - test_and_clear_bit(AS_EIO, &inode->i_mapping->flags)) - ret = -EIO; - - return ret; -} - static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ef9c55b..8a84ebd 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3961,7 +3961,7 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans, * i_mapping flags, so that the next fsync won't get * an outdated io error too. */ - btrfs_inode_check_errors(inode); + filemap_check_errors(inode->i_mapping); *ordered_io_error = true; break; } @@ -4198,7 +4198,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, * without writing to the log tree and the fsync must report the * file data write error and not commit the current transaction. 
*/ - ret = btrfs_inode_check_errors(inode); + ret = filemap_check_errors(inode->i_mapping); if (ret) ctx->io_err = ret; process: -- cgit v0.10.2 From 280db3c88c5ff03c2554d1503451352fde8a2cf3 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 16 Sep 2016 12:44:21 +0200 Subject: f2fs: use filemap_check_errors() Signed-off-by: Miklos Szeredi Cc: Jaegeuk Kim diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index f75d197..67ed219 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1513,7 +1513,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) { pgoff_t index = 0, end = ULONG_MAX; struct pagevec pvec; - int ret2 = 0, ret = 0; + int ret2, ret = 0; pagevec_init(&pvec, 0); @@ -1542,10 +1542,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) cond_resched(); } - if (unlikely(test_and_clear_bit(AS_ENOSPC, &NODE_MAPPING(sbi)->flags))) - ret2 = -ENOSPC; - if (unlikely(test_and_clear_bit(AS_EIO, &NODE_MAPPING(sbi)->flags))) - ret2 = -EIO; + ret2 = filemap_check_errors(NODE_MAPPING(sbi)); if (!ret) ret = ret2; return ret; -- cgit v0.10.2 From beaf226b863a0bea28f2a6985555401450e968b2 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 16 Sep 2016 12:44:21 +0200 Subject: posix_acl: don't ignore return value of posix_acl_create_masq() Signed-off-by: Miklos Szeredi Cc: Andreas Gruenbacher diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 59d47ab0..ea3eb6f 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -598,13 +598,14 @@ posix_acl_create(struct inode *dir, umode_t *mode, if (IS_ERR(p)) return PTR_ERR(p); + ret = -ENOMEM; clone = posix_acl_clone(p, GFP_NOFS); if (!clone) - goto no_mem; + goto err_release; ret = posix_acl_create_masq(clone, mode); if (ret < 0) - goto no_mem_clone; + goto err_release_clone; if (ret == 0) posix_acl_release(clone); @@ -618,11 +619,11 @@ posix_acl_create(struct inode *dir, umode_t *mode, return 0; -no_mem_clone: +err_release_clone: posix_acl_release(clone); -no_mem: +err_release: 
posix_acl_release(p); - return -ENOMEM; + return ret; } EXPORT_SYMBOL_GPL(posix_acl_create); -- cgit v0.10.2 From a00be0e31f8df453ecbaaa4ba78d2ef935ab252e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 16 Sep 2016 12:44:21 +0200 Subject: cifs: don't use ->d_time Use d_fsdata instead, which is the same size. Introduce helpers to hide the typecasts. Signed-off-by: Miklos Szeredi Cc: Steve French diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 9dcf974..c9c00a8 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -41,6 +41,16 @@ cifs_uniqueid_to_ino_t(u64 fileid) } +static inline void cifs_set_time(struct dentry *dentry, unsigned long time) +{ + dentry->d_fsdata = (void *) time; +} + +static inline unsigned long cifs_get_time(struct dentry *dentry) +{ + return (unsigned long) dentry->d_fsdata; +} + extern struct file_system_type cifs_fs_type; extern const struct address_space_operations cifs_addr_ops; extern const struct address_space_operations cifs_addr_ops_smallbuf; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 4716c54..789ff1d 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -40,7 +40,7 @@ renew_parental_timestamps(struct dentry *direntry) /* BB check if there is a way to get the kernel to do this or if we really need this */ do { - direntry->d_time = jiffies; + cifs_set_time(direntry, jiffies); direntry = direntry->d_parent; } while (!IS_ROOT(direntry)); } @@ -802,7 +802,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, } else if (rc == -ENOENT) { rc = 0; - direntry->d_time = jiffies; + cifs_set_time(direntry, jiffies); d_add(direntry, NULL); /* if it was once a directory (but how can we tell?) 
we could do shrink_dcache_parent(direntry); */ @@ -862,7 +862,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags) if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) return 0; - if (time_after(jiffies, direntry->d_time + HZ) || !lookupCacheEnabled) + if (time_after(jiffies, cifs_get_time(direntry) + HZ) || !lookupCacheEnabled) return 0; return 1; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index b87efd0..0b4a355 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1951,7 +1951,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry) cifs_dbg(FYI, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time %ld jiffies %ld\n", full_path, inode, inode->i_count.counter, - dentry, dentry->d_time, jiffies); + dentry, cifs_get_time(dentry), jiffies); if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext) rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid); -- cgit v0.10.2 From 814184fd402557f3e5960db469157ccdf1fb69da Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 16 Sep 2016 12:44:21 +0200 Subject: vfat: don't use ->d_time Use d_fsdata instead, which is the same size. Introduce helpers to hide the typecasts. Signed-off-by: Miklos Szeredi Cc: OGAWA Hirofumi diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 92b7363..4afdc3f 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -21,6 +21,17 @@ #include #include "fat.h" +static inline unsigned long vfat_d_version(struct dentry *dentry) +{ + return (unsigned long) dentry->d_fsdata; +} + +static inline void vfat_d_version_set(struct dentry *dentry, + unsigned long version) +{ + dentry->d_fsdata = (void *) version; +} + /* * If new entry was created in the parent, it could create the 8.3 * alias (the shortname of logname). 
So, the parent may have the @@ -33,7 +44,7 @@ static int vfat_revalidate_shortname(struct dentry *dentry) { int ret = 1; spin_lock(&dentry->d_lock); - if (dentry->d_time != d_inode(dentry->d_parent)->i_version) + if (vfat_d_version(dentry) != d_inode(dentry->d_parent)->i_version) ret = 0; spin_unlock(&dentry->d_lock); return ret; @@ -759,7 +770,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, out: mutex_unlock(&MSDOS_SB(sb)->s_lock); if (!inode) - dentry->d_time = dir->i_version; + vfat_d_version_set(dentry, dir->i_version); return d_splice_alias(inode, dentry); error: mutex_unlock(&MSDOS_SB(sb)->s_lock); @@ -823,7 +834,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry) clear_nlink(inode); inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; fat_detach(inode); - dentry->d_time = dir->i_version; + vfat_d_version_set(dentry, dir->i_version); out: mutex_unlock(&MSDOS_SB(sb)->s_lock); @@ -849,7 +860,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry) clear_nlink(inode); inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; fat_detach(inode); - dentry->d_time = dir->i_version; + vfat_d_version_set(dentry, dir->i_version); out: mutex_unlock(&MSDOS_SB(sb)->s_lock); -- cgit v0.10.2