diff options
Diffstat (limited to 'fs')
533 files changed, 22556 insertions, 8902 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 5b6a174..b3c2cc7 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -276,32 +276,26 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, switch (handler->flags) { case ACL_TYPE_ACCESS: if (acl) { - umode_t mode = inode->i_mode; - retval = posix_acl_equiv_mode(acl, &mode); - if (retval < 0) + struct iattr iattr; + + retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl); + if (retval) goto err_out; - else { - struct iattr iattr; - if (retval == 0) { - /* - * ACL can be represented - * by the mode bits. So don't - * update ACL. - */ - acl = NULL; - value = NULL; - size = 0; - } - /* Updte the mode bits */ - iattr.ia_mode = ((mode & S_IALLUGO) | - (inode->i_mode & ~S_IALLUGO)); - iattr.ia_valid = ATTR_MODE; - /* FIXME should we update ctime ? - * What is the following setxattr update the - * mode ? + if (!acl) { + /* + * ACL can be represented + * by the mode bits. So don't + * update ACL. */ - v9fs_vfs_setattr_dotl(dentry, &iattr); + value = NULL; + size = 0; } + iattr.ia_valid = ATTR_MODE; + /* FIXME should we update ctime ? + * What is the following setxattr update the + * mode ? + */ + v9fs_vfs_setattr_dotl(dentry, &iattr); } break; case ACL_TYPE_DEFAULT: diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 6877050..443d12e 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -148,7 +148,8 @@ extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d); extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d); extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry); + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags); extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb, int new); diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 8b1999b..30ca770 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -276,7 +276,7 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, inode_init_owner(inode, NULL, mode); inode->i_blocks = 0; inode->i_rdev = rdev; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inode->i_mapping->a_ops = &v9fs_addr_operations; switch (mode & S_IFMT) { @@ -955,7 +955,8 @@ int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { int retval; struct inode *old_inode; @@ -966,6 +967,9 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct p9_fid *newdirfid; struct p9_wstat wstat; + if (flags) + return -EINVAL; + p9_debug(P9_DEBUG_VFS, "\n"); retval = 0; old_inode = d_inode(old_dentry); @@ -1094,7 +1098,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) struct p9_wstat wstat; p9_debug(P9_DEBUG_VFS, "\n"); - retval = inode_change_ok(d_inode(dentry), iattr); + retval = setattr_prepare(dentry, iattr); if (retval) return retval; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index eeabcb0..afaa4b6 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -558,7 +558,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) p9_debug(P9_DEBUG_VFS, "\n"); - retval = inode_change_ok(inode, iattr); + retval = setattr_prepare(dentry, iattr); if (retval) return retval; @@ -967,9 +967,6 @@ const struct inode_operations v9fs_dir_inode_operations_dotl = { .rename = v9fs_vfs_rename, .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .removexattr = generic_removexattr, .listxattr = v9fs_listxattr, .get_acl = v9fs_iop_get_acl, }; @@ -977,9 +974,6 @@ const struct inode_operations v9fs_dir_inode_operations_dotl = { const struct inode_operations v9fs_file_inode_operations_dotl = { .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .removexattr = generic_removexattr, .listxattr = v9fs_listxattr, .get_acl = v9fs_iop_get_acl, }; @@ -989,8 +983,5 @@ const struct inode_operations v9fs_symlink_inode_operations_dotl = { .get_link = v9fs_vfs_get_link_dotl, .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .removexattr = generic_removexattr, .listxattr = v9fs_listxattr, }; @@ -79,6 +79,7 @@ config EXPORTFS_BLOCK_OPS config FILE_LOCKING bool "Enable POSIX file locking API" if EXPERT default y + select PERCPU_RWSEM help This option enables standard file locking support, required for filesystems like NFS and for the flock() system @@ -199,6 +200,9 @@ config HUGETLBFS config HUGETLB_PAGE def_bool HUGETLBFS +config ARCH_HAS_GIGANTIC_PAGE + bool + source "fs/configfs/Kconfig" source "fs/efivarfs/Kconfig" diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index c7efddf..4c09d93 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -89,7 +89,7 @@ config BINFMT_SCRIPT config BINFMT_FLAT bool "Kernel support for flat binaries" - depends on !MMU || M68K + depends on !MMU || ARM || M68K depends on !FRV || BROKEN help Support uClinux FLAT format binaries. diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 335055d..8dbd36f 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -199,7 +199,7 @@ adfs_adfs2unix_time(struct timespec *tv, struct inode *inode) return; cur_time: - *tv = CURRENT_TIME; + *tv = current_time(inode); return; too_early: @@ -303,7 +303,7 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr) unsigned int ia_valid = attr->ia_valid; int error; - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); /* * we can't change the UID or GID of any file - diff --git a/fs/affs/affs.h b/fs/affs/affs.h index cc2b2ef..2f08877 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -173,7 +173,8 @@ extern int affs_link(struct dentry *olddentry, struct inode *dir, extern int affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname); extern int affs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry); + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags); /* inode.c */ diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index d8f217c..0ec65c1 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -58,7 +58,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh) mark_buffer_dirty_inode(dir_bh, dir); affs_brelse(dir_bh); - dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + dir->i_mtime = dir->i_ctime = current_time(dir); dir->i_version++; mark_inode_dirty(dir); @@ -112,7 +112,7 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh) affs_brelse(bh); - dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + dir->i_mtime = dir->i_ctime = current_time(dir); dir->i_version++; mark_inode_dirty(dir); @@ -313,7 +313,7 @@ affs_remove_header(struct dentry *dentry) else clear_nlink(inode); affs_unlock_link(inode); - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); mark_inode_dirty(inode); done: diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 0fdb0f5..fe4e129 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -219,7 +219,7 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr) pr_debug("notify_change(%lu,0x%x)\n", inode->i_ino, attr->ia_valid); - error = inode_change_ok(inode,attr); + error = setattr_prepare(dentry, attr); if (error) goto out; @@ -309,7 +309,7 @@ affs_new_inode(struct inode *dir) inode->i_gid = current_fsgid(); inode->i_ino = block; set_nlink(inode, 1); - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); atomic_set(&AFFS_I(inode)->i_opencnt, 0); AFFS_I(inode)->i_blkcnt = 0; AFFS_I(inode)->i_lc = NULL; diff --git a/fs/affs/namei.c b/fs/affs/namei.c index a2d68f8..29186d2 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -414,12 +414,16 @@ affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) int affs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct super_block *sb = old_dir->i_sb; struct buffer_head *bh = NULL; int retval; + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + pr_debug("%s(old=%lu,\"%pd\" to new=%lu,\"%pd\")\n", __func__, old_dir->i_ino, old_dentry, new_dir->i_ino, new_dentry); diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 7ef637d..1e9d2f8 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -461,8 +461,8 @@ static void afs_callback_updater(struct work_struct *work) */ int __init afs_callback_update_init(void) { - afs_callback_update_worker = - create_singlethread_workqueue("kafs_callbackd"); + afs_callback_update_worker = alloc_ordered_workqueue("kafs_callbackd", + WQ_MEM_RECLAIM); return afs_callback_update_worker ? 0 : -ENOMEM; } diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 85737e9..2037e7a 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -17,19 +17,12 @@ #include "internal.h" #include "afs_cm.h" -#if 0 -struct workqueue_struct *afs_cm_workqueue; -#endif /* 0 */ - -static int afs_deliver_cb_init_call_back_state(struct afs_call *, - struct sk_buff *, bool); -static int afs_deliver_cb_init_call_back_state3(struct afs_call *, - struct sk_buff *, bool); -static int afs_deliver_cb_probe(struct afs_call *, struct sk_buff *, bool); -static int afs_deliver_cb_callback(struct afs_call *, struct sk_buff *, bool); -static int afs_deliver_cb_probe_uuid(struct afs_call *, struct sk_buff *, bool); -static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *, - struct sk_buff *, bool); +static int afs_deliver_cb_init_call_back_state(struct afs_call *); +static int afs_deliver_cb_init_call_back_state3(struct afs_call *); +static int afs_deliver_cb_probe(struct afs_call *); +static int afs_deliver_cb_callback(struct afs_call *); +static int afs_deliver_cb_probe_uuid(struct afs_call *); +static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *); static void afs_cm_destructor(struct afs_call *); /* @@ -134,7 +127,7 @@ static void afs_cm_destructor(struct afs_call *call) * received. The step number here must match the final number in * afs_deliver_cb_callback(). */ - if (call->unmarshall == 6) { + if (call->unmarshall == 5) { ASSERT(call->server && call->count && call->request); afs_break_callbacks(call->server, call->count, call->request); } @@ -168,27 +161,27 @@ static void SRXAFSCB_CallBack(struct work_struct *work) /* * deliver request data to a CB.CallBack call */ -static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, - bool last) +static int afs_deliver_cb_callback(struct afs_call *call) { + struct sockaddr_rxrpc srx; struct afs_callback *cb; struct afs_server *server; - struct in_addr addr; __be32 *bp; u32 tmp; int ret, loop; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); switch (call->unmarshall) { case 0: + rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx); call->offset = 0; call->unmarshall++; /* extract the FID array and its count in two steps */ case 1: _debug("extract FID count"); - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -205,8 +198,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, case 2: _debug("extract FID array"); - ret = afs_extract_data(call, skb, last, call->buffer, - call->count * 3 * 4); + ret = afs_extract_data(call, call->buffer, + call->count * 3 * 4, true); if (ret < 0) return ret; @@ -232,7 +225,7 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, /* extract the callback array and its count in two steps */ case 3: _debug("extract CB count"); - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -242,13 +235,11 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, return -EBADMSG; call->offset = 0; call->unmarshall++; - if (tmp == 0) - goto empty_cb_array; case 4: _debug("extract CB array"); - ret = afs_extract_data(call, skb, last, call->request, - call->count * 3 * 4); + ret = afs_extract_data(call, call->buffer, + call->count * 3 * 4, false); if (ret < 0) return ret; @@ -261,15 +252,9 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, cb->type = ntohl(*bp++); } - empty_cb_array: call->offset = 0; call->unmarshall++; - case 5: - ret = afs_data_complete(call, skb, last); - if (ret < 0) - return ret; - /* Record that the message was unmarshalled successfully so * that the call destructor can know do the callback breaking * work, even if the final ACK isn't received. @@ -278,17 +263,15 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb, * updated also. */ call->unmarshall++; - case 6: + case 5: break; } - call->state = AFS_CALL_REPLYING; /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - memcpy(&addr, &ip_hdr(skb)->saddr, 4); - server = afs_find_server(&addr); + server = afs_find_server(&srx); if (!server) return -ENOTCONN; call->server = server; @@ -315,17 +298,17 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work) /* * deliver request data to a CB.InitCallBackState call */ -static int afs_deliver_cb_init_call_back_state(struct afs_call *call, - struct sk_buff *skb, - bool last) +static int afs_deliver_cb_init_call_back_state(struct afs_call *call) { + struct sockaddr_rxrpc srx; struct afs_server *server; - struct in_addr addr; int ret; - _enter(",{%u},%d", skb->len, last); + _enter(""); + + rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx); - ret = afs_data_complete(call, skb, last); + ret = afs_extract_data(call, NULL, 0, false); if (ret < 0) return ret; @@ -334,8 +317,7 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call, /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - memcpy(&addr, &ip_hdr(skb)->saddr, 4); - server = afs_find_server(&addr); + server = afs_find_server(&srx); if (!server) return -ENOTCONN; call->server = server; @@ -348,27 +330,68 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call, /* * deliver request data to a CB.InitCallBackState3 call */ -static int afs_deliver_cb_init_call_back_state3(struct afs_call *call, - struct sk_buff *skb, - bool last) +static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) { + struct sockaddr_rxrpc srx; struct afs_server *server; - struct in_addr addr; + struct afs_uuid *r; + unsigned loop; + __be32 *b; + int ret; + + _enter(""); + + rxrpc_kernel_get_peer(afs_socket, call->rxcall, &srx); + + _enter("{%u}", call->unmarshall); - _enter(",{%u},%d", skb->len, last); + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->buffer = kmalloc(11 * sizeof(__be32), GFP_KERNEL); + if (!call->buffer) + return -ENOMEM; + call->unmarshall++; - /* There are some arguments that we ignore */ - afs_data_consumed(call, skb); - if (!last) - return -EAGAIN; + case 1: + _debug("extract UUID"); + ret = afs_extract_data(call, call->buffer, + 11 * sizeof(__be32), false); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + _debug("unmarshall UUID"); + call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL); + if (!call->request) + return -ENOMEM; + + b = call->buffer; + r = call->request; + r->time_low = ntohl(b[0]); + r->time_mid = ntohl(b[1]); + r->time_hi_and_version = ntohl(b[2]); + r->clock_seq_hi_and_reserved = ntohl(b[3]); + r->clock_seq_low = ntohl(b[4]); + + for (loop = 0; loop < 6; loop++) + r->node[loop] = ntohl(b[loop + 5]); + + call->offset = 0; + call->unmarshall++; + + case 2: + break; + } /* no unmarshalling required */ call->state = AFS_CALL_REPLYING; /* we'll need the file server record as that tells us which set of * vnodes to operate upon */ - memcpy(&addr, &ip_hdr(skb)->saddr, 4); - server = afs_find_server(&addr); + server = afs_find_server(&srx); if (!server) return -ENOTCONN; call->server = server; @@ -393,14 +416,13 @@ static void SRXAFSCB_Probe(struct work_struct *work) /* * deliver request data to a CB.Probe call */ -static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb, - bool last) +static int afs_deliver_cb_probe(struct afs_call *call) { int ret; - _enter(",{%u},%d", skb->len, last); + _enter(""); - ret = afs_data_complete(call, skb, last); + ret = afs_extract_data(call, NULL, 0, false); if (ret < 0) return ret; @@ -426,7 +448,6 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work) _enter(""); - if (memcmp(r, &afs_uuid, sizeof(afs_uuid)) == 0) reply.match = htonl(0); else @@ -439,19 +460,14 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work) /* * deliver request data to a CB.ProbeUuid call */ -static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, - bool last) +static int afs_deliver_cb_probe_uuid(struct afs_call *call) { struct afs_uuid *r; unsigned loop; __be32 *b; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); - - ret = afs_data_complete(call, skb, last); - if (ret < 0) - return ret; + _enter("{%u}", call->unmarshall); switch (call->unmarshall) { case 0: @@ -463,8 +479,8 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, case 1: _debug("extract UUID"); - ret = afs_extract_data(call, skb, last, call->buffer, - 11 * sizeof(__be32)); + ret = afs_extract_data(call, call->buffer, + 11 * sizeof(__be32), false); switch (ret) { case 0: break; case -EAGAIN: return 0; @@ -491,16 +507,9 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, call->unmarshall++; case 2: - _debug("trailer"); - if (skb->len != 0) - return -EBADMSG; break; } - ret = afs_data_complete(call, skb, last); - if (ret < 0) - return ret; - call->state = AFS_CALL_REPLYING; INIT_WORK(&call->work, SRXAFSCB_ProbeUuid); @@ -574,14 +583,13 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work) /* * deliver request data to a CB.TellMeAboutYourself call */ -static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call) { int ret; - _enter(",{%u},%d", skb->len, last); + _enter(""); - ret = afs_data_complete(call, skb, last); + ret = afs_extract_data(call, NULL, 0, false); if (ret < 0) return ret; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index eba5410..51a241e 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -38,7 +38,8 @@ static int afs_link(struct dentry *from, struct inode *dir, static int afs_symlink(struct inode *dir, struct dentry *dentry, const char *content); static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry); + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags); const struct file_operations afs_dir_file_operations = { .open = afs_dir_open, @@ -1083,12 +1084,16 @@ error: * rename a file in an AFS filesystem and/or move it between directories */ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct afs_vnode *orig_dvnode, *new_dvnode, *vnode; struct key *key; int ret; + if (flags) + return -EINVAL; + vnode = AFS_FS_I(d_inode(old_dentry)); orig_dvnode = AFS_FS_I(old_dir); new_dvnode = AFS_FS_I(new_dir); diff --git a/fs/afs/flock.c b/fs/afs/flock.c index d91a9c9..3191dff 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -36,8 +36,8 @@ static int afs_init_lock_manager(void) if (!afs_lock_manager) { mutex_lock(&afs_lock_manager_mutex); if (!afs_lock_manager) { - afs_lock_manager = - create_singlethread_workqueue("kafs_lockd"); + afs_lock_manager = alloc_workqueue("kafs_lockd", + WQ_MEM_RECLAIM, 0); if (!afs_lock_manager) ret = -ENOMEM; } diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 9312b92..96f4d76 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -235,16 +235,15 @@ static void xdr_decode_AFSFetchVolumeStatus(const __be32 **_bp, /* * deliver reply data to an FS.FetchStatus */ -static int afs_deliver_fs_fetch_status(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_fetch_status(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; int ret; - _enter(",,%u", last); + _enter(""); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -307,8 +306,7 @@ int afs_fs_fetch_file_status(struct afs_server *server, /* * deliver reply data to an FS.FetchData */ -static int afs_deliver_fs_fetch_data(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_fetch_data(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; @@ -316,7 +314,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, void *buffer; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); switch (call->unmarshall) { case 0: @@ -332,7 +330,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, * client) */ case 1: _debug("extract data length (MSW)"); - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -347,7 +345,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, /* extract the returned data length */ case 2: _debug("extract data length"); - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -363,10 +361,10 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, _debug("extract data"); if (call->count > 0) { page = call->reply3; - buffer = kmap_atomic(page); - ret = afs_extract_data(call, skb, last, buffer, - call->count); - kunmap_atomic(buffer); + buffer = kmap(page); + ret = afs_extract_data(call, buffer, + call->count, true); + kunmap(buffer); if (ret < 0) return ret; } @@ -376,8 +374,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, /* extract the metadata */ case 4: - ret = afs_extract_data(call, skb, last, call->buffer, - (21 + 3 + 6) * 4); + ret = afs_extract_data(call, call->buffer, + (21 + 3 + 6) * 4, false); if (ret < 0) return ret; @@ -391,18 +389,15 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call, call->unmarshall++; case 5: - ret = afs_data_complete(call, skb, last); - if (ret < 0) - return ret; break; } if (call->count < PAGE_SIZE) { _debug("clear"); page = call->reply3; - buffer = kmap_atomic(page); + buffer = kmap(page); memset(buffer + call->count, 0, PAGE_SIZE - call->count); - kunmap_atomic(buffer); + kunmap(buffer); } _leave(" = 0 [done]"); @@ -515,13 +510,12 @@ int afs_fs_fetch_data(struct afs_server *server, /* * deliver reply data to an FS.GiveUpCallBacks */ -static int afs_deliver_fs_give_up_callbacks(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_give_up_callbacks(struct afs_call *call) { - _enter(",{%u},%d", skb->len, last); + _enter(""); /* shouldn't be any reply data */ - return afs_data_complete(call, skb, last); + return afs_extract_data(call, NULL, 0, false); } /* @@ -599,16 +593,15 @@ int afs_fs_give_up_callbacks(struct afs_server *server, /* * deliver reply data to an FS.CreateFile or an FS.MakeDir */ -static int afs_deliver_fs_create_vnode(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_create_vnode(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -696,16 +689,15 @@ int afs_fs_create(struct afs_server *server, /* * deliver reply data to an FS.RemoveFile or FS.RemoveDir */ -static int afs_deliver_fs_remove(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_remove(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -777,16 +769,15 @@ int afs_fs_remove(struct afs_server *server, /* * deliver reply data to an FS.Link */ -static int afs_deliver_fs_link(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_link(struct afs_call *call) { struct afs_vnode *dvnode = call->reply, *vnode = call->reply2; const __be32 *bp; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -863,16 +854,15 @@ int afs_fs_link(struct afs_server *server, /* * deliver reply data to an FS.Symlink */ -static int afs_deliver_fs_symlink(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_symlink(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -968,16 +958,15 @@ int afs_fs_symlink(struct afs_server *server, /* * deliver reply data to an FS.Rename */ -static int afs_deliver_fs_rename(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_rename(struct afs_call *call) { struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2; const __be32 *bp; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -1072,16 +1061,15 @@ int afs_fs_rename(struct afs_server *server, /* * deliver reply data to an FS.StoreData */ -static int afs_deliver_fs_store_data(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_store_data(struct afs_call *call) { struct afs_vnode *vnode = call->reply; const __be32 *bp; int ret; - _enter(",,%u", last); + _enter(""); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -1251,17 +1239,16 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb, /* * deliver reply data to an FS.StoreStatus */ -static int afs_deliver_fs_store_status(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_store_status(struct afs_call *call) { afs_dataversion_t *store_version; struct afs_vnode *vnode = call->reply; const __be32 *bp; int ret; - _enter(",,%u", last); + _enter(""); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; @@ -1443,14 +1430,13 @@ int afs_fs_setattr(struct afs_server *server, struct key *key, /* * deliver reply data to an FS.GetVolumeStatus */ -static int afs_deliver_fs_get_volume_status(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_get_volume_status(struct afs_call *call) { const __be32 *bp; char *p; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); switch (call->unmarshall) { case 0: @@ -1460,8 +1446,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, /* extract the returned status record */ case 1: _debug("extract status"); - ret = afs_extract_data(call, skb, last, call->buffer, - 12 * 4); + ret = afs_extract_data(call, call->buffer, + 12 * 4, true); if (ret < 0) return ret; @@ -1472,7 +1458,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, /* extract the volume name length */ case 2: - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -1487,8 +1473,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, case 3: _debug("extract volname"); if (call->count > 0) { - ret = afs_extract_data(call, skb, last, call->reply3, - call->count); + ret = afs_extract_data(call, call->reply3, + call->count, true); if (ret < 0) return ret; } @@ -1508,8 +1494,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, call->count = 4 - (call->count & 3); case 4: - ret = afs_extract_data(call, skb, last, call->buffer, - call->count); + ret = afs_extract_data(call, call->buffer, + call->count, true); if (ret < 0) return ret; @@ -1519,7 +1505,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, /* extract the offline message length */ case 5: - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -1534,8 +1520,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, case 6: _debug("extract offline"); if (call->count > 0) { - ret = afs_extract_data(call, skb, last, call->reply3, - call->count); + ret = afs_extract_data(call, call->reply3, + call->count, true); if (ret < 0) return ret; } @@ -1555,8 +1541,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, call->count = 4 - (call->count & 3); case 7: - ret = afs_extract_data(call, skb, last, call->buffer, - call->count); + ret = afs_extract_data(call, call->buffer, + call->count, true); if (ret < 0) return ret; @@ -1566,7 +1552,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, /* extract the message of the day length */ case 8: - ret = afs_extract_data(call, skb, last, &call->tmp, 4); + ret = afs_extract_data(call, &call->tmp, 4, true); if (ret < 0) return ret; @@ -1581,8 +1567,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, case 9: _debug("extract motd"); if (call->count > 0) { - ret = afs_extract_data(call, skb, last, call->reply3, - call->count); + ret = afs_extract_data(call, call->reply3, + call->count, true); if (ret < 0) return ret; } @@ -1595,26 +1581,17 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call, call->unmarshall++; /* extract the message of the day padding */ - if ((call->count & 3) == 0) { - call->unmarshall++; - goto no_motd_padding; - } - call->count = 4 - (call->count & 3); + call->count = (4 - (call->count & 3)) & 3; case 10: - ret = afs_extract_data(call, skb, last, call->buffer, - call->count); + ret = afs_extract_data(call, call->buffer, + call->count, false); if (ret < 0) return ret; call->offset = 0; call->unmarshall++; - no_motd_padding: - case 11: - ret = afs_data_complete(call, skb, last); - if (ret < 0) - return ret; break; } @@ -1685,15 +1662,14 @@ int afs_fs_get_volume_status(struct afs_server *server, /* * deliver reply data to an FS.SetLock, FS.ExtendLock or FS.ReleaseLock */ -static int afs_deliver_fs_xxxx_lock(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_fs_xxxx_lock(struct afs_call *call) { const __be32 *bp; int ret; - _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + _enter("{%u}", call->unmarshall); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index df976b2..5497c84 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -13,13 +13,13 @@ #include <linux/kernel.h> #include <linux/fs.h> #include <linux/pagemap.h> -#include <linux/skbuff.h> #include <linux/rxrpc.h> #include <linux/key.h> #include <linux/workqueue.h> #include <linux/sched.h> #include <linux/fscache.h> #include <linux/backing-dev.h> +#include <net/af_rxrpc.h> #include "afs.h" #include "afs_vl.h" @@ -56,7 +56,7 @@ struct afs_mount_params { */ struct afs_wait_mode { /* RxRPC received message notification */ - void (*rx_wakeup)(struct afs_call *call); + rxrpc_notify_rx_t notify_rx; /* synchronous call waiter and call dispatched notification */ int (*wait)(struct afs_call *call); @@ -75,10 +75,8 @@ struct afs_call { const struct afs_call_type *type; /* type of call */ const struct afs_wait_mode *wait_mode; /* completion wait mode */ wait_queue_head_t waitq; /* processes awaiting completion */ - void (*async_workfn)(struct afs_call *call); /* asynchronous work function */ struct work_struct async_work; /* asynchronous work processor */ struct work_struct work; /* actual work processor */ - struct sk_buff_head rx_queue; /* received packets */ struct rxrpc_call *rxcall; /* RxRPC call handle */ struct key *key; /* security for this call */ struct afs_server *server; /* server affected by incoming CM call */ @@ -92,6 +90,7 @@ struct afs_call { void *reply4; /* reply buffer (fourth part) */ pgoff_t first; /* first page in mapping to deal with */ pgoff_t last; /* last page in mapping to deal with */ + size_t offset; /* offset into received data store */ enum { /* call state */ AFS_CALL_REQUESTING, /* request is being sent for outgoing call */ AFS_CALL_AWAIT_REPLY, /* awaiting reply to outgoing call */ @@ -99,21 +98,18 @@ struct afs_call { AFS_CALL_AWAIT_REQUEST, /* awaiting request data on incoming call */ AFS_CALL_REPLYING, /* replying to incoming call */ AFS_CALL_AWAIT_ACK, /* awaiting final ACK of incoming call */ - AFS_CALL_COMPLETE, /* successfully completed */ - AFS_CALL_BUSY, /* server was busy */ - AFS_CALL_ABORTED, /* call was aborted */ - AFS_CALL_ERROR, /* call failed due to error */ + AFS_CALL_COMPLETE, /* Completed or failed */ } state; int error; /* error code */ + u32 abort_code; /* Remote abort ID or 0 */ unsigned request_size; /* size of request data */ unsigned reply_max; /* maximum size of reply */ - unsigned reply_size; /* current size of reply */ unsigned first_offset; /* offset into mapping[first] */ unsigned last_to; /* amount of mapping[last] */ - unsigned offset; /* offset into received data store */ unsigned char unmarshall; /* unmarshalling phase */ bool incoming; /* T if incoming call */ bool send_pages; /* T if data from mapping should be sent */ + bool need_attention; /* T if RxRPC poked us */ u16 service_id; /* RxRPC service ID to call */ __be16 port; /* target UDP port */ __be32 operation_ID; /* operation ID for an incoming call */ @@ -128,8 +124,7 @@ struct afs_call_type { /* deliver request or reply data to an call * - returning an error will cause the call to be aborted */ - int (*deliver)(struct afs_call *call, struct sk_buff *skb, - bool last); + int (*deliver)(struct afs_call *call); /* map an abort code to an error number */ int (*abort_to_error)(u32 abort_code); @@ -607,29 +602,22 @@ extern void afs_proc_cell_remove(struct afs_cell *); /* * rxrpc.c */ +extern struct socket *afs_socket; + extern int afs_open_socket(void); extern void afs_close_socket(void); -extern void afs_data_consumed(struct afs_call *, struct sk_buff *); extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t, const struct afs_wait_mode *); extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *, size_t, size_t); extern void afs_flat_call_destructor(struct afs_call *); -extern int afs_transfer_reply(struct afs_call *, struct sk_buff *, bool); extern void afs_send_empty_reply(struct afs_call *); extern void afs_send_simple_reply(struct afs_call *, const void *, size_t); -extern int afs_extract_data(struct afs_call *, struct sk_buff *, bool, void *, - size_t); +extern int afs_extract_data(struct afs_call *, void *, size_t, bool); -static inline int afs_data_complete(struct afs_call *call, struct sk_buff *skb, - bool last) +static inline int afs_transfer_reply(struct afs_call *call) { - if (skb->len > 0) - return -EBADMSG; - afs_data_consumed(call, skb); - if (!last) - return -EAGAIN; - return 0; + return afs_extract_data(call, call->buffer, call->reply_max, false); } /* @@ -654,7 +642,7 @@ do { \ extern struct afs_server *afs_lookup_server(struct afs_cell *, const struct in_addr *); -extern struct afs_server *afs_find_server(const struct in_addr *); +extern struct afs_server *afs_find_server(const struct sockaddr_rxrpc *); extern void afs_put_server(struct afs_server *); extern void __exit afs_purge_servers(void); diff --git a/fs/afs/main.c b/fs/afs/main.c index 35de0c0..0b187ef 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -14,6 +14,7 @@ #include <linux/init.h> #include <linux/completion.h> #include <linux/sched.h> +#include <linux/random.h> #include "internal.h" MODULE_DESCRIPTION("AFS Client File System"); diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 14d04c8..477928b 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -16,34 +16,36 @@ #include "internal.h" #include "afs_cm.h" -static struct socket *afs_socket; /* my RxRPC socket */ +struct socket *afs_socket; /* my RxRPC socket */ static struct workqueue_struct *afs_async_calls; +static struct afs_call *afs_spare_incoming_call; static atomic_t afs_outstanding_calls; -static atomic_t afs_outstanding_skbs; -static void afs_wake_up_call_waiter(struct afs_call *); +static void afs_free_call(struct afs_call *); +static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long); static int afs_wait_for_call_to_complete(struct afs_call *); -static void afs_wake_up_async_call(struct afs_call *); +static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long); static int afs_dont_wait_for_call_to_complete(struct afs_call *); -static void afs_process_async_call(struct afs_call *); -static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *); -static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool); +static void afs_process_async_call(struct work_struct *); +static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long); +static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long); +static int afs_deliver_cm_op_id(struct afs_call *); /* synchronous call management */ const struct afs_wait_mode afs_sync_call = { - .rx_wakeup = afs_wake_up_call_waiter, + .notify_rx = afs_wake_up_call_waiter, .wait = afs_wait_for_call_to_complete, }; /* asynchronous call management */ const struct afs_wait_mode afs_async_call = { - .rx_wakeup = afs_wake_up_async_call, + .notify_rx = afs_wake_up_async_call, .wait = afs_dont_wait_for_call_to_complete, }; /* asynchronous incoming call management */ static const struct afs_wait_mode afs_async_incoming_call = { - .rx_wakeup = afs_wake_up_async_call, + .notify_rx = afs_wake_up_async_call, }; /* asynchronous incoming call initial processing */ @@ -53,17 +55,9 @@ static const struct afs_call_type afs_RXCMxxxx = { .abort_to_error = afs_abort_to_error, }; -static void afs_collect_incoming_call(struct work_struct *); +static void afs_charge_preallocation(struct work_struct *); -static struct sk_buff_head afs_incoming_calls; -static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call); - -static void afs_async_workfn(struct work_struct *work) -{ - struct afs_call *call = container_of(work, struct afs_call, async_work); - - call->async_workfn(call); -} +static DECLARE_WORK(afs_charge_preallocation_work, afs_charge_preallocation); static int afs_wait_atomic_t(atomic_t *p) { @@ -83,10 +77,8 @@ int afs_open_socket(void) _enter(""); - skb_queue_head_init(&afs_incoming_calls); - ret = -ENOMEM; - afs_async_calls = create_singlethread_workqueue("kafsd"); + afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM, 0); if (!afs_async_calls) goto error_0; @@ -110,13 +102,15 @@ int afs_open_socket(void) if (ret < 0) goto error_2; + rxrpc_kernel_new_call_notification(socket, afs_rx_new_call, + afs_rx_discard_new_call); + ret = kernel_listen(socket, INT_MAX); if (ret < 0) goto error_2; - rxrpc_kernel_intercept_rx_messages(socket, afs_rx_interceptor); - afs_socket = socket; + afs_charge_preallocation(NULL); _leave(" = 0"); return 0; @@ -136,52 +130,28 @@ void afs_close_socket(void) { _enter(""); + if (afs_spare_incoming_call) { + atomic_inc(&afs_outstanding_calls); + afs_free_call(afs_spare_incoming_call); + afs_spare_incoming_call = NULL; + } + + _debug("outstanding %u", atomic_read(&afs_outstanding_calls)); wait_on_atomic_t(&afs_outstanding_calls, afs_wait_atomic_t, TASK_UNINTERRUPTIBLE); _debug("no outstanding calls"); + flush_workqueue(afs_async_calls); + kernel_sock_shutdown(afs_socket, SHUT_RDWR); + flush_workqueue(afs_async_calls); sock_release(afs_socket); _debug("dework"); destroy_workqueue(afs_async_calls); - - ASSERTCMP(atomic_read(&afs_outstanding_skbs), ==, 0); _leave(""); } /* - * Note that the data in a socket buffer is now consumed. - */ -void afs_data_consumed(struct afs_call *call, struct sk_buff *skb) -{ - if (!skb) { - _debug("DLVR NULL [%d]", atomic_read(&afs_outstanding_skbs)); - dump_stack(); - } else { - _debug("DLVR %p{%u} [%d]", - skb, skb->mark, atomic_read(&afs_outstanding_skbs)); - rxrpc_kernel_data_consumed(call->rxcall, skb); - } -} - -/* - * free a socket buffer - */ -static void afs_free_skb(struct sk_buff *skb) -{ - if (!skb) { - _debug("FREE NULL [%d]", atomic_read(&afs_outstanding_skbs)); - dump_stack(); - } else { - _debug("FREE %p{%u} [%d]", - skb, skb->mark, atomic_read(&afs_outstanding_skbs)); - if (atomic_dec_return(&afs_outstanding_skbs) == -1) - BUG(); - rxrpc_kernel_free_skb(skb); - } -} - -/* * free a call */ static void afs_free_call(struct afs_call *call) @@ -191,7 +161,6 @@ static void afs_free_call(struct afs_call *call) ASSERTCMP(call->rxcall, ==, NULL); ASSERT(!work_pending(&call->async_work)); - ASSERT(skb_queue_empty(&call->rx_queue)); ASSERT(call->type->name != NULL); kfree(call->request); @@ -207,7 +176,7 @@ static void afs_free_call(struct afs_call *call) static void afs_end_call_nofree(struct afs_call *call) { if (call->rxcall) { - rxrpc_kernel_end_call(call->rxcall); + rxrpc_kernel_end_call(afs_socket, call->rxcall); call->rxcall = NULL; } if (call->type->destructor) @@ -227,7 +196,7 @@ static void afs_end_call(struct afs_call *call) * allocate a call with flat request and reply buffers */ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, - size_t request_size, size_t reply_size) + size_t request_size, size_t reply_max) { struct afs_call *call; @@ -241,7 +210,7 @@ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, call->type = type; call->request_size = request_size; - call->reply_max = reply_size; + call->reply_max = reply_max; if (request_size) { call->request = kmalloc(request_size, GFP_NOFS); @@ -249,14 +218,13 @@ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type, goto nomem_free; } - if (reply_size) { - call->buffer = kmalloc(reply_size, GFP_NOFS); + if (reply_max) { + call->buffer = kmalloc(reply_max, GFP_NOFS); if (!call->buffer) goto nomem_free; } init_waitqueue_head(&call->waitq); - skb_queue_head_init(&call->rx_queue); return call; nomem_free: @@ -325,8 +293,8 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg, * returns from sending the request */ if (first + loop >= last) call->state = AFS_CALL_AWAIT_REPLY; - ret = rxrpc_kernel_send_data(call->rxcall, msg, - to - offset); + ret = rxrpc_kernel_send_data(afs_socket, call->rxcall, + msg, to - offset); kunmap(pages[loop]); if (ret < 0) break; @@ -354,7 +322,6 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, struct msghdr msg; struct kvec iov[1]; int ret; - struct sk_buff *skb; _enter("%x,{%d},", addr->s_addr, ntohs(call->port)); @@ -366,8 +333,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, atomic_read(&afs_outstanding_calls)); call->wait_mode = wait_mode; - call->async_workfn = afs_process_async_call; - INIT_WORK(&call->async_work, afs_async_workfn); + INIT_WORK(&call->async_work, afs_process_async_call); memset(&srx, 0, sizeof(srx)); srx.srx_family = AF_RXRPC; @@ -380,7 +346,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, /* create a call */ rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key, - (unsigned long) call, gfp); + (unsigned long) call, gfp, + wait_mode->notify_rx); call->key = NULL; if (IS_ERR(rxcall)) { ret = PTR_ERR(rxcall); @@ -406,7 +373,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, * request */ if (!call->send_pages) call->state = AFS_CALL_AWAIT_REPLY; - ret = rxrpc_kernel_send_data(rxcall, &msg, call->request_size); + ret = rxrpc_kernel_send_data(afs_socket, rxcall, + &msg, call->request_size); if (ret < 0) goto error_do_abort; @@ -421,9 +389,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, return wait_mode->wait(call); error_do_abort: - rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT); - while ((skb = skb_dequeue(&call->rx_queue))) - afs_free_skb(skb); + rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT, -ret, "KSD"); error_kill_call: afs_end_call(call); _leave(" = %d", ret); @@ -431,140 +397,77 @@ error_kill_call: } /* - * Handles intercepted messages that were arriving in the socket's Rx queue. - * - * Called from the AF_RXRPC call processor in waitqueue process context. For - * each call, it is guaranteed this will be called in order of packet to be - * delivered. - */ -static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID, - struct sk_buff *skb) -{ - struct afs_call *call = (struct afs_call *) user_call_ID; - - _enter("%p,,%u", call, skb->mark); - - _debug("ICPT %p{%u} [%d]", - skb, skb->mark, atomic_read(&afs_outstanding_skbs)); - - ASSERTCMP(sk, ==, afs_socket->sk); - atomic_inc(&afs_outstanding_skbs); - - if (!call) { - /* its an incoming call for our callback service */ - skb_queue_tail(&afs_incoming_calls, skb); - queue_work(afs_wq, &afs_collect_incoming_call_work); - } else { - /* route the messages directly to the appropriate call */ - skb_queue_tail(&call->rx_queue, skb); - call->wait_mode->rx_wakeup(call); - } - - _leave(""); -} - -/* * deliver messages to a call */ static void afs_deliver_to_call(struct afs_call *call) { - struct sk_buff *skb; - bool last; u32 abort_code; int ret; - _enter(""); - - while ((call->state == AFS_CALL_AWAIT_REPLY || - call->state == AFS_CALL_AWAIT_OP_ID || - call->state == AFS_CALL_AWAIT_REQUEST || - call->state == AFS_CALL_AWAIT_ACK) && - (skb = skb_dequeue(&call->rx_queue))) { - switch (skb->mark) { - case RXRPC_SKB_MARK_DATA: - _debug("Rcv DATA"); - last = rxrpc_kernel_is_data_last(skb); - ret = call->type->deliver(call, skb, last); - switch (ret) { - case -EAGAIN: - if (last) { - _debug("short data"); - goto unmarshal_error; - } - break; - case 0: - ASSERT(last); - if (call->state == AFS_CALL_AWAIT_REPLY) - call->state = AFS_CALL_COMPLETE; - break; - case -ENOTCONN: - abort_code = RX_CALL_DEAD; - goto do_abort; - case -ENOTSUPP: - abort_code = RX_INVALID_OPERATION; - goto do_abort; - default: - unmarshal_error: - abort_code = RXGEN_CC_UNMARSHAL; - if (call->state != AFS_CALL_AWAIT_REPLY) - abort_code = RXGEN_SS_UNMARSHAL; - do_abort: - rxrpc_kernel_abort_call(call->rxcall, - abort_code); - call->error = ret; - call->state = AFS_CALL_ERROR; - break; + _enter("%s", call->type->name); + + while (call->state == AFS_CALL_AWAIT_REPLY || + call->state == AFS_CALL_AWAIT_OP_ID || + call->state == AFS_CALL_AWAIT_REQUEST || + call->state == AFS_CALL_AWAIT_ACK + ) { + if (call->state == AFS_CALL_AWAIT_ACK) { + size_t offset = 0; + ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall, + NULL, 0, &offset, false, + &call->abort_code); + if (ret == -EINPROGRESS || ret == -EAGAIN) + return; + if (ret == 1 || ret < 0) { + call->state = AFS_CALL_COMPLETE; + goto done; } - break; - case RXRPC_SKB_MARK_FINAL_ACK: - _debug("Rcv ACK"); - call->state = AFS_CALL_COMPLETE; - break; - case RXRPC_SKB_MARK_BUSY: - _debug("Rcv BUSY"); - call->error = -EBUSY; - call->state = AFS_CALL_BUSY; - break; - case RXRPC_SKB_MARK_REMOTE_ABORT: - abort_code = rxrpc_kernel_get_abort_code(skb); - call->error = call->type->abort_to_error(abort_code); - call->state = AFS_CALL_ABORTED; - _debug("Rcv ABORT %u -> %d", abort_code, call->error); - break; - case RXRPC_SKB_MARK_LOCAL_ABORT: - abort_code = rxrpc_kernel_get_abort_code(skb); - call->error = call->type->abort_to_error(abort_code); - call->state = AFS_CALL_ABORTED; - _debug("Loc ABORT %u -> %d", abort_code, call->error); - break; - case RXRPC_SKB_MARK_NET_ERROR: - call->error = -rxrpc_kernel_get_error_number(skb); - call->state = AFS_CALL_ERROR; - _debug("Rcv NET ERROR %d", call->error); - break; - case RXRPC_SKB_MARK_LOCAL_ERROR: - call->error = -rxrpc_kernel_get_error_number(skb); - call->state = AFS_CALL_ERROR; - _debug("Rcv LOCAL ERROR %d", call->error); - break; - default: - BUG(); - break; + return; } - afs_free_skb(skb); - } - - /* make sure the queue is empty if the call is done with (we might have - * aborted the call early because of an unmarshalling error) */ - if (call->state >= AFS_CALL_COMPLETE) { - while ((skb = skb_dequeue(&call->rx_queue))) - afs_free_skb(skb); - if (call->incoming) - afs_end_call(call); + ret = call->type->deliver(call); + switch (ret) { + case 0: + if (call->state == AFS_CALL_AWAIT_REPLY) + call->state = AFS_CALL_COMPLETE; + goto done; + case -EINPROGRESS: + case -EAGAIN: + goto out; + case -ENOTCONN: + abort_code = RX_CALL_DEAD; + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + abort_code, -ret, "KNC"); + goto do_abort; + case -ENOTSUPP: + abort_code = RX_INVALID_OPERATION; + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + abort_code, -ret, "KIV"); + goto do_abort; + case -ENODATA: + case -EBADMSG: + case -EMSGSIZE: + default: + abort_code = RXGEN_CC_UNMARSHAL; + if (call->state != AFS_CALL_AWAIT_REPLY) + abort_code = RXGEN_SS_UNMARSHAL; + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + abort_code, EBADMSG, "KUM"); + goto do_abort; + } } +done: + if (call->state == AFS_CALL_COMPLETE && call->incoming) + afs_end_call(call); +out: _leave(""); + return; + +do_abort: + call->error = ret; + call->state = AFS_CALL_COMPLETE; + goto done; } /* @@ -572,7 +475,7 @@ static void afs_deliver_to_call(struct afs_call *call) */ static int afs_wait_for_call_to_complete(struct afs_call *call) { - struct sk_buff *skb; + const char *abort_why; int ret; DECLARE_WAITQUEUE(myself, current); @@ -584,15 +487,18 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) set_current_state(TASK_INTERRUPTIBLE); /* deliver any messages that are in the queue */ - if (!skb_queue_empty(&call->rx_queue)) { + if (call->state < AFS_CALL_COMPLETE && call->need_attention) { + call->need_attention = false; __set_current_state(TASK_RUNNING); afs_deliver_to_call(call); continue; } + abort_why = "KWC"; ret = call->error; - if (call->state >= AFS_CALL_COMPLETE) + if (call->state == AFS_CALL_COMPLETE) break; + abort_why = "KWI"; ret = -EINTR; if (signal_pending(current)) break; @@ -605,9 +511,8 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) /* kill the call */ if (call->state < AFS_CALL_COMPLETE) { _debug("call incomplete"); - rxrpc_kernel_abort_call(call->rxcall, RX_CALL_DEAD); - while ((skb = skb_dequeue(&call->rx_queue))) - afs_free_skb(skb); + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + RX_CALL_DEAD, -ret, abort_why); } _debug("call complete"); @@ -619,17 +524,24 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) /* * wake up a waiting call */ -static void afs_wake_up_call_waiter(struct afs_call *call) +static void afs_wake_up_call_waiter(struct sock *sk, struct rxrpc_call *rxcall, + unsigned long call_user_ID) { + struct afs_call *call = (struct afs_call *)call_user_ID; + + call->need_attention = true; wake_up(&call->waitq); } /* * wake up an asynchronous call */ -static void afs_wake_up_async_call(struct afs_call *call) +static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall, + unsigned long call_user_ID) { - _enter(""); + struct afs_call *call = (struct afs_call *)call_user_ID; + + call->need_attention = true; queue_work(afs_async_calls, &call->async_work); } @@ -647,8 +559,10 @@ static int afs_dont_wait_for_call_to_complete(struct afs_call *call) /* * delete an asynchronous call */ -static void afs_delete_async_call(struct afs_call *call) +static void afs_delete_async_call(struct work_struct *work) { + struct afs_call *call = container_of(work, struct afs_call, async_work); + _enter(""); afs_free_call(call); @@ -658,17 +572,19 @@ static void afs_delete_async_call(struct afs_call *call) /* * perform processing on an asynchronous call - * - on a multiple-thread workqueue this work item may try to run on several - * CPUs at the same time */ -static void afs_process_async_call(struct afs_call *call) +static void afs_process_async_call(struct work_struct *work) { + struct afs_call *call = container_of(work, struct afs_call, async_work); + _enter(""); - if (!skb_queue_empty(&call->rx_queue)) + if (call->state < AFS_CALL_COMPLETE && call->need_attention) { + call->need_attention = false; afs_deliver_to_call(call); + } - if (call->state >= AFS_CALL_COMPLETE && call->wait_mode) { + if (call->state == AFS_CALL_COMPLETE && call->wait_mode) { if (call->wait_mode->async_complete) call->wait_mode->async_complete(call->reply, call->error); @@ -679,122 +595,93 @@ static void afs_process_async_call(struct afs_call *call) /* we can't just delete the call because the work item may be * queued */ - call->async_workfn = afs_delete_async_call; + call->async_work.func = afs_delete_async_call; queue_work(afs_async_calls, &call->async_work); } _leave(""); } -/* - * Empty a socket buffer into a flat reply buffer. - */ -int afs_transfer_reply(struct afs_call *call, struct sk_buff *skb, bool last) +static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID) { - size_t len = skb->len; - - if (len > call->reply_max - call->reply_size) { - _leave(" = -EBADMSG [%zu > %u]", - len, call->reply_max - call->reply_size); - return -EBADMSG; - } + struct afs_call *call = (struct afs_call *)user_call_ID; - if (len > 0) { - if (skb_copy_bits(skb, 0, call->buffer + call->reply_size, - len) < 0) - BUG(); - call->reply_size += len; - } - - afs_data_consumed(call, skb); - if (!last) - return -EAGAIN; - - if (call->reply_size != call->reply_max) { - _leave(" = -EBADMSG [%u != %u]", - call->reply_size, call->reply_max); - return -EBADMSG; - } - return 0; + call->rxcall = rxcall; } /* - * accept the backlog of incoming calls + * Charge the incoming call preallocation. */ -static void afs_collect_incoming_call(struct work_struct *work) +static void afs_charge_preallocation(struct work_struct *work) { - struct rxrpc_call *rxcall; - struct afs_call *call = NULL; - struct sk_buff *skb; - - while ((skb = skb_dequeue(&afs_incoming_calls))) { - _debug("new call"); - - /* don't need the notification */ - afs_free_skb(skb); + struct afs_call *call = afs_spare_incoming_call; + for (;;) { if (!call) { call = kzalloc(sizeof(struct afs_call), GFP_KERNEL); - if (!call) { - rxrpc_kernel_reject_call(afs_socket); - return; - } + if (!call) + break; - call->async_workfn = afs_process_async_call; - INIT_WORK(&call->async_work, afs_async_workfn); + INIT_WORK(&call->async_work, afs_process_async_call); call->wait_mode = &afs_async_incoming_call; call->type = &afs_RXCMxxxx; init_waitqueue_head(&call->waitq); - skb_queue_head_init(&call->rx_queue); call->state = AFS_CALL_AWAIT_OP_ID; - - _debug("CALL %p{%s} [%d]", - call, call->type->name, - atomic_read(&afs_outstanding_calls)); - atomic_inc(&afs_outstanding_calls); } - rxcall = rxrpc_kernel_accept_call(afs_socket, - (unsigned long) call); - if (!IS_ERR(rxcall)) { - call->rxcall = rxcall; - call = NULL; - } + if (rxrpc_kernel_charge_accept(afs_socket, + afs_wake_up_async_call, + afs_rx_attach, + (unsigned long)call, + GFP_KERNEL) < 0) + break; + call = NULL; } + afs_spare_incoming_call = call; +} + +/* + * Discard a preallocated call when a socket is shut down. + */ +static void afs_rx_discard_new_call(struct rxrpc_call *rxcall, + unsigned long user_call_ID) +{ + struct afs_call *call = (struct afs_call *)user_call_ID; - if (call) - afs_free_call(call); + atomic_inc(&afs_outstanding_calls); + call->rxcall = NULL; + afs_free_call(call); +} + +/* + * Notification of an incoming call. + */ +static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall, + unsigned long user_call_ID) +{ + atomic_inc(&afs_outstanding_calls); + queue_work(afs_wq, &afs_charge_preallocation_work); } /* * Grab the operation ID from an incoming cache manager call. The socket * buffer is discarded on error or if we don't yet have sufficient data. */ -static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb, - bool last) +static int afs_deliver_cm_op_id(struct afs_call *call) { - size_t len = skb->len; - void *oibuf = (void *) &call->operation_ID; + int ret; - _enter("{%u},{%zu},%d", call->offset, len, last); + _enter("{%zu}", call->offset); ASSERTCMP(call->offset, <, 4); /* the operation ID forms the first four bytes of the request data */ - len = min_t(size_t, len, 4 - call->offset); - if (skb_copy_bits(skb, 0, oibuf + call->offset, len) < 0) - BUG(); - if (!pskb_pull(skb, len)) - BUG(); - call->offset += len; - - if (call->offset < 4) { - afs_data_consumed(call, skb); - _leave(" = -EAGAIN"); - return -EAGAIN; - } + ret = afs_extract_data(call, &call->operation_ID, 4, true); + if (ret < 0) + return ret; call->state = AFS_CALL_AWAIT_REQUEST; + call->offset = 0; /* ask the cache manager to route the call (it'll change the call type * if successful) */ @@ -803,7 +690,7 @@ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb, /* pass responsibility for the remainer of this message off to the * cache manager op */ - return call->type->deliver(call, skb, last); + return call->type->deliver(call); } /* @@ -823,14 +710,15 @@ void afs_send_empty_reply(struct afs_call *call) msg.msg_flags = 0; call->state = AFS_CALL_AWAIT_ACK; - switch (rxrpc_kernel_send_data(call->rxcall, &msg, 0)) { + switch (rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, 0)) { case 0: _leave(" [replied]"); return; case -ENOMEM: _debug("oom"); - rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + RX_USER_ABORT, ENOMEM, "KOO"); default: afs_end_call(call); _leave(" [error]"); @@ -859,7 +747,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) msg.msg_flags = 0; call->state = AFS_CALL_AWAIT_ACK; - n = rxrpc_kernel_send_data(call->rxcall, &msg, len); + n = rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, len); if (n >= 0) { /* Success */ _leave(" [replied]"); @@ -868,7 +756,8 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) if (n == -ENOMEM) { _debug("oom"); - rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT); + rxrpc_kernel_abort_call(afs_socket, call->rxcall, + RX_USER_ABORT, ENOMEM, "KOO"); } afs_end_call(call); _leave(" [error]"); @@ -877,25 +766,40 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) /* * Extract a piece of data from the received data socket buffers. */ -int afs_extract_data(struct afs_call *call, struct sk_buff *skb, - bool last, void *buf, size_t count) +int afs_extract_data(struct afs_call *call, void *buf, size_t count, + bool want_more) { - size_t len = skb->len; + int ret; - _enter("{%u},{%zu},%d,,%zu", call->offset, len, last, count); + _enter("{%s,%zu},,%zu,%d", + call->type->name, call->offset, count, want_more); - ASSERTCMP(call->offset, <, count); + ASSERTCMP(call->offset, <=, count); - len = min_t(size_t, len, count - call->offset); - if (skb_copy_bits(skb, 0, buf + call->offset, len) < 0 || - !pskb_pull(skb, len)) - BUG(); - call->offset += len; + ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall, + buf, count, &call->offset, + want_more, &call->abort_code); + if (ret == 0 || ret == -EAGAIN) + return ret; - if (call->offset < count) { - afs_data_consumed(call, skb); - _leave(" = -EAGAIN"); - return -EAGAIN; + if (ret == 1) { + switch (call->state) { + case AFS_CALL_AWAIT_REPLY: + call->state = AFS_CALL_COMPLETE; + break; + case AFS_CALL_AWAIT_REQUEST: + call->state = AFS_CALL_REPLYING; + break; + default: + break; + } + return 0; } - return 0; + + if (ret == -ECONNABORTED) + call->error = call->type->abort_to_error(call->abort_code); + else + call->error = ret; + call->state = AFS_CALL_COMPLETE; + return ret; } diff --git a/fs/afs/server.c b/fs/afs/server.c index f342acf..d4066ab 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -178,13 +178,18 @@ server_in_two_cells: /* * look up a server by its IP address */ -struct afs_server *afs_find_server(const struct in_addr *_addr) +struct afs_server *afs_find_server(const struct sockaddr_rxrpc *srx) { struct afs_server *server = NULL; struct rb_node *p; - struct in_addr addr = *_addr; + struct in_addr addr = srx->transport.sin.sin_addr; - _enter("%pI4", &addr.s_addr); + _enter("{%d,%pI4}", srx->transport.family, &addr.s_addr); + + if (srx->transport.family != AF_INET) { + WARN(true, "AFS does not yes support non-IPv4 addresses\n"); + return NULL; + } read_lock(&afs_servers_lock); diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index f94d1ab..94bcd97 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -58,17 +58,16 @@ static int afs_vl_abort_to_error(u32 abort_code) /* * deliver reply data to a VL.GetEntryByXXX call */ -static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call) { struct afs_cache_vlocation *entry; __be32 *bp; u32 tmp; int loop, ret; - _enter(",,%u", last); + _enter(""); - ret = afs_transfer_reply(call, skb, last); + ret = afs_transfer_reply(call); if (ret < 0) return ret; diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index 5297678..45a8639 100644 --- a/fs/afs/vlocation.c +++ b/fs/afs/vlocation.c @@ -594,8 +594,8 @@ static void afs_vlocation_reaper(struct work_struct *work) */ int __init afs_vlocation_update_init(void) { - afs_vlocation_update_worker = - create_singlethread_workqueue("kafs_vlupdated"); + afs_vlocation_update_worker = alloc_workqueue("kafs_vlupdated", + WQ_MEM_RECLAIM, 0); return afs_vlocation_update_worker ? 0 : -ENOMEM; } diff --git a/fs/afs/write.c b/fs/afs/write.c index 14d506e..f865c3f 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -398,8 +398,7 @@ no_more: switch (ret) { case -EDQUOT: case -ENOSPC: - set_bit(AS_ENOSPC, - &wb->vnode->vfs_inode.i_mapping->flags); + mapping_set_error(wb->vnode->vfs_inode.i_mapping, -ENOSPC); break; case -EROFS: case -EIO: @@ -409,7 +408,7 @@ no_more: case -ENOMEDIUM: case -ENXIO: afs_kill_pages(wb->vnode, true, first, last); - set_bit(AS_EIO, &wb->vnode->vfs_inode.i_mapping->flags); + mapping_set_error(wb->vnode->vfs_inode.i_mapping, -EIO); break; case -EACCES: case -EPERM: @@ -274,14 +274,17 @@ __initcall(aio_setup); static void put_aio_ring_file(struct kioctx *ctx) { struct file *aio_ring_file = ctx->aio_ring_file; + struct address_space *i_mapping; + if (aio_ring_file) { truncate_setsize(aio_ring_file->f_inode, 0); /* Prevent further access to the kioctx from migratepages */ - spin_lock(&aio_ring_file->f_inode->i_mapping->private_lock); - aio_ring_file->f_inode->i_mapping->private_data = NULL; + i_mapping = aio_ring_file->f_inode->i_mapping; + spin_lock(&i_mapping->private_lock); + i_mapping->private_data = NULL; ctx->aio_ring_file = NULL; - spin_unlock(&aio_ring_file->f_inode->i_mapping->private_lock); + spin_unlock(&i_mapping->private_lock); fput(aio_ring_file); } @@ -17,19 +17,22 @@ #include <linux/ima.h> /** - * inode_change_ok - check if attribute changes to an inode are allowed - * @inode: inode to check + * setattr_prepare - check if attribute changes to a dentry are allowed + * @dentry: dentry to check * @attr: attributes to change * * Check if we are allowed to change the attributes contained in @attr - * in the given inode. This includes the normal unix access permission - * checks, as well as checks for rlimits and others. + * in the given dentry. This includes the normal unix access permission + * checks, as well as checks for rlimits and others. The function also clears + * SGID bit from mode if user is not allowed to set it. Also file capabilities + * and IMA extended attributes are cleared if ATTR_KILL_PRIV is set. * * Should be called as the first thing in ->setattr implementations, * possibly after taking additional locks. */ -int inode_change_ok(const struct inode *inode, struct iattr *attr) +int setattr_prepare(struct dentry *dentry, struct iattr *attr) { + struct inode *inode = d_inode(dentry); unsigned int ia_valid = attr->ia_valid; /* @@ -44,7 +47,7 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr) /* If force is set do it anyway. */ if (ia_valid & ATTR_FORCE) - return 0; + goto kill_priv; /* Make sure a caller can chown. */ if ((ia_valid & ATTR_UID) && @@ -77,9 +80,19 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr) return -EPERM; } +kill_priv: + /* User has permission for the change */ + if (ia_valid & ATTR_KILL_PRIV) { + int error; + + error = security_inode_killpriv(dentry); + if (error) + return error; + } + return 0; } -EXPORT_SYMBOL(inode_change_ok); +EXPORT_SYMBOL(setattr_prepare); /** * inode_newsize_ok - may this inode be truncated to a given size @@ -202,6 +215,21 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de return -EPERM; } + /* + * If utimes(2) and friends are called with times == NULL (or both + * times are UTIME_NOW), then we need to check for write permission + */ + if (ia_valid & ATTR_TOUCH) { + if (IS_IMMUTABLE(inode)) + return -EPERM; + + if (!inode_owner_or_capable(inode)) { + error = inode_permission(inode, MAY_WRITE); + if (error) + return error; + } + } + if ((ia_valid & ATTR_MODE)) { umode_t amode = attr->ia_mode; /* Flag setting protected by i_mutex */ @@ -209,7 +237,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de inode->i_flags &= ~S_NOSEC; } - now = current_fs_time(inode->i_sb); + now = current_time(inode); attr->ia_ctime = now; if (!(ia_valid & ATTR_ATIME_SET)) @@ -217,13 +245,11 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de if (!(ia_valid & ATTR_MTIME_SET)) attr->ia_mtime = now; if (ia_valid & ATTR_KILL_PRIV) { - attr->ia_valid &= ~ATTR_KILL_PRIV; - ia_valid &= ~ATTR_KILL_PRIV; error = security_inode_need_killpriv(dentry); - if (error > 0) - error = security_inode_killpriv(dentry); - if (error) + if (error < 0) return error; + if (error == 0) + ia_valid = attr->ia_valid &= ~ATTR_KILL_PRIV; } /* diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index a439548..a1fba42 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -20,7 +20,8 @@ #define AUTOFS_IOC_COUNT 32 #define AUTOFS_DEV_IOCTL_IOC_FIRST (AUTOFS_DEV_IOCTL_VERSION) -#define AUTOFS_DEV_IOCTL_IOC_COUNT (AUTOFS_IOC_COUNT - 11) +#define AUTOFS_DEV_IOCTL_IOC_COUNT \ + (AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD - AUTOFS_DEV_IOCTL_VERSION_CMD) #include <linux/kernel.h> #include <linux/slab.h> @@ -33,8 +34,6 @@ #include <asm/current.h> #include <linux/uaccess.h> -/* #define DEBUG */ - #ifdef pr_fmt #undef pr_fmt #endif @@ -111,8 +110,6 @@ struct autofs_sb_info { int max_proto; unsigned long exp_timeout; unsigned int type; - int reghost_enabled; - int needs_reghost; struct super_block *sb; struct mutex wq_mutex; struct mutex pipe_mutex; @@ -271,4 +268,4 @@ static inline void autofs4_del_expiring(struct dentry *dentry) } } -extern void autofs4_kill_sb(struct super_block *); +void autofs4_kill_sb(struct super_block *); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index c7fcc74..fc09eb7 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -75,7 +75,7 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param) if ((param->ver_major != AUTOFS_DEV_IOCTL_VERSION_MAJOR) || (param->ver_minor > AUTOFS_DEV_IOCTL_VERSION_MINOR)) { pr_warn("ioctl control interface version mismatch: " - "kernel(%u.%u), user(%u.%u), cmd(%d)\n", + "kernel(%u.%u), user(%u.%u), cmd(0x%08x)\n", AUTOFS_DEV_IOCTL_VERSION_MAJOR, AUTOFS_DEV_IOCTL_VERSION_MINOR, param->ver_major, param->ver_minor, cmd); @@ -172,6 +172,17 @@ static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f) return sbi; } +/* Return autofs dev ioctl version */ +static int autofs_dev_ioctl_version(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + /* This should have already been set. */ + param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; + param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; + return 0; +} + /* Return autofs module protocol version */ static int autofs_dev_ioctl_protover(struct file *fp, struct autofs_sb_info *sbi, @@ -586,41 +597,25 @@ out: static ioctl_fn lookup_dev_ioctl(unsigned int cmd) { - static struct { - int cmd; - ioctl_fn fn; - } _ioctls[] = { - {cmd_idx(AUTOFS_DEV_IOCTL_VERSION_CMD), NULL}, - {cmd_idx(AUTOFS_DEV_IOCTL_PROTOVER_CMD), - autofs_dev_ioctl_protover}, - {cmd_idx(AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD), - autofs_dev_ioctl_protosubver}, - {cmd_idx(AUTOFS_DEV_IOCTL_OPENMOUNT_CMD), - autofs_dev_ioctl_openmount}, - {cmd_idx(AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD), - autofs_dev_ioctl_closemount}, - {cmd_idx(AUTOFS_DEV_IOCTL_READY_CMD), - autofs_dev_ioctl_ready}, - {cmd_idx(AUTOFS_DEV_IOCTL_FAIL_CMD), - autofs_dev_ioctl_fail}, - {cmd_idx(AUTOFS_DEV_IOCTL_SETPIPEFD_CMD), - autofs_dev_ioctl_setpipefd}, - {cmd_idx(AUTOFS_DEV_IOCTL_CATATONIC_CMD), - autofs_dev_ioctl_catatonic}, - {cmd_idx(AUTOFS_DEV_IOCTL_TIMEOUT_CMD), - autofs_dev_ioctl_timeout}, - {cmd_idx(AUTOFS_DEV_IOCTL_REQUESTER_CMD), - autofs_dev_ioctl_requester}, - {cmd_idx(AUTOFS_DEV_IOCTL_EXPIRE_CMD), - autofs_dev_ioctl_expire}, - {cmd_idx(AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD), - autofs_dev_ioctl_askumount}, - {cmd_idx(AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD), - autofs_dev_ioctl_ismountpoint} + static ioctl_fn _ioctls[] = { + autofs_dev_ioctl_version, + autofs_dev_ioctl_protover, + autofs_dev_ioctl_protosubver, + autofs_dev_ioctl_openmount, + autofs_dev_ioctl_closemount, + autofs_dev_ioctl_ready, + autofs_dev_ioctl_fail, + autofs_dev_ioctl_setpipefd, + autofs_dev_ioctl_catatonic, + autofs_dev_ioctl_timeout, + autofs_dev_ioctl_requester, + autofs_dev_ioctl_expire, + autofs_dev_ioctl_askumount, + autofs_dev_ioctl_ismountpoint, }; unsigned int idx = cmd_idx(cmd); - return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx].fn; + return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx]; } /* ioctl dispatcher */ @@ -642,7 +637,7 @@ static int _autofs_dev_ioctl(unsigned int command, cmd = _IOC_NR(command); if (_IOC_TYPE(command) != _IOC_TYPE(AUTOFS_DEV_IOCTL_IOC_FIRST) || - cmd - cmd_first >= AUTOFS_DEV_IOCTL_IOC_COUNT) { + cmd - cmd_first > AUTOFS_DEV_IOCTL_IOC_COUNT) { return -ENOTTY; } @@ -655,14 +650,11 @@ static int _autofs_dev_ioctl(unsigned int command, if (err) goto out; - /* The validate routine above always sets the version */ - if (cmd == AUTOFS_DEV_IOCTL_VERSION_CMD) - goto done; - fn = lookup_dev_ioctl(cmd); if (!fn) { pr_warn("unknown command 0x%08x\n", command); - return -ENOTTY; + err = -ENOTTY; + goto out; } fp = NULL; @@ -671,9 +663,11 @@ static int _autofs_dev_ioctl(unsigned int command, /* * For obvious reasons the openmount can't have a file * descriptor yet. We don't take a reference to the - * file during close to allow for immediate release. + * file during close to allow for immediate release, + * and the same for retrieving ioctl version. */ - if (cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD && + if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD && + cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD && cmd != AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD) { fp = fget(param->ioctlfd); if (!fp) { @@ -706,7 +700,6 @@ cont: if (fp) fput(fp); -done: if (err >= 0 && copy_to_user(user, param, AUTOFS_DEV_IOCTL_SIZE)) err = -EFAULT; out: diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 61b2105..438b5bf 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -274,6 +274,23 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) goto fail_dput; } + /* Test versions first */ + if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || + sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) { + pr_err("kernel does not match daemon version " + "daemon (%d, %d) kernel (%d, %d)\n", + sbi->min_proto, sbi->max_proto, + AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION); + goto fail_dput; + } + + /* Establish highest kernel protocol version */ + if (sbi->max_proto > AUTOFS_MAX_PROTO_VERSION) + sbi->version = AUTOFS_MAX_PROTO_VERSION; + else + sbi->version = sbi->max_proto; + sbi->sub_version = AUTOFS_PROTO_SUBVERSION; + if (pgrp_set) { sbi->oz_pgrp = find_get_pid(pgrp); if (!sbi->oz_pgrp) { @@ -291,29 +308,12 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) root_inode->i_fop = &autofs4_root_operations; root_inode->i_op = &autofs4_dir_inode_operations; - /* Couldn't this be tested earlier? */ - if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || - sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) { - pr_err("kernel does not match daemon version " - "daemon (%d, %d) kernel (%d, %d)\n", - sbi->min_proto, sbi->max_proto, - AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION); - goto fail_dput; - } - - /* Establish highest kernel protocol version */ - if (sbi->max_proto > AUTOFS_MAX_PROTO_VERSION) - sbi->version = AUTOFS_MAX_PROTO_VERSION; - else - sbi->version = sbi->max_proto; - sbi->sub_version = AUTOFS_PROTO_SUBVERSION; - pr_debug("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp)); pipe = fget(pipefd); if (!pipe) { pr_err("could not open pipe file descriptor\n"); - goto fail_dput; + goto fail_put_pid; } ret = autofs_prepare_pipe(pipe); if (ret < 0) @@ -334,14 +334,14 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) fail_fput: pr_err("pipe file descriptor does not contain proper ops\n"); fput(pipe); - /* fall through */ +fail_put_pid: + put_pid(sbi->oz_pgrp); fail_dput: dput(root); goto fail_free; fail_ino: - kfree(ino); + autofs4_free_ino(ino); fail_free: - put_pid(sbi->oz_pgrp); kfree(sbi); s->s_fs_info = NULL; return ret; @@ -359,7 +359,7 @@ struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode) inode->i_uid = d_inode(sb->s_root)->i_uid; inode->i_gid = d_inode(sb->s_root)->i_gid; } - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inode->i_ino = get_next_ino(); if (S_ISDIR(mode)) { @@ -368,7 +368,8 @@ struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode) inode->i_fop = &autofs4_dir_operations; } else if (S_ISLNK(mode)) { inode->i_op = &autofs4_symlink_inode_operations; - } + } else + WARN_ON(1); return inode; } diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index fa84bb8..a11f731 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -577,8 +577,6 @@ static int autofs4_dir_symlink(struct inode *dir, inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555); if (!inode) { kfree(cp); - if (!dentry->d_fsdata) - kfree(ino); return -ENOMEM; } inode->i_private = cp; @@ -591,7 +589,7 @@ static int autofs4_dir_symlink(struct inode *dir, if (p_ino && !IS_ROOT(dentry)) atomic_inc(&p_ino->count); - dir->i_mtime = CURRENT_TIME; + dir->i_mtime = current_time(dir); return 0; } @@ -631,7 +629,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) d_inode(dentry)->i_size = 0; clear_nlink(d_inode(dentry)); - dir->i_mtime = CURRENT_TIME; + dir->i_mtime = current_time(dir); spin_lock(&sbi->lookup_lock); __autofs4_add_expiring(dentry); @@ -762,7 +760,7 @@ static int autofs4_dir_mkdir(struct inode *dir, if (p_ino && !IS_ROOT(dentry)) atomic_inc(&p_ino->count); inc_nlink(dir); - dir->i_mtime = CURRENT_TIME; + dir->i_mtime = current_time(dir); return 0; } @@ -842,7 +840,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) if (may_umount(mnt)) status = 1; - pr_debug("returning %d\n", status); + pr_debug("may umount %d\n", status); status = put_user(status, p); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 431fd7e..e44271d 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -431,8 +431,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, memcpy(&wq->name, &qstr, sizeof(struct qstr)); wq->dev = autofs4_get_dev(sbi); wq->ino = autofs4_get_ino(sbi); - wq->uid = current_uid(); - wq->gid = current_gid(); + wq->uid = current_real_cred()->uid; + wq->gid = current_real_cred()->gid; wq->pid = pid; wq->tgid = tgid; wq->status = -EINTR; /* Status return if interrupted */ diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 3ba385e..8712062 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -100,29 +100,12 @@ static int bad_inode_setattr(struct dentry *direntry, struct iattr *attrs) return -EIO; } -static int bad_inode_setxattr(struct dentry *dentry, struct inode *inode, - const char *name, const void *value, size_t size, int flags) -{ - return -EIO; -} - -static ssize_t bad_inode_getxattr(struct dentry *dentry, struct inode *inode, - const char *name, void *buffer, size_t size) -{ - return -EIO; -} - static ssize_t bad_inode_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { return -EIO; } -static int bad_inode_removexattr(struct dentry *dentry, const char *name) -{ - return -EIO; -} - static const struct inode_operations bad_inode_ops = { .create = bad_inode_create, @@ -133,7 +116,7 @@ static const struct inode_operations bad_inode_ops = .mkdir = bad_inode_mkdir, .rmdir = bad_inode_rmdir, .mknod = bad_inode_mknod, - .rename2 = bad_inode_rename2, + .rename = bad_inode_rename2, .readlink = bad_inode_readlink, /* follow_link must be no-op, otherwise unmounting this inode won't work */ @@ -142,10 +125,7 @@ static const struct inode_operations bad_inode_ops = .permission = bad_inode_permission, .getattr = bad_inode_getattr, .setattr = bad_inode_setattr, - .setxattr = bad_inode_setxattr, - .getxattr = bad_inode_getxattr, .listxattr = bad_inode_listxattr, - .removexattr = bad_inode_removexattr, }; @@ -173,8 +153,9 @@ void make_bad_inode(struct inode *inode) inode->i_mode = S_IFREG; inode->i_atime = inode->i_mtime = inode->i_ctime = - current_fs_time(inode->i_sb); + current_time(inode); inode->i_op = &bad_inode_ops; + inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &bad_file_ops; } EXPORT_SYMBOL(make_bad_inode); diff --git a/fs/befs/befs.h b/fs/befs/befs.h index e0f59263a..c6bad51 100644 --- a/fs/befs/befs.h +++ b/fs/befs/befs.h @@ -43,7 +43,10 @@ struct befs_sb_info { u32 ag_shift; u32 num_ags; - /* jornal log entry */ + /* State of the superblock */ + u32 flags; + + /* Journal log entry */ befs_block_run log_blocks; befs_off_t log_start; befs_off_t log_end; @@ -79,7 +82,7 @@ enum befs_err { BEFS_BT_END, BEFS_BT_EMPTY, BEFS_BT_MATCH, - BEFS_BT_PARMATCH, + BEFS_BT_OVERFLOW, BEFS_BT_NOT_FOUND }; @@ -140,18 +143,6 @@ befs_iaddrs_per_block(struct super_block *sb) return BEFS_SB(sb)->block_size / sizeof (befs_disk_inode_addr); } -static inline int -befs_iaddr_is_empty(const befs_inode_addr *iaddr) -{ - return (!iaddr->allocation_group) && (!iaddr->start) && (!iaddr->len); -} - -static inline size_t -befs_brun_size(struct super_block *sb, befs_block_run run) -{ - return BEFS_SB(sb)->block_size * run.len; -} - #include "endian.h" #endif /* _LINUX_BEFS_H */ diff --git a/fs/befs/btree.c b/fs/befs/btree.c index 307645f9..7e135ea 100644 --- a/fs/befs/btree.c +++ b/fs/befs/btree.c @@ -85,7 +85,7 @@ struct befs_btree_node { }; /* local constants */ -static const befs_off_t befs_bt_inval = 0xffffffffffffffffULL; +static const befs_off_t BEFS_BT_INVAL = 0xffffffffffffffffULL; /* local functions */ static int befs_btree_seekleaf(struct super_block *sb, const befs_data_stream *ds, @@ -156,8 +156,6 @@ befs_bt_read_super(struct super_block *sb, const befs_data_stream *ds, sup->max_depth = fs32_to_cpu(sb, od_sup->max_depth); sup->data_type = fs32_to_cpu(sb, od_sup->data_type); sup->root_node_ptr = fs64_to_cpu(sb, od_sup->root_node_ptr); - sup->free_node_ptr = fs64_to_cpu(sb, od_sup->free_node_ptr); - sup->max_size = fs64_to_cpu(sb, od_sup->max_size); brelse(bh); if (sup->magic != BEFS_BTREE_MAGIC) { @@ -183,8 +181,8 @@ befs_bt_read_super(struct super_block *sb, const befs_data_stream *ds, * Calls befs_read_datastream to read in the indicated btree node and * makes sure its header fields are in cpu byteorder, byteswapping if * necessary. - * Note: node->bh must be NULL when this function called first - * time. Don't forget brelse(node->bh) after last call. + * Note: node->bh must be NULL when this function is called the first time. + * Don't forget brelse(node->bh) after last call. * * On success, returns BEFS_OK and *@node contains the btree node that * starts at @node_off, with the node->head fields in cpu byte order. @@ -244,7 +242,7 @@ befs_bt_read_node(struct super_block *sb, const befs_data_stream *ds, * Read the superblock and rootnode of the b+tree. * Drill down through the interior nodes using befs_find_key(). * Once at the correct leaf node, use befs_find_key() again to get the - * actuall value stored with the key. + * actual value stored with the key. */ int befs_btree_find(struct super_block *sb, const befs_data_stream *ds, @@ -283,9 +281,9 @@ befs_btree_find(struct super_block *sb, const befs_data_stream *ds, while (!befs_leafnode(this_node)) { res = befs_find_key(sb, this_node, key, &node_off); - if (res == BEFS_BT_NOT_FOUND) + /* if no key set, try the overflow node */ + if (res == BEFS_BT_OVERFLOW) node_off = this_node->head.overflow; - /* if no match, go to overflow node */ if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) { befs_error(sb, "befs_btree_find() failed to read " "node at %llu", node_off); @@ -293,15 +291,15 @@ befs_btree_find(struct super_block *sb, const befs_data_stream *ds, } } - /* at the correct leaf node now */ - + /* at a leaf node now, check if it is correct */ res = befs_find_key(sb, this_node, key, value); brelse(this_node->bh); kfree(this_node); if (res != BEFS_BT_MATCH) { - befs_debug(sb, "<--- %s Key %s not found", __func__, key); + befs_error(sb, "<--- %s Key %s not found", __func__, key); + befs_debug(sb, "<--- %s ERROR", __func__); *value = 0; return BEFS_BT_NOT_FOUND; } @@ -324,16 +322,12 @@ befs_btree_find(struct super_block *sb, const befs_data_stream *ds, * @findkey: Keystring to search for * @value: If key is found, the value stored with the key is put here * - * finds exact match if one exists, and returns BEFS_BT_MATCH - * If no exact match, finds first key in node that is greater - * (alphabetically) than the search key and returns BEFS_BT_PARMATCH - * (for partial match, I guess). Can you think of something better to - * call it? - * - * If no key was a match or greater than the search key, return - * BEFS_BT_NOT_FOUND. + * Finds exact match if one exists, and returns BEFS_BT_MATCH. + * If there is no match and node's value array is too small for key, return + * BEFS_BT_OVERFLOW. + * If no match and node should countain this key, return BEFS_BT_NOT_FOUND. * - * Use binary search instead of a linear. + * Uses binary search instead of a linear. */ static int befs_find_key(struct super_block *sb, struct befs_btree_node *node, @@ -348,18 +342,16 @@ befs_find_key(struct super_block *sb, struct befs_btree_node *node, befs_debug(sb, "---> %s %s", __func__, findkey); - *value = 0; - findkey_len = strlen(findkey); - /* if node can not contain key, just skeep this node */ + /* if node can not contain key, just skip this node */ last = node->head.all_key_count - 1; thiskey = befs_bt_get_key(sb, node, last, &keylen); eq = befs_compare_strings(thiskey, keylen, findkey, findkey_len); if (eq < 0) { - befs_debug(sb, "<--- %s %s not found", __func__, findkey); - return BEFS_BT_NOT_FOUND; + befs_debug(sb, "<--- node can't contain %s", findkey); + return BEFS_BT_OVERFLOW; } valarray = befs_bt_valarray(node); @@ -387,12 +379,15 @@ befs_find_key(struct super_block *sb, struct befs_btree_node *node, else first = mid + 1; } + + /* return an existing value so caller can arrive to a leaf node */ if (eq < 0) *value = fs64_to_cpu(sb, valarray[mid + 1]); else *value = fs64_to_cpu(sb, valarray[mid]); - befs_debug(sb, "<--- %s found %s at %d", __func__, thiskey, mid); - return BEFS_BT_PARMATCH; + befs_error(sb, "<--- %s %s not found", __func__, findkey); + befs_debug(sb, "<--- %s ERROR", __func__); + return BEFS_BT_NOT_FOUND; } /** @@ -405,7 +400,7 @@ befs_find_key(struct super_block *sb, struct befs_btree_node *node, * @keysize: Length of the returned key * @value: Value stored with the returned key * - * Heres how it works: Key_no is the index of the key/value pair to + * Here's how it works: Key_no is the index of the key/value pair to * return in keybuf/value. * Bufsize is the size of keybuf (BEFS_NAME_LEN+1 is a good size). Keysize is * the number of characters in the key (just a convenience). @@ -422,7 +417,7 @@ befs_btree_read(struct super_block *sb, const befs_data_stream *ds, { struct befs_btree_node *this_node; befs_btree_super bt_super; - befs_off_t node_off = 0; + befs_off_t node_off; int cur_key; fs64 *valarray; char *keystart; @@ -467,7 +462,7 @@ befs_btree_read(struct super_block *sb, const befs_data_stream *ds, while (key_sum + this_node->head.all_key_count <= key_no) { /* no more nodes to look in: key_no is too large */ - if (this_node->head.right == befs_bt_inval) { + if (this_node->head.right == BEFS_BT_INVAL) { *keysize = 0; *value = 0; befs_debug(sb, @@ -541,7 +536,6 @@ befs_btree_read(struct super_block *sb, const befs_data_stream *ds, * @node_off: Pointer to offset of current node within datastream. Modified * by the function. * - * * Helper function for btree traverse. Moves the current position to the * start of the first leaf node. * @@ -608,7 +602,7 @@ static int befs_leafnode(struct befs_btree_node *node) { /* all interior nodes (and only interior nodes) have an overflow node */ - if (node->head.overflow == befs_bt_inval) + if (node->head.overflow == BEFS_BT_INVAL) return 1; else return 0; @@ -715,7 +709,7 @@ befs_bt_get_key(struct super_block *sb, struct befs_btree_node *node, * * Returns 0 if @key1 and @key2 are equal. * Returns >0 if @key1 is greater. - * Returns <0 if @key2 is greater.. + * Returns <0 if @key2 is greater. */ static int befs_compare_strings(const void *key1, int keylen1, diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c index af1bc19..b4c7ba0 100644 --- a/fs/befs/datastream.c +++ b/fs/befs/datastream.c @@ -22,22 +22,22 @@ const befs_inode_addr BAD_IADDR = { 0, 0, 0 }; static int befs_find_brun_direct(struct super_block *sb, const befs_data_stream *data, - befs_blocknr_t blockno, befs_block_run * run); + befs_blocknr_t blockno, befs_block_run *run); static int befs_find_brun_indirect(struct super_block *sb, const befs_data_stream *data, befs_blocknr_t blockno, - befs_block_run * run); + befs_block_run *run); static int befs_find_brun_dblindirect(struct super_block *sb, const befs_data_stream *data, befs_blocknr_t blockno, - befs_block_run * run); + befs_block_run *run); /** * befs_read_datastream - get buffer_head containing data, starting from pos. * @sb: Filesystem superblock - * @ds: datastrem to find data with + * @ds: datastream to find data with * @pos: start of data * @off: offset of data in buffer_head->b_data * @@ -46,7 +46,7 @@ static int befs_find_brun_dblindirect(struct super_block *sb, */ struct buffer_head * befs_read_datastream(struct super_block *sb, const befs_data_stream *ds, - befs_off_t pos, uint * off) + befs_off_t pos, uint *off) { struct buffer_head *bh; befs_block_run run; @@ -75,7 +75,13 @@ befs_read_datastream(struct super_block *sb, const befs_data_stream *ds, return bh; } -/* +/** + * befs_fblock2brun - give back block run for fblock + * @sb: the superblock + * @data: datastream to read from + * @fblock: the blocknumber with the file position to find + * @run: The found run is passed back through this pointer + * * Takes a file position and gives back a brun who's starting block * is block number fblock of the file. * @@ -88,7 +94,7 @@ befs_read_datastream(struct super_block *sb, const befs_data_stream *ds, */ int befs_fblock2brun(struct super_block *sb, const befs_data_stream *data, - befs_blocknr_t fblock, befs_block_run * run) + befs_blocknr_t fblock, befs_block_run *run) { int err; befs_off_t pos = fblock << BEFS_SB(sb)->block_shift; @@ -115,7 +121,7 @@ befs_fblock2brun(struct super_block *sb, const befs_data_stream *data, /** * befs_read_lsmylink - read long symlink from datastream. * @sb: Filesystem superblock - * @ds: Datastrem to read from + * @ds: Datastream to read from * @buff: Buffer in which to place long symlink data * @len: Length of the long symlink in bytes * @@ -128,6 +134,7 @@ befs_read_lsymlink(struct super_block *sb, const befs_data_stream *ds, befs_off_t bytes_read = 0; /* bytes readed */ u16 plen; struct buffer_head *bh; + befs_debug(sb, "---> %s length: %llu", __func__, len); while (bytes_read < len) { @@ -183,13 +190,13 @@ befs_count_blocks(struct super_block *sb, const befs_data_stream *ds) metablocks += ds->indirect.len; /* - Double indir block, plus all the indirect blocks it mapps - In the double-indirect range, all block runs of data are - BEFS_DBLINDIR_BRUN_LEN blocks long. Therefore, we know - how many data block runs are in the double-indirect region, - and from that we know how many indirect blocks it takes to - map them. We assume that the indirect blocks are also - BEFS_DBLINDIR_BRUN_LEN blocks long. + * Double indir block, plus all the indirect blocks it maps. + * In the double-indirect range, all block runs of data are + * BEFS_DBLINDIR_BRUN_LEN blocks long. Therefore, we know + * how many data block runs are in the double-indirect region, + * and from that we know how many indirect blocks it takes to + * map them. We assume that the indirect blocks are also + * BEFS_DBLINDIR_BRUN_LEN blocks long. */ if (ds->size > ds->max_indirect_range && ds->max_indirect_range != 0) { uint dbl_bytes; @@ -212,58 +219,50 @@ befs_count_blocks(struct super_block *sb, const befs_data_stream *ds) return blocks; } -/* - Finds the block run that starts at file block number blockno - in the file represented by the datastream data, if that - blockno is in the direct region of the datastream. - - sb: the superblock - data: the datastream - blockno: the blocknumber to find - run: The found run is passed back through this pointer - - Return value is BEFS_OK if the blockrun is found, BEFS_ERR - otherwise. - - Algorithm: - Linear search. Checks each element of array[] to see if it - contains the blockno-th filesystem block. This is necessary - because the block runs map variable amounts of data. Simply - keeps a count of the number of blocks searched so far (sum), - incrementing this by the length of each block run as we come - across it. Adds sum to *count before returning (this is so - you can search multiple arrays that are logicaly one array, - as in the indirect region code). - - When/if blockno is found, if blockno is inside of a block - run as stored on disk, we offset the start and length members - of the block run, so that blockno is the start and len is - still valid (the run ends in the same place). - - 2001-11-15 Will Dyson -*/ +/** + * befs_find_brun_direct - find a direct block run in the datastream + * @sb: the superblock + * @data: the datastream + * @blockno: the blocknumber to find + * @run: The found run is passed back through this pointer + * + * Finds the block run that starts at file block number blockno + * in the file represented by the datastream data, if that + * blockno is in the direct region of the datastream. + * + * Return value is BEFS_OK if the blockrun is found, BEFS_ERR + * otherwise. + * + * Algorithm: + * Linear search. Checks each element of array[] to see if it + * contains the blockno-th filesystem block. This is necessary + * because the block runs map variable amounts of data. Simply + * keeps a count of the number of blocks searched so far (sum), + * incrementing this by the length of each block run as we come + * across it. Adds sum to *count before returning (this is so + * you can search multiple arrays that are logicaly one array, + * as in the indirect region code). + * + * When/if blockno is found, if blockno is inside of a block + * run as stored on disk, we offset the start and length members + * of the block run, so that blockno is the start and len is + * still valid (the run ends in the same place). + */ static int befs_find_brun_direct(struct super_block *sb, const befs_data_stream *data, - befs_blocknr_t blockno, befs_block_run * run) + befs_blocknr_t blockno, befs_block_run *run) { int i; const befs_block_run *array = data->direct; befs_blocknr_t sum; - befs_blocknr_t max_block = - data->max_direct_range >> BEFS_SB(sb)->block_shift; befs_debug(sb, "---> %s, find %lu", __func__, (unsigned long)blockno); - if (blockno > max_block) { - befs_error(sb, "%s passed block outside of direct region", - __func__); - return BEFS_ERR; - } - for (i = 0, sum = 0; i < BEFS_NUM_DIRECT_BLOCKS; sum += array[i].len, i++) { if (blockno >= sum && blockno < sum + (array[i].len)) { int offset = blockno - sum; + run->allocation_group = array[i].allocation_group; run->start = array[i].start + offset; run->len = array[i].len - offset; @@ -275,38 +274,39 @@ befs_find_brun_direct(struct super_block *sb, const befs_data_stream *data, } } + befs_error(sb, "%s failed to find file block %lu", __func__, + (unsigned long)blockno); befs_debug(sb, "---> %s ERROR", __func__); return BEFS_ERR; } -/* - Finds the block run that starts at file block number blockno - in the file represented by the datastream data, if that - blockno is in the indirect region of the datastream. - - sb: the superblock - data: the datastream - blockno: the blocknumber to find - run: The found run is passed back through this pointer - - Return value is BEFS_OK if the blockrun is found, BEFS_ERR - otherwise. - - Algorithm: - For each block in the indirect run of the datastream, read - it in and search through it for search_blk. - - XXX: - Really should check to make sure blockno is inside indirect - region. - - 2001-11-15 Will Dyson -*/ +/** + * befs_find_brun_indirect - find a block run in the datastream + * @sb: the superblock + * @data: the datastream + * @blockno: the blocknumber to find + * @run: The found run is passed back through this pointer + * + * Finds the block run that starts at file block number blockno + * in the file represented by the datastream data, if that + * blockno is in the indirect region of the datastream. + * + * Return value is BEFS_OK if the blockrun is found, BEFS_ERR + * otherwise. + * + * Algorithm: + * For each block in the indirect run of the datastream, read + * it in and search through it for search_blk. + * + * XXX: + * Really should check to make sure blockno is inside indirect + * region. + */ static int befs_find_brun_indirect(struct super_block *sb, const befs_data_stream *data, befs_blocknr_t blockno, - befs_block_run * run) + befs_block_run *run) { int i, j; befs_blocknr_t sum = 0; @@ -326,11 +326,12 @@ befs_find_brun_indirect(struct super_block *sb, /* Examine blocks of the indirect run one at a time */ for (i = 0; i < indirect.len; i++) { - indirblock = befs_bread(sb, indirblockno + i); + indirblock = sb_bread(sb, indirblockno + i); if (indirblock == NULL) { - befs_debug(sb, "---> %s failed to read " + befs_error(sb, "---> %s failed to read " "disk block %lu from the indirect brun", __func__, (unsigned long)indirblockno + i); + befs_debug(sb, "<--- %s ERROR", __func__); return BEFS_ERR; } @@ -370,52 +371,51 @@ befs_find_brun_indirect(struct super_block *sb, return BEFS_ERR; } -/* - Finds the block run that starts at file block number blockno - in the file represented by the datastream data, if that - blockno is in the double-indirect region of the datastream. - - sb: the superblock - data: the datastream - blockno: the blocknumber to find - run: The found run is passed back through this pointer - - Return value is BEFS_OK if the blockrun is found, BEFS_ERR - otherwise. - - Algorithm: - The block runs in the double-indirect region are different. - They are always allocated 4 fs blocks at a time, so each - block run maps a constant amount of file data. This means - that we can directly calculate how many block runs into the - double-indirect region we need to go to get to the one that - maps a particular filesystem block. - - We do this in two stages. First we calculate which of the - inode addresses in the double-indirect block will point us - to the indirect block that contains the mapping for the data, - then we calculate which of the inode addresses in that - indirect block maps the data block we are after. - - Oh, and once we've done that, we actually read in the blocks - that contain the inode addresses we calculated above. Even - though the double-indirect run may be several blocks long, - we can calculate which of those blocks will contain the index - we are after and only read that one. We then follow it to - the indirect block and perform a similar process to find - the actual block run that maps the data block we are interested - in. - - Then we offset the run as in befs_find_brun_array() and we are - done. - - 2001-11-15 Will Dyson -*/ +/** + * befs_find_brun_dblindirect - find a block run in the datastream + * @sb: the superblock + * @data: the datastream + * @blockno: the blocknumber to find + * @run: The found run is passed back through this pointer + * + * Finds the block run that starts at file block number blockno + * in the file represented by the datastream data, if that + * blockno is in the double-indirect region of the datastream. + * + * Return value is BEFS_OK if the blockrun is found, BEFS_ERR + * otherwise. + * + * Algorithm: + * The block runs in the double-indirect region are different. + * They are always allocated 4 fs blocks at a time, so each + * block run maps a constant amount of file data. This means + * that we can directly calculate how many block runs into the + * double-indirect region we need to go to get to the one that + * maps a particular filesystem block. + * + * We do this in two stages. First we calculate which of the + * inode addresses in the double-indirect block will point us + * to the indirect block that contains the mapping for the data, + * then we calculate which of the inode addresses in that + * indirect block maps the data block we are after. + * + * Oh, and once we've done that, we actually read in the blocks + * that contain the inode addresses we calculated above. Even + * though the double-indirect run may be several blocks long, + * we can calculate which of those blocks will contain the index + * we are after and only read that one. We then follow it to + * the indirect block and perform a similar process to find + * the actual block run that maps the data block we are interested + * in. + * + * Then we offset the run as in befs_find_brun_array() and we are + * done. + */ static int befs_find_brun_dblindirect(struct super_block *sb, const befs_data_stream *data, befs_blocknr_t blockno, - befs_block_run * run) + befs_block_run *run) { int dblindir_indx; int indir_indx; @@ -430,10 +430,9 @@ befs_find_brun_dblindirect(struct super_block *sb, struct buffer_head *indir_block; befs_block_run indir_run; befs_disk_inode_addr *iaddr_array; - struct befs_sb_info *befs_sb = BEFS_SB(sb); befs_blocknr_t indir_start_blk = - data->max_indirect_range >> befs_sb->block_shift; + data->max_indirect_range >> BEFS_SB(sb)->block_shift; off_t dbl_indir_off = blockno - indir_start_blk; @@ -471,7 +470,7 @@ befs_find_brun_dblindirect(struct super_block *sb, } dbl_indir_block = - befs_bread(sb, iaddr2blockno(sb, &data->double_indirect) + + sb_bread(sb, iaddr2blockno(sb, &data->double_indirect) + dbl_which_block); if (dbl_indir_block == NULL) { befs_error(sb, "%s couldn't read the " @@ -479,7 +478,6 @@ befs_find_brun_dblindirect(struct super_block *sb, (unsigned long) iaddr2blockno(sb, &data->double_indirect) + dbl_which_block); - brelse(dbl_indir_block); return BEFS_ERR; } @@ -499,12 +497,11 @@ befs_find_brun_dblindirect(struct super_block *sb, } indir_block = - befs_bread(sb, iaddr2blockno(sb, &indir_run) + which_block); + sb_bread(sb, iaddr2blockno(sb, &indir_run) + which_block); if (indir_block == NULL) { befs_error(sb, "%s couldn't read the indirect block " "at blockno %lu", __func__, (unsigned long) iaddr2blockno(sb, &indir_run) + which_block); - brelse(indir_block); return BEFS_ERR; } diff --git a/fs/befs/debug.c b/fs/befs/debug.c index 4de7cff..85c1339 100644 --- a/fs/befs/debug.c +++ b/fs/befs/debug.c @@ -169,6 +169,7 @@ befs_dump_super_block(const struct super_block *sb, befs_super_block * sup) befs_debug(sb, " num_blocks %llu", fs64_to_cpu(sb, sup->num_blocks)); befs_debug(sb, " used_blocks %llu", fs64_to_cpu(sb, sup->used_blocks)); + befs_debug(sb, " inode_size %u", fs32_to_cpu(sb, sup->inode_size)); befs_debug(sb, " magic2 %08x", fs32_to_cpu(sb, sup->magic2)); befs_debug(sb, " blocks_per_ag %u", diff --git a/fs/befs/io.c b/fs/befs/io.c index 523c8af..b4a5581 100644 --- a/fs/befs/io.c +++ b/fs/befs/io.c @@ -27,7 +27,7 @@ struct buffer_head * befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr) { struct buffer_head *bh; - befs_blocknr_t block = 0; + befs_blocknr_t block; struct befs_sb_info *befs_sb = BEFS_SB(sb); befs_debug(sb, "---> Enter %s " @@ -59,27 +59,3 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr) befs_debug(sb, "<--- %s ERROR", __func__); return NULL; } - -struct buffer_head * -befs_bread(struct super_block *sb, befs_blocknr_t block) -{ - struct buffer_head *bh; - - befs_debug(sb, "---> Enter %s %lu", __func__, (unsigned long)block); - - bh = sb_bread(sb, block); - - if (bh == NULL) { - befs_error(sb, "Failed to read block %lu", - (unsigned long)block); - goto error; - } - - befs_debug(sb, "<--- %s", __func__); - - return bh; - - error: - befs_debug(sb, "<--- %s ERROR", __func__); - return NULL; -} diff --git a/fs/befs/io.h b/fs/befs/io.h index 9b78266..78d7bc6 100644 --- a/fs/befs/io.h +++ b/fs/befs/io.h @@ -5,5 +5,3 @@ struct buffer_head *befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr); -struct buffer_head *befs_bread(struct super_block *sb, befs_blocknr_t block); - diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 7da05b1..647a276 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -120,7 +120,7 @@ befs_get_block(struct inode *inode, sector_t block, struct super_block *sb = inode->i_sb; befs_data_stream *ds = &BEFS_I(inode)->i_data.ds; befs_block_run run = BAD_IADDR; - int res = 0; + int res; ulong disk_off; befs_debug(sb, "---> befs_get_block() for inode %lu, block %ld", @@ -179,15 +179,16 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) kfree(utfname); } else { - ret = befs_btree_find(sb, ds, dentry->d_name.name, &offset); + ret = befs_btree_find(sb, ds, name, &offset); } if (ret == BEFS_BT_NOT_FOUND) { befs_debug(sb, "<--- %s %pd not found", __func__, dentry); + d_add(dentry, NULL); return ERR_PTR(-ENOENT); } else if (ret != BEFS_OK || offset == 0) { - befs_warning(sb, "<--- %s Error", __func__); + befs_error(sb, "<--- %s Error", __func__); return ERR_PTR(-ENODATA); } @@ -211,56 +212,55 @@ befs_readdir(struct file *file, struct dir_context *ctx) befs_off_t value; int result; size_t keysize; - unsigned char d_type; char keybuf[BEFS_NAME_LEN + 1]; befs_debug(sb, "---> %s name %pD, inode %ld, ctx->pos %lld", __func__, file, inode->i_ino, ctx->pos); -more: - result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1, - keybuf, &keysize, &value); + while (1) { + result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1, + keybuf, &keysize, &value); - if (result == BEFS_ERR) { - befs_debug(sb, "<--- %s ERROR", __func__); - befs_error(sb, "IO error reading %pD (inode %lu)", - file, inode->i_ino); - return -EIO; - - } else if (result == BEFS_BT_END) { - befs_debug(sb, "<--- %s END", __func__); - return 0; - - } else if (result == BEFS_BT_EMPTY) { - befs_debug(sb, "<--- %s Empty directory", __func__); - return 0; - } + if (result == BEFS_ERR) { + befs_debug(sb, "<--- %s ERROR", __func__); + befs_error(sb, "IO error reading %pD (inode %lu)", + file, inode->i_ino); + return -EIO; - d_type = DT_UNKNOWN; + } else if (result == BEFS_BT_END) { + befs_debug(sb, "<--- %s END", __func__); + return 0; - /* Convert to NLS */ - if (BEFS_SB(sb)->nls) { - char *nlsname; - int nlsnamelen; - result = - befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen); - if (result < 0) { - befs_debug(sb, "<--- %s ERROR", __func__); - return result; + } else if (result == BEFS_BT_EMPTY) { + befs_debug(sb, "<--- %s Empty directory", __func__); + return 0; } - if (!dir_emit(ctx, nlsname, nlsnamelen, - (ino_t) value, d_type)) { + + /* Convert to NLS */ + if (BEFS_SB(sb)->nls) { + char *nlsname; + int nlsnamelen; + + result = + befs_utf2nls(sb, keybuf, keysize, &nlsname, + &nlsnamelen); + if (result < 0) { + befs_debug(sb, "<--- %s ERROR", __func__); + return result; + } + if (!dir_emit(ctx, nlsname, nlsnamelen, + (ino_t) value, DT_UNKNOWN)) { + kfree(nlsname); + return 0; + } kfree(nlsname); - return 0; + } else { + if (!dir_emit(ctx, keybuf, keysize, + (ino_t) value, DT_UNKNOWN)) + return 0; } - kfree(nlsname); - } else { - if (!dir_emit(ctx, keybuf, keysize, - (ino_t) value, d_type)) - return 0; + ctx->pos++; } - ctx->pos++; - goto more; } static struct inode * @@ -299,7 +299,6 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) struct befs_sb_info *befs_sb = BEFS_SB(sb); struct befs_inode_info *befs_ino; struct inode *inode; - long ret = -EIO; befs_debug(sb, "---> %s inode = %lu", __func__, ino); @@ -318,7 +317,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) befs_ino->i_inode_num.allocation_group, befs_ino->i_inode_num.start, befs_ino->i_inode_num.len); - bh = befs_bread(sb, inode->i_ino); + bh = sb_bread(sb, inode->i_ino); if (!bh) { befs_error(sb, "unable to read inode block - " "inode = %lu", inode->i_ino); @@ -421,7 +420,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) unacquire_none: iget_failed(inode); befs_debug(sb, "<--- %s - Bad inode", __func__); - return ERR_PTR(ret); + return ERR_PTR(-EIO); } /* Initialize the inode cache. Called at fs setup. @@ -436,10 +435,9 @@ befs_init_inodecache(void) 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); - if (befs_inode_cachep == NULL) { - pr_err("%s: Couldn't initialize inode slabcache\n", __func__); + if (befs_inode_cachep == NULL) return -ENOMEM; - } + return 0; } @@ -524,8 +522,6 @@ befs_utf2nls(struct super_block *sb, const char *in, *out = result = kmalloc(maxlen, GFP_NOFS); if (!*out) { - befs_error(sb, "%s cannot allocate memory", __func__); - *out_len = 0; return -ENOMEM; } @@ -604,7 +600,6 @@ befs_nls2utf(struct super_block *sb, const char *in, *out = result = kmalloc(maxlen, GFP_NOFS); if (!*out) { - befs_error(sb, "%s cannot allocate memory", __func__); *out_len = 0; return -ENOMEM; } @@ -637,10 +632,6 @@ befs_nls2utf(struct super_block *sb, const char *in, return -EILSEQ; } -/** - * Use the - * - */ enum { Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err, }; @@ -760,19 +751,19 @@ befs_fill_super(struct super_block *sb, void *data, int silent) long ret = -EINVAL; const unsigned long sb_block = 0; const off_t x86_sb_off = 512; + int blocksize; save_mount_options(sb, data); sb->s_fs_info = kzalloc(sizeof(*befs_sb), GFP_KERNEL); - if (sb->s_fs_info == NULL) { - pr_err("(%s): Unable to allocate memory for private " - "portion of superblock. Bailing.\n", sb->s_id); + if (sb->s_fs_info == NULL) goto unacquire_none; - } + befs_sb = BEFS_SB(sb); if (!parse_options((char *) data, &befs_sb->mount_opts)) { - befs_error(sb, "cannot parse mount options"); + if (!silent) + befs_error(sb, "cannot parse mount options"); goto unacquire_priv_sbp; } @@ -789,14 +780,20 @@ befs_fill_super(struct super_block *sb, void *data, int silent) * Will be set to real fs blocksize later. * * Linux 2.4.10 and later refuse to read blocks smaller than - * the hardsect size for the device. But we also need to read at + * the logical block size for the device. But we also need to read at * least 1k to get the second 512 bytes of the volume. * -WD 10-26-01 */ - sb_min_blocksize(sb, 1024); + blocksize = sb_min_blocksize(sb, 1024); + if (!blocksize) { + if (!silent) + befs_error(sb, "unable to set blocksize"); + goto unacquire_priv_sbp; + } if (!(bh = sb_bread(sb, sb_block))) { - befs_error(sb, "unable to read superblock"); + if (!silent) + befs_error(sb, "unable to read superblock"); goto unacquire_priv_sbp; } @@ -820,9 +817,9 @@ befs_fill_super(struct super_block *sb, void *data, int silent) brelse(bh); if( befs_sb->num_blocks > ~((sector_t)0) ) { - befs_error(sb, "blocks count: %llu " - "is larger than the host can use", - befs_sb->num_blocks); + if (!silent) + befs_error(sb, "blocks count: %llu is larger than the host can use", + befs_sb->num_blocks); goto unacquire_priv_sbp; } @@ -841,7 +838,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent) } sb->s_root = d_make_root(root); if (!sb->s_root) { - befs_error(sb, "get root inode failed"); + if (!silent) + befs_error(sb, "get root inode failed"); goto unacquire_priv_sbp; } @@ -870,9 +868,9 @@ befs_fill_super(struct super_block *sb, void *data, int silent) unacquire_priv_sbp: kfree(befs_sb->mount_opts.iocharset); kfree(sb->s_fs_info); + sb->s_fs_info = NULL; unacquire_none: - sb->s_fs_info = NULL; return ret; } diff --git a/fs/befs/super.c b/fs/befs/super.c index aeafc4d..7c50025 100644 --- a/fs/befs/super.c +++ b/fs/befs/super.c @@ -13,24 +13,20 @@ #include "befs.h" #include "super.h" -/** - * load_befs_sb -- Read from disk and properly byteswap all the fields +/* + * befs_load_sb -- Read from disk and properly byteswap all the fields * of the befs superblock - * - * - * - * */ int -befs_load_sb(struct super_block *sb, befs_super_block * disk_sb) +befs_load_sb(struct super_block *sb, befs_super_block *disk_sb) { struct befs_sb_info *befs_sb = BEFS_SB(sb); /* Check the byte order of the filesystem */ if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_LE) - befs_sb->byte_order = BEFS_BYTESEX_LE; + befs_sb->byte_order = BEFS_BYTESEX_LE; else if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_BE) - befs_sb->byte_order = BEFS_BYTESEX_BE; + befs_sb->byte_order = BEFS_BYTESEX_BE; befs_sb->magic1 = fs32_to_cpu(sb, disk_sb->magic1); befs_sb->magic2 = fs32_to_cpu(sb, disk_sb->magic2); @@ -45,6 +41,8 @@ befs_load_sb(struct super_block *sb, befs_super_block * disk_sb) befs_sb->ag_shift = fs32_to_cpu(sb, disk_sb->ag_shift); befs_sb->num_ags = fs32_to_cpu(sb, disk_sb->num_ags); + befs_sb->flags = fs32_to_cpu(sb, disk_sb->flags); + befs_sb->log_blocks = fsrun_to_cpu(sb, disk_sb->log_blocks); befs_sb->log_start = fs64_to_cpu(sb, disk_sb->log_start); befs_sb->log_end = fs64_to_cpu(sb, disk_sb->log_end); @@ -84,15 +82,15 @@ befs_check_sb(struct super_block *sb) } if (befs_sb->block_size > PAGE_SIZE) { - befs_error(sb, "blocksize(%u) cannot be larger" + befs_error(sb, "blocksize(%u) cannot be larger " "than system pagesize(%lu)", befs_sb->block_size, PAGE_SIZE); return BEFS_ERR; } /* - * block_shift and block_size encode the same information - * in different ways as a consistency check. + * block_shift and block_size encode the same information + * in different ways as a consistency check. */ if ((1 << befs_sb->block_shift) != befs_sb->block_size) { @@ -101,10 +99,18 @@ befs_check_sb(struct super_block *sb) return BEFS_ERR; } - if (befs_sb->log_start != befs_sb->log_end) { + + /* ag_shift also encodes the same information as blocks_per_ag in a + * different way, non-fatal consistency check + */ + if ((1 << befs_sb->ag_shift) != befs_sb->blocks_per_ag) + befs_error(sb, "ag_shift disagrees with blocks_per_ag."); + + if (befs_sb->log_start != befs_sb->log_end || + befs_sb->flags == BEFS_DIRTY) { befs_error(sb, "Filesystem not clean! There are blocks in the " - "journal. You must boot into BeOS and mount this volume " - "to make it clean."); + "journal. You must boot into BeOS and mount this " + "volume to make it clean."); return BEFS_ERR; } diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 34a5bc2..3e5ac30 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -97,7 +97,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, set_bit(ino, info->si_imap); info->si_freei--; inode_init_owner(inode, dir, mode); - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_blocks = 0; inode->i_op = &bfs_file_inops; inode->i_fop = &bfs_file_operations; @@ -165,7 +165,7 @@ static int bfs_link(struct dentry *old, struct inode *dir, return err; } inc_nlink(inode); - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); mark_inode_dirty(inode); ihold(inode); d_instantiate(new, inode); @@ -194,7 +194,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry) } de->ino = 0; mark_buffer_dirty_inode(bh, dir); - dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + dir->i_ctime = dir->i_mtime = current_time(dir); mark_inode_dirty(dir); inode->i_ctime = dir->i_ctime; inode_dec_link_count(inode); @@ -207,7 +207,8 @@ out_brelse: } static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct inode *old_inode, *new_inode; struct buffer_head *old_bh, *new_bh; @@ -215,6 +216,9 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct bfs_sb_info *info; int error = -ENOENT; + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + old_bh = new_bh = NULL; old_inode = d_inode(old_dentry); if (S_ISDIR(old_inode->i_mode)) @@ -249,10 +253,10 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto end_rename; } old_de->ino = 0; - old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; + old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); mark_inode_dirty(old_dir); if (new_inode) { - new_inode->i_ctime = CURRENT_TIME_SEC; + new_inode->i_ctime = current_time(new_inode); inode_dec_link_count(new_inode); } mark_buffer_dirty_inode(old_bh, old_dir); @@ -300,9 +304,9 @@ static int bfs_add_entry(struct inode *dir, const unsigned char *name, pos = (block - sblock) * BFS_BSIZE + off; if (pos >= dir->i_size) { dir->i_size += BFS_DIRENT_SIZE; - dir->i_ctime = CURRENT_TIME_SEC; + dir->i_ctime = current_time(dir); } - dir->i_mtime = CURRENT_TIME_SEC; + dir->i_mtime = current_time(dir); mark_inode_dirty(dir); de->ino = cpu_to_le16((u16)ino); for (i = 0; i < BFS_NAMELEN; i++) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index e5495f3..2472af2 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1624,20 +1624,12 @@ static void do_thread_regset_writeback(struct task_struct *task, regset->writeback(task, regset, 1); } -#ifndef PR_REG_SIZE -#define PR_REG_SIZE(S) sizeof(S) -#endif - #ifndef PRSTATUS_SIZE -#define PRSTATUS_SIZE(S) sizeof(S) -#endif - -#ifndef PR_REG_PTR -#define PR_REG_PTR(S) (&((S)->pr_reg)) +#define PRSTATUS_SIZE(S, R) sizeof(S) #endif #ifndef SET_PR_FPVALID -#define SET_PR_FPVALID(S, V) ((S)->pr_fpvalid = (V)) +#define SET_PR_FPVALID(S, V, R) ((S)->pr_fpvalid = (V)) #endif static int fill_thread_core_info(struct elf_thread_core_info *t, @@ -1645,6 +1637,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, long signr, size_t *total) { unsigned int i; + unsigned int regset_size = view->regsets[0].n * view->regsets[0].size; /* * NT_PRSTATUS is the one special case, because the regset data @@ -1653,12 +1646,11 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, * We assume that regset 0 is NT_PRSTATUS. */ fill_prstatus(&t->prstatus, t->task, signr); - (void) view->regsets[0].get(t->task, &view->regsets[0], - 0, PR_REG_SIZE(t->prstatus.pr_reg), - PR_REG_PTR(&t->prstatus), NULL); + (void) view->regsets[0].get(t->task, &view->regsets[0], 0, regset_size, + &t->prstatus.pr_reg, NULL); fill_note(&t->notes[0], "CORE", NT_PRSTATUS, - PRSTATUS_SIZE(t->prstatus), &t->prstatus); + PRSTATUS_SIZE(t->prstatus, regset_size), &t->prstatus); *total += notesize(&t->notes[0]); do_thread_regset_writeback(t->task, &view->regsets[0]); @@ -1688,7 +1680,8 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, regset->core_note_type, size, data); else { - SET_PR_FPVALID(&t->prstatus, 1); + SET_PR_FPVALID(&t->prstatus, + 1, regset_size); fill_note(&t->notes[i], "CORE", NT_PRFPREG, size, data); } diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 6103a63..9b4688a 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -584,7 +584,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode) inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_atime = inode->i_mtime = inode->i_ctime = - current_fs_time(inode->i_sb); + current_time(inode); } return inode; } diff --git a/fs/block_dev.c b/fs/block_dev.c index 08ae993..05b5533 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -30,6 +30,7 @@ #include <linux/cleancache.h> #include <linux/dax.h> #include <linux/badblocks.h> +#include <linux/falloc.h> #include <asm/uaccess.h> #include "internal.h" @@ -180,9 +181,6 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) struct file *file = iocb->ki_filp; struct inode *inode = bdev_file_inode(file); - if (IS_DAX(inode)) - return dax_do_io(iocb, inode, iter, blkdev_get_block, - NULL, DIO_SKIP_DIO_COUNT); return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, blkdev_get_block, NULL, NULL, DIO_SKIP_DIO_COUNT); @@ -302,14 +300,11 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb) error = sb->s_op->thaw_super(sb); else error = thaw_super(sb); - if (error) { + if (error) bdev->bd_fsfreeze_count++; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return error; - } out: mutex_unlock(&bdev->bd_fsfreeze_mutex); - return 0; + return error; } EXPORT_SYMBOL(thaw_bdev); @@ -1275,7 +1270,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; - bdev->bd_inode->i_flags = 0; if (!partno) { ret = -ENXIO; @@ -1303,11 +1297,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } } - if (!ret) { + if (!ret) bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); - if (!bdev_dax_capable(bdev)) - bdev->bd_inode->i_flags &= ~S_DAX; - } /* * If the device is invalidated, rescan partition @@ -1342,8 +1333,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_clear; } bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); - if (!bdev_dax_capable(bdev)) - bdev->bd_inode->i_flags &= ~S_DAX; } } else { if (bdev->bd_contains == bdev) { @@ -1787,6 +1776,81 @@ static const struct address_space_operations def_blk_aops = { .is_dirty_writeback = buffer_check_dirty_writeback, }; +#define BLKDEV_FALLOC_FL_SUPPORTED \ + (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ + FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE) + +static long blkdev_fallocate(struct file *file, int mode, loff_t start, + loff_t len) +{ + struct block_device *bdev = I_BDEV(bdev_file_inode(file)); + struct request_queue *q = bdev_get_queue(bdev); + struct address_space *mapping; + loff_t end = start + len - 1; + loff_t isize; + int error; + + /* Fail if we don't recognize the flags. */ + if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED) + return -EOPNOTSUPP; + + /* Don't go off the end of the device. */ + isize = i_size_read(bdev->bd_inode); + if (start >= isize) + return -EINVAL; + if (end >= isize) { + if (mode & FALLOC_FL_KEEP_SIZE) { + len = isize - start; + end = start + len - 1; + } else + return -EINVAL; + } + + /* + * Don't allow IO that isn't aligned to logical block size. + */ + if ((start | len) & (bdev_logical_block_size(bdev) - 1)) + return -EINVAL; + + /* Invalidate the page cache, including dirty pages. */ + mapping = bdev->bd_inode->i_mapping; + truncate_inode_pages_range(mapping, start, end); + + switch (mode) { + case FALLOC_FL_ZERO_RANGE: + case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: + error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, + GFP_KERNEL, false); + break; + case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: + /* Only punch if the device can do zeroing discard. */ + if (!blk_queue_discard(q) || !q->limits.discard_zeroes_data) + return -EOPNOTSUPP; + error = blkdev_issue_discard(bdev, start >> 9, len >> 9, + GFP_KERNEL, 0); + break; + case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE: + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + error = blkdev_issue_discard(bdev, start >> 9, len >> 9, + GFP_KERNEL, 0); + break; + default: + return -EOPNOTSUPP; + } + if (error) + return error; + + /* + * Invalidate again; if someone wandered in and dirtied a page, + * the caller will be given -EBUSY. The third argument is + * inclusive, so the rounding here is safe. + */ + return invalidate_inode_pages2_range(mapping, + start >> PAGE_SHIFT, + end >> PAGE_SHIFT); +} + const struct file_operations def_blk_fops = { .open = blkdev_open, .release = blkdev_close, @@ -1801,6 +1865,7 @@ const struct file_operations def_blk_fops = { #endif .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, + .fallocate = blkdev_fallocate, }; int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 53bb7af..247b8df 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -79,11 +79,9 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { - ret = posix_acl_equiv_mode(acl, &inode->i_mode); - if (ret < 0) + ret = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (ret) return ret; - if (ret == 0) - acl = NULL; } ret = 0; break; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index ccc70d9..d4d8b7e 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -698,7 +698,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(root, comp_bio, mirror_num, 0); if (ret) { - bio->bi_error = ret; + comp_bio->bi_error = ret; bio_endio(comp_bio); } @@ -728,7 +728,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(root, comp_bio, mirror_num, 0); if (ret) { - bio->bi_error = ret; + comp_bio->bi_error = ret; bio_endio(comp_bio); } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9d8edcb..0b8ce2b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3181,7 +3181,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); -int btrfs_inode_check_errors(struct inode *inode); extern const struct dentry_operations btrfs_dentry_operations; #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_inode_set_ops(struct inode *inode); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 72a180d..3a14c87 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1782,7 +1782,7 @@ static void update_time_for_write(struct inode *inode) if (IS_NOCMTIME(inode)) return; - now = current_fs_time(inode->i_sb); + now = current_time(inode); if (!timespec_equal(&inode->i_mtime, &now)) inode->i_mtime = now; @@ -2065,7 +2065,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * flags for any errors that might have happened while doing * writeback of file data. */ - ret = btrfs_inode_check_errors(inode); + ret = filemap_check_errors(inode->i_mapping); inode_unlock(inode); goto out; } @@ -2603,7 +2603,7 @@ out_trans: goto out_free; inode_inc_iversion(inode); - inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); + inode->i_mtime = inode->i_ctime = current_time(inode); trans->block_rsv = &root->fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); @@ -2867,7 +2867,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (IS_ERR(trans)) { ret = PTR_ERR(trans); } else { - inode->i_ctime = current_fs_time(inode->i_sb); + inode->i_ctime = current_time(inode); i_size_write(inode, actual_end); btrfs_ordered_update_i_size(inode, actual_end, NULL); ret = btrfs_update_inode(trans, root, inode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 50ba4ca..2b790bd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4071,7 +4071,7 @@ err: inode_inc_iversion(inode); inode_inc_iversion(dir); inode->i_ctime = dir->i_mtime = - dir->i_ctime = current_fs_time(inode->i_sb); + dir->i_ctime = current_time(inode); ret = btrfs_update_inode(trans, root, dir); out: return ret; @@ -4214,7 +4214,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_i_size_write(dir, dir->i_size - name_len * 2); inode_inc_iversion(dir); - dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb); + dir->i_mtime = dir->i_ctime = current_time(dir); ret = btrfs_update_inode_fallback(trans, root, dir); if (ret) btrfs_abort_transaction(trans, ret); @@ -4977,7 +4977,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) inode_inc_iversion(inode); if (!(mask & (ATTR_CTIME | ATTR_MTIME))) inode->i_ctime = inode->i_mtime = - current_fs_time(inode->i_sb); + current_time(inode); } if (newsize > oldsize) { @@ -5084,7 +5084,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) if (btrfs_root_readonly(root)) return -EROFS; - err = inode_change_ok(inode, attr); + err = setattr_prepare(dentry, attr); if (err) return err; @@ -5684,7 +5684,7 @@ static struct inode *new_simple_dir(struct super_block *s, inode->i_op = &btrfs_dir_ro_inode_operations; inode->i_fop = &simple_dir_operations; inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; - inode->i_mtime = current_fs_time(inode->i_sb); + inode->i_mtime = current_time(inode); inode->i_atime = inode->i_mtime; inode->i_ctime = inode->i_mtime; BTRFS_I(inode)->i_otime = inode->i_mtime; @@ -6270,7 +6270,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, inode_init_owner(inode, dir, mode); inode_set_bytes(inode, 0); - inode->i_mtime = current_fs_time(inode->i_sb); + inode->i_mtime = current_time(inode); inode->i_atime = inode->i_mtime; inode->i_ctime = inode->i_mtime; BTRFS_I(inode)->i_otime = inode->i_mtime; @@ -6384,7 +6384,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, name_len * 2); inode_inc_iversion(parent_inode); parent_inode->i_mtime = parent_inode->i_ctime = - current_fs_time(parent_inode->i_sb); + current_time(parent_inode); ret = btrfs_update_inode(trans, root, parent_inode); if (ret) btrfs_abort_transaction(trans, ret); @@ -6602,7 +6602,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, BTRFS_I(inode)->dir_index = 0ULL; inc_nlink(inode); inode_inc_iversion(inode); - inode->i_ctime = current_fs_time(inode->i_sb); + inode->i_ctime = current_time(inode); ihold(inode); set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); @@ -8427,7 +8427,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, if (!bio) return -ENOMEM; - bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf); + bio_set_op_attrs(bio, bio_op(orig_bio), bio_flags(orig_bio)); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; @@ -8465,7 +8465,8 @@ next_block: start_sector, GFP_NOFS); if (!bio) goto out_err; - bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf); + bio_set_op_attrs(bio, bio_op(orig_bio), + bio_flags(orig_bio)); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; @@ -8633,7 +8634,7 @@ static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb, goto out; /* If this is a write we don't need to check anymore */ - if (iov_iter_rw(iter) == WRITE) + if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter)) return 0; /* * Check to make sure we don't have duplicate iov_base's in this @@ -9508,7 +9509,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = new_dentry->d_inode; struct inode *old_inode = old_dentry->d_inode; - struct timespec ctime = CURRENT_TIME; + struct timespec ctime = current_time(old_inode); struct dentry *parent; u64 old_ino = btrfs_ino(old_inode); u64 new_ino = btrfs_ino(new_inode); @@ -9876,7 +9877,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, inode_inc_iversion(old_inode); old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime = new_dir->i_mtime = - old_inode->i_ctime = current_fs_time(old_dir->i_sb); + old_inode->i_ctime = current_time(old_dir); if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); @@ -9901,7 +9902,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_inode) { inode_inc_iversion(new_inode); - new_inode->i_ctime = current_fs_time(new_inode->i_sb); + new_inode->i_ctime = current_time(new_inode); if (unlikely(btrfs_ino(new_inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { root_objectid = BTRFS_I(new_inode)->location.objectid; @@ -10419,7 +10420,7 @@ next: *alloc_hint = ins.objectid + ins.offset; inode_inc_iversion(inode); - inode->i_ctime = current_fs_time(inode->i_sb); + inode->i_ctime = current_time(inode); BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; if (!(mode & FALLOC_FL_KEEP_SIZE) && (actual_len > inode->i_size) && @@ -10559,21 +10560,6 @@ out_inode: } -/* Inspired by filemap_check_errors() */ -int btrfs_inode_check_errors(struct inode *inode) -{ - int ret = 0; - - if (test_bit(AS_ENOSPC, &inode->i_mapping->flags) && - test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags)) - ret = -ENOSPC; - if (test_bit(AS_EIO, &inode->i_mapping->flags) && - test_and_clear_bit(AS_EIO, &inode->i_mapping->flags)) - ret = -EIO; - - return ret; -} - static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, @@ -10582,14 +10568,11 @@ static const struct inode_operations btrfs_dir_inode_operations = { .link = btrfs_link, .mkdir = btrfs_mkdir, .rmdir = btrfs_rmdir, - .rename2 = btrfs_rename2, + .rename = btrfs_rename2, .symlink = btrfs_symlink, .setattr = btrfs_setattr, .mknod = btrfs_mknod, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, - .removexattr = generic_removexattr, .permission = btrfs_permission, .get_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, @@ -10663,10 +10646,7 @@ static const struct address_space_operations btrfs_symlink_aops = { static const struct inode_operations btrfs_file_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, - .removexattr = generic_removexattr, .permission = btrfs_permission, .fiemap = btrfs_fiemap, .get_acl = btrfs_get_acl, @@ -10677,10 +10657,7 @@ static const struct inode_operations btrfs_special_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, .permission = btrfs_permission, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, - .removexattr = generic_removexattr, .get_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, @@ -10691,10 +10668,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, .permission = btrfs_permission, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, - .removexattr = generic_removexattr, .update_time = btrfs_update_time, }; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index af69129..18e1aa0f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -349,7 +349,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) btrfs_update_iflags(inode); inode_inc_iversion(inode); - inode->i_ctime = current_fs_time(inode->i_sb); + inode->i_ctime = current_time(inode); ret = btrfs_update_inode(trans, root, inode); btrfs_end_transaction(trans, root); @@ -445,7 +445,7 @@ static noinline int create_subvol(struct inode *dir, struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *new_root; struct btrfs_block_rsv block_rsv; - struct timespec cur_time = current_fs_time(dir->i_sb); + struct timespec cur_time = current_time(dir); struct inode *inode; int ret; int err; @@ -3292,7 +3292,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, inode_inc_iversion(inode); if (!no_time_update) - inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); + inode->i_mtime = inode->i_ctime = current_time(inode); /* * We round up to the block size at eof when determining which * extents to clone above, but shouldn't round up the file size. @@ -5107,7 +5107,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root_item *root_item = &root->root_item; struct btrfs_trans_handle *trans; - struct timespec ct = current_fs_time(inode->i_sb); + struct timespec ct = current_time(inode); int ret = 0; int received_uuid_changed; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 0b46289..71261b4 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4335,7 +4335,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, int ret; struct send_ctx *sctx = ctx; struct fs_path *p; - posix_acl_xattr_header dummy_acl; + struct posix_acl_xattr_header dummy_acl; p = fs_path_alloc(); if (!p) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e66a18e..9517de0 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1470,7 +1470,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, parent_root = BTRFS_I(parent_inode)->root; record_root_in_trans(trans, parent_root, 0); - cur_time = current_fs_time(parent_inode->i_sb); + cur_time = current_time(parent_inode); /* * insert the directory item @@ -1626,7 +1626,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_i_size_write(parent_inode, parent_inode->i_size + dentry->d_name.len * 2); parent_inode->i_mtime = parent_inode->i_ctime = - current_fs_time(parent_inode->i_sb); + current_time(parent_inode); ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode); if (ret) { btrfs_abort_transaction(trans, ret); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4b1c0a6..3d33c4e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3953,7 +3953,7 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans, * i_mapping flags, so that the next fsync won't get * an outdated io error too. */ - btrfs_inode_check_errors(inode); + filemap_check_errors(inode->i_mapping); *ordered_io_error = true; break; } @@ -4190,7 +4190,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, * without writing to the log tree and the fsync must report the * file data write error and not commit the current transaction. */ - ret = btrfs_inode_check_errors(inode); + ret = filemap_check_errors(inode->i_mapping); if (ret) ctx->io_err = ret; process: diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index d1a177a..fccbf55 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -252,7 +252,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, goto out; inode_inc_iversion(inode); - inode->i_ctime = current_fs_time(inode->i_sb); + inode->i_ctime = current_time(inode); set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); diff --git a/fs/buffer.c b/fs/buffer.c index 9c8eb9b..b205a62 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -351,7 +351,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) set_buffer_uptodate(bh); } else { buffer_io_error(bh, ", lost async page write"); - set_bit(AS_EIO, &page->mapping->flags); + mapping_set_error(page->mapping, -EIO); set_buffer_write_io_error(bh); clear_buffer_uptodate(bh); SetPageError(page); @@ -1078,7 +1078,7 @@ grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp) return grow_dev_page(bdev, block, index, size, sizebits, gfp); } -struct buffer_head * +static struct buffer_head * __getblk_slow(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp) { @@ -1109,7 +1109,6 @@ __getblk_slow(struct block_device *bdev, sector_t block, free_more_memory(); } } -EXPORT_SYMBOL(__getblk_slow); /* * The relationship between dirty buffers and dirty pages: @@ -3250,7 +3249,7 @@ drop_buffers(struct page *page, struct buffer_head **buffers_to_free) bh = head; do { if (buffer_write_io_error(bh) && page->mapping) - set_bit(AS_EIO, &page->mapping->flags); + mapping_set_error(page->mapping, -EIO); if (buffer_busy(bh)) goto failed; bh = bh->b_this_page; diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c index 6af790f..3ff867f 100644 --- a/fs/cachefiles/bind.c +++ b/fs/cachefiles/bind.c @@ -20,6 +20,7 @@ #include <linux/mount.h> #include <linux/statfs.h> #include <linux/ctype.h> +#include <linux/xattr.h> #include "internal.h" static int cachefiles_daemon_add_cache(struct cachefiles_cache *caches); @@ -126,8 +127,7 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) if (d_is_negative(root) || !d_backing_inode(root)->i_op->lookup || !d_backing_inode(root)->i_op->mkdir || - !d_backing_inode(root)->i_op->setxattr || - !d_backing_inode(root)->i_op->getxattr || + !(d_backing_inode(root)->i_opflags & IOP_XATTR) || !root->d_sb->s_op->statfs || !root->d_sb->s_op->sync_fs) goto error_unsupported; diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index ce5f345..e7f16a7 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -253,6 +253,8 @@ static void cachefiles_drop_object(struct fscache_object *_object) struct cachefiles_object *object; struct cachefiles_cache *cache; const struct cred *saved_cred; + struct inode *inode; + blkcnt_t i_blocks = 0; ASSERT(_object); @@ -279,6 +281,10 @@ static void cachefiles_drop_object(struct fscache_object *_object) _object != cache->cache.fsdef ) { _debug("- retire object OBJ%x", object->fscache.debug_id); + inode = d_backing_inode(object->dentry); + if (inode) + i_blocks = inode->i_blocks; + cachefiles_begin_secure(cache, &saved_cred); cachefiles_delete_object(cache, object); cachefiles_end_secure(cache, saved_cred); @@ -292,7 +298,7 @@ static void cachefiles_drop_object(struct fscache_object *_object) /* note that the object is now inactive */ if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) - cachefiles_mark_object_inactive(cache, object); + cachefiles_mark_object_inactive(cache, object, i_blocks); dput(object->dentry); object->dentry = NULL; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 2fcde1a..cd1effe 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -160,7 +160,8 @@ extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type); * namei.c */ extern void cachefiles_mark_object_inactive(struct cachefiles_cache *cache, - struct cachefiles_object *object); + struct cachefiles_object *object, + blkcnt_t i_blocks); extern int cachefiles_delete_object(struct cachefiles_cache *cache, struct cachefiles_object *object); extern int cachefiles_walk_to_object(struct cachefiles_object *parent, diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 3f7c2cd..41df8a2 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -20,6 +20,7 @@ #include <linux/namei.h> #include <linux/security.h> #include <linux/slab.h> +#include <linux/xattr.h> #include "internal.h" #define CACHEFILES_KEYBUF_SIZE 512 @@ -261,10 +262,9 @@ requeue: * Mark an object as being inactive. */ void cachefiles_mark_object_inactive(struct cachefiles_cache *cache, - struct cachefiles_object *object) + struct cachefiles_object *object, + blkcnt_t i_blocks) { - blkcnt_t i_blocks = d_backing_inode(object->dentry)->i_blocks; - write_lock(&cache->active_lock); rb_erase(&object->active_node, &cache->active_nodes); clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); @@ -707,7 +707,8 @@ mark_active_timed_out: check_error: _debug("check error %d", ret); - cachefiles_mark_object_inactive(cache, object); + cachefiles_mark_object_inactive( + cache, object, d_backing_inode(object->dentry)->i_blocks); release_dentry: dput(object->dentry); object->dentry = NULL; @@ -799,13 +800,11 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, } ret = -EPERM; - if (!d_backing_inode(subdir)->i_op->setxattr || - !d_backing_inode(subdir)->i_op->getxattr || + if (!(d_backing_inode(subdir)->i_opflags & IOP_XATTR) || !d_backing_inode(subdir)->i_op->lookup || !d_backing_inode(subdir)->i_op->mkdir || !d_backing_inode(subdir)->i_op->create || - (!d_backing_inode(subdir)->i_op->rename && - !d_backing_inode(subdir)->i_op->rename2) || + !d_backing_inode(subdir)->i_op->rename || !d_backing_inode(subdir)->i_op->rmdir || !d_backing_inode(subdir)->i_op->unlink) goto check_error; diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 4f67227..987044b 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -95,11 +95,9 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { - ret = posix_acl_equiv_mode(acl, &new_mode); - if (ret < 0) + ret = posix_acl_update_mode(inode, &new_mode, &acl); + if (ret) goto out; - if (ret == 0) - acl = NULL; } break; case ACL_TYPE_DEFAULT: @@ -127,6 +125,11 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) goto out_free; } + if (ceph_snap(inode) != CEPH_NOSNAP) { + ret = -EROFS; + goto out_free; + } + if (new_mode != old_mode) { newattrs.ia_mode = new_mode; newattrs.ia_valid = ATTR_MODE; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index d5b6f95..ef3ebd7 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -175,9 +175,8 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, static int ceph_releasepage(struct page *page, gfp_t g) { - dout("%p releasepage %p idx %lu\n", page->mapping->host, - page, page->index); - WARN_ON(PageDirty(page)); + dout("%p releasepage %p idx %lu (%sdirty)\n", page->mapping->host, + page, page->index, PageDirty(page) ? "" : "not "); /* Can we release the page from the cache? */ if (!ceph_release_fscache_page(page, g)) @@ -298,14 +297,6 @@ unlock: kfree(osd_data->pages); } -static void ceph_unlock_page_vector(struct page **pages, int num_pages) -{ - int i; - - for (i = 0; i < num_pages; i++) - unlock_page(pages[i]); -} - /* * start an async read(ahead) operation. return nr_pages we submitted * a read for on success, or negative error code. @@ -370,6 +361,10 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) dout("start_read %p add_to_page_cache failed %p\n", inode, page); nr_pages = i; + if (nr_pages > 0) { + len = nr_pages << PAGE_SHIFT; + break; + } goto out_pages; } pages[i] = page; @@ -386,8 +381,11 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) return nr_pages; out_pages: - ceph_unlock_page_vector(pages, nr_pages); - ceph_release_page_vector(pages, nr_pages); + for (i = 0; i < nr_pages; ++i) { + ceph_fscache_readpage_cancel(inode, pages[i]); + unlock_page(pages[i]); + } + ceph_put_page_vector(pages, nr_pages, false); out: ceph_osdc_put_request(req); return ret; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index df4b3e6..78180d1 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1061,7 +1061,8 @@ out: } static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; @@ -1069,6 +1070,9 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, int op = CEPH_MDS_OP_RENAME; int err; + if (flags) + return -EINVAL; + if (ceph_snap(old_dir) != ceph_snap(new_dir)) return -EXDEV; if (ceph_snap(old_dir) != CEPH_NOSNAP) { @@ -1486,10 +1490,7 @@ const struct inode_operations ceph_dir_iops = { .permission = ceph_permission, .getattr = ceph_getattr, .setattr = ceph_setattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = ceph_listxattr, - .removexattr = generic_removexattr, .get_acl = ceph_get_acl, .set_acl = ceph_set_acl, .mknod = ceph_mknod, diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 0f5375d..18630e8 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -886,7 +886,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, int num_pages = 0; int flags; int ret; - struct timespec mtime = current_fs_time(inode->i_sb); + struct timespec mtime = current_time(inode); size_t count = iov_iter_count(iter); loff_t pos = iocb->ki_pos; bool write = iov_iter_rw(iter) == WRITE; @@ -902,10 +902,10 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, return ret; if (write) { - ret = invalidate_inode_pages2_range(inode->i_mapping, + int ret2 = invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_SHIFT, (pos + count) >> PAGE_SHIFT); - if (ret < 0) + if (ret2 < 0) dout("invalidate_inode_pages2_range returned %d\n", ret); flags = CEPH_OSD_FLAG_ORDERSNAP | @@ -1091,7 +1091,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, int flags; int check_caps = 0; int ret; - struct timespec mtime = current_fs_time(inode->i_sb); + struct timespec mtime = current_time(inode); size_t count = iov_iter_count(from); if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) @@ -1272,7 +1272,8 @@ again: statret = __ceph_do_getattr(inode, page, CEPH_STAT_CAP_INLINE_DATA, !!page); if (statret < 0) { - __free_page(page); + if (page) + __free_page(page); if (statret == -ENODATA) { BUG_ON(retry_op != READ_INLINE); goto again; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index dd3a6db..ef4d046 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -94,10 +94,7 @@ const struct inode_operations ceph_file_iops = { .permission = ceph_permission, .setattr = ceph_setattr, .getattr = ceph_getattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = ceph_listxattr, - .removexattr = generic_removexattr, .get_acl = ceph_get_acl, .set_acl = ceph_set_acl, }; @@ -1514,7 +1511,8 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir); } - if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) { + if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2 && + !(rinfo->hash_order && req->r_path2)) { /* note dir version at start of readdir so we can tell * if any dentries get dropped */ req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); @@ -1885,10 +1883,7 @@ static const struct inode_operations ceph_symlink_iops = { .get_link = simple_get_link, .setattr = ceph_setattr, .getattr = ceph_getattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = ceph_listxattr, - .removexattr = generic_removexattr, }; int __ceph_setattr(struct inode *inode, struct iattr *attr) @@ -1905,13 +1900,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) int inode_dirty_flags = 0; bool lock_snap_rwsem = false; - if (ceph_snap(inode) != CEPH_NOSNAP) - return -EROFS; - - err = inode_change_ok(inode, attr); - if (err != 0) - return err; - prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) return -ENOMEM; @@ -2080,7 +2068,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) if (dirtied) { inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, &prealloc_cf); - inode->i_ctime = current_fs_time(inode->i_sb); + inode->i_ctime = current_time(inode); } release &= issued; @@ -2124,7 +2112,17 @@ out_put: */ int ceph_setattr(struct dentry *dentry, struct iattr *attr) { - return __ceph_setattr(d_inode(dentry), attr); + struct inode *inode = d_inode(dentry); + int err; + + if (ceph_snap(inode) != CEPH_NOSNAP) + return -EROFS; + + err = setattr_prepare(dentry, attr); + if (err != 0) + return err; + + return __ceph_setattr(inode, attr); } /* diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index a2cb0c2..6806dbe 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -210,8 +210,8 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; /* No mandatory locks */ - if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) - return -ENOLCK; + if (fl->fl_type & LOCK_MAND) + return -EOPNOTSUPP; dout("ceph_flock, fl_file: %p", fl->fl_file); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index f72d4ae..815acd1 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -370,6 +370,7 @@ const char *ceph_session_state_name(int s) case CEPH_MDS_SESSION_CLOSING: return "closing"; case CEPH_MDS_SESSION_RESTARTING: return "restarting"; case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting"; + case CEPH_MDS_SESSION_REJECTED: return "rejected"; default: return "???"; } } @@ -1150,8 +1151,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, while (!list_empty(&ci->i_cap_flush_list)) { cf = list_first_entry(&ci->i_cap_flush_list, struct ceph_cap_flush, i_list); - list_del(&cf->i_list); - list_add(&cf->i_list, &to_remove); + list_move(&cf->i_list, &to_remove); } spin_lock(&mdsc->cap_dirty_lock); @@ -1378,7 +1378,7 @@ static int request_close_session(struct ceph_mds_client *mdsc, if (!msg) return -ENOMEM; ceph_con_send(&session->s_con, msg); - return 0; + return 1; } /* @@ -2131,6 +2131,10 @@ static int __do_request(struct ceph_mds_client *mdsc, ceph_session_state_name(session->s_state)); if (session->s_state != CEPH_MDS_SESSION_OPEN && session->s_state != CEPH_MDS_SESSION_HUNG) { + if (session->s_state == CEPH_MDS_SESSION_REJECTED) { + err = -EACCES; + goto out_session; + } if (session->s_state == CEPH_MDS_SESSION_NEW || session->s_state == CEPH_MDS_SESSION_CLOSING) __open_session(mdsc, session); @@ -2652,6 +2656,15 @@ static void handle_session(struct ceph_mds_session *session, wake_up_session_caps(session, 0); break; + case CEPH_SESSION_REJECT: + WARN_ON(session->s_state != CEPH_MDS_SESSION_OPENING); + pr_info("mds%d rejected session\n", session->s_mds); + session->s_state = CEPH_MDS_SESSION_REJECTED; + cleanup_session_requests(mdsc, session); + remove_session_caps(session); + wake = 2; /* for good measure */ + break; + default: pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); WARN_ON(1); @@ -3557,11 +3570,11 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) /* * true if all sessions are closed, or we force unmount */ -static bool done_closing_sessions(struct ceph_mds_client *mdsc) +static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped) { if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) return true; - return atomic_read(&mdsc->num_sessions) == 0; + return atomic_read(&mdsc->num_sessions) <= skipped; } /* @@ -3572,6 +3585,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) struct ceph_options *opts = mdsc->fsc->client->options; struct ceph_mds_session *session; int i; + int skipped = 0; dout("close_sessions\n"); @@ -3583,7 +3597,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) continue; mutex_unlock(&mdsc->mutex); mutex_lock(&session->s_mutex); - __close_session(mdsc, session); + if (__close_session(mdsc, session) <= 0) + skipped++; mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); mutex_lock(&mdsc->mutex); @@ -3591,7 +3606,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) mutex_unlock(&mdsc->mutex); dout("waiting for sessions to close\n"); - wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc), + wait_event_timeout(mdsc->session_close_wq, + done_closing_sessions(mdsc, skipped), ceph_timeout_jiffies(opts->mount_timeout)); /* tear down remaining sessions */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 6b36797..3c6f77b 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -121,6 +121,7 @@ enum { CEPH_MDS_SESSION_CLOSING = 5, CEPH_MDS_SESSION_RESTARTING = 6, CEPH_MDS_SESSION_RECONNECTING = 7, + CEPH_MDS_SESSION_REJECTED = 8, }; struct ceph_mds_session { diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index 89e6bc3..913dea1 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -43,6 +43,8 @@ const char *ceph_session_op_name(int op) case CEPH_SESSION_RECALL_STATE: return "recall_state"; case CEPH_SESSION_FLUSHMSG: return "flushmsg"; case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack"; + case CEPH_SESSION_FORCE_RO: return "force_ro"; + case CEPH_SESSION_REJECT: return "reject"; } return "???"; } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index e247f6f..b382e59 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -396,10 +396,12 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, */ dev_name_end = strchr(dev_name, '/'); if (dev_name_end) { - fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL); - if (!fsopt->server_path) { - err = -ENOMEM; - goto out; + if (strlen(dev_name_end) > 1) { + fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL); + if (!fsopt->server_path) { + err = -ENOMEM; + goto out; + } } } else { dev_name_end = dev_name + strlen(dev_name); @@ -788,15 +790,10 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, struct inode *inode = req->r_target_inode; req->r_target_inode = NULL; dout("open_root_inode success\n"); - if (ceph_ino(inode) == CEPH_INO_ROOT && - fsc->sb->s_root == NULL) { - root = d_make_root(inode); - if (!root) { - root = ERR_PTR(-ENOMEM); - goto out; - } - } else { - root = d_obtain_root(inode); + root = d_make_root(inode); + if (!root) { + root = ERR_PTR(-ENOMEM); + goto out; } ceph_init_dentry(root); dout("open_root_inode success, root dentry is %p\n", root); @@ -825,35 +822,31 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) mutex_lock(&fsc->client->mount_mutex); if (!fsc->sb->s_root) { + const char *path; err = __ceph_open_session(fsc->client, started); if (err < 0) goto out; - dout("mount opening root\n"); - root = open_root_dentry(fsc, "", started); + if (!fsc->mount_options->server_path) { + path = ""; + dout("mount opening path \\t\n"); + } else { + path = fsc->mount_options->server_path + 1; + dout("mount opening path %s\n", path); + } + root = open_root_dentry(fsc, path, started); if (IS_ERR(root)) { err = PTR_ERR(root); goto out; } - fsc->sb->s_root = root; + fsc->sb->s_root = dget(root); first = 1; err = ceph_fs_debugfs_init(fsc); if (err < 0) goto fail; - } - - if (!fsc->mount_options->server_path) { - root = fsc->sb->s_root; - dget(root); } else { - const char *path = fsc->mount_options->server_path + 1; - dout("mount opening path %s\n", path); - root = open_root_dentry(fsc, path, started); - if (IS_ERR(root)) { - err = PTR_ERR(root); - goto fail; - } + root = dget(fsc->sb->s_root); } fsc->mount_state = CEPH_MOUNT_MOUNTED; diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index adc2318..febc28f 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -16,7 +16,7 @@ static int __remove_xattr(struct ceph_inode_info *ci, struct ceph_inode_xattr *xattr); -const struct xattr_handler ceph_other_xattr_handler; +static const struct xattr_handler ceph_other_xattr_handler; /* * List of handlers for synthetic system.* attributes. Other @@ -1034,7 +1034,7 @@ retry: dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL, &prealloc_cf); ci->i_xattrs.dirty = true; - inode->i_ctime = current_fs_time(inode->i_sb); + inode->i_ctime = current_time(inode); } spin_unlock(&ci->i_ceph_lock); @@ -1086,7 +1086,7 @@ static int ceph_set_xattr_handler(const struct xattr_handler *handler, return __ceph_setxattr(inode, name, value, size, flags); } -const struct xattr_handler ceph_other_xattr_handler = { +static const struct xattr_handler ceph_other_xattr_handler = { .prefix = "", /* match any name => handlers called with full name */ .get = ceph_get_xattr_handler, .set = ceph_set_xattr_handler, diff --git a/fs/char_dev.c b/fs/char_dev.c index 6edd825..44a240c 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -406,6 +406,7 @@ void cd_forget(struct inode *inode) spin_lock(&cdev_lock); list_del_init(&inode->i_devices); inode->i_cdev = NULL; + inode->i_mapping = &inode->i_data; spin_unlock(&cdev_lock); } diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 6c58e13..3d03e48 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -152,6 +152,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) list_for_each(tmp1, &cifs_tcp_ses_list) { server = list_entry(tmp1, struct TCP_Server_Info, tcp_ses_list); + seq_printf(m, "\nNumber of credits: %d", server->credits); i++; list_for_each(tmp2, &server->smb_ses_list) { ses = list_entry(tmp2, struct cifs_ses, diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 1418daa..07ed81c 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -49,6 +49,7 @@ #define CIFS_MOUNT_USE_PREFIX_PATH 0x1000000 /* make subpath with unaccessible * root mountable */ +#define CIFS_MOUNT_UID_FROM_ACL 0x2000000 /* try to get UID via special SID */ struct cifs_sb_info { struct rb_root tlink_tree; diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index 0065256..57ff075 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -36,7 +36,15 @@ struct smb_mnt_fs_info { __u64 cifs_posix_caps; } __packed; +struct smb_snapshot_array { + __u32 number_of_snapshots; + __u32 number_of_snapshots_returned; + __u32 snapshot_array_size; + /* snapshots[]; */ +} __packed; + #define CIFS_IOCTL_MAGIC 0xCF #define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) #define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4) #define CIFS_IOC_GET_MNT_INFO _IOR(CIFS_IOCTL_MAGIC, 5, struct smb_mnt_fs_info) +#define CIFS_ENUMERATE_SNAPSHOTS _IOR(CIFS_IOCTL_MAGIC, 6, struct smb_snapshot_array) diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 71e8a56..15bac39 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -42,6 +42,35 @@ static const struct cifs_sid sid_authusers = { /* group users */ static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; +/* S-1-22-1 Unmapped Unix users */ +static const struct cifs_sid sid_unix_users = {1, 1, {0, 0, 0, 0, 0, 22}, + {cpu_to_le32(1), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; + +/* S-1-22-2 Unmapped Unix groups */ +static const struct cifs_sid sid_unix_groups = { 1, 1, {0, 0, 0, 0, 0, 22}, + {cpu_to_le32(2), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; + +/* + * See http://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx + */ + +/* S-1-5-88 MS NFS and Apple style UID/GID/mode */ + +/* S-1-5-88-1 Unix uid */ +static const struct cifs_sid sid_unix_NFS_users = { 1, 2, {0, 0, 0, 0, 0, 5}, + {cpu_to_le32(88), + cpu_to_le32(1), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; + +/* S-1-5-88-2 Unix gid */ +static const struct cifs_sid sid_unix_NFS_groups = { 1, 2, {0, 0, 0, 0, 0, 5}, + {cpu_to_le32(88), + cpu_to_le32(2), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; + +/* S-1-5-88-3 Unix mode */ +static const struct cifs_sid sid_unix_NFS_mode = { 1, 2, {0, 0, 0, 0, 0, 5}, + {cpu_to_le32(88), + cpu_to_le32(3), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; + static const struct cred *root_cred; static int @@ -183,6 +212,62 @@ compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) return 0; /* sids compare/match */ } +static bool +is_well_known_sid(const struct cifs_sid *psid, uint32_t *puid, bool is_group) +{ + int i; + int num_subauth; + const struct cifs_sid *pwell_known_sid; + + if (!psid || (puid == NULL)) + return false; + + num_subauth = psid->num_subauth; + + /* check if Mac (or Windows NFS) vs. Samba format for Unix owner SID */ + if (num_subauth == 2) { + if (is_group) + pwell_known_sid = &sid_unix_groups; + else + pwell_known_sid = &sid_unix_users; + } else if (num_subauth == 3) { + if (is_group) + pwell_known_sid = &sid_unix_NFS_groups; + else + pwell_known_sid = &sid_unix_NFS_users; + } else + return false; + + /* compare the revision */ + if (psid->revision != pwell_known_sid->revision) + return false; + + /* compare all of the six auth values */ + for (i = 0; i < NUM_AUTHS; ++i) { + if (psid->authority[i] != pwell_known_sid->authority[i]) { + cifs_dbg(FYI, "auth %d did not match\n", i); + return false; + } + } + + if (num_subauth == 2) { + if (psid->sub_auth[0] != pwell_known_sid->sub_auth[0]) + return false; + + *puid = le32_to_cpu(psid->sub_auth[1]); + } else /* 3 subauths, ie Windows/Mac style */ { + *puid = le32_to_cpu(psid->sub_auth[0]); + if ((psid->sub_auth[0] != pwell_known_sid->sub_auth[0]) || + (psid->sub_auth[1] != pwell_known_sid->sub_auth[1])) + return false; + + *puid = le32_to_cpu(psid->sub_auth[2]); + } + + cifs_dbg(FYI, "Unix UID %d returned from SID\n", *puid); + return true; /* well known sid found, uid returned */ +} + static void cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src) { @@ -276,6 +361,43 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, return -EIO; } + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL) { + uint32_t unix_id; + bool is_group; + + if (sidtype != SIDOWNER) + is_group = true; + else + is_group = false; + + if (is_well_known_sid(psid, &unix_id, is_group) == false) + goto try_upcall_to_get_id; + + if (is_group) { + kgid_t gid; + gid_t id; + + id = (gid_t)unix_id; + gid = make_kgid(&init_user_ns, id); + if (gid_valid(gid)) { + fgid = gid; + goto got_valid_id; + } + } else { + kuid_t uid; + uid_t id; + + id = (uid_t)unix_id; + uid = make_kuid(&init_user_ns, id); + if (uid_valid(uid)) { + fuid = uid; + goto got_valid_id; + } + } + /* If unable to find uid/gid easily from SID try via upcall */ + } + +try_upcall_to_get_id: sidstr = sid_to_key_str(psid, sidtype); if (!sidstr) return -ENOMEM; @@ -329,6 +451,7 @@ out_revert_creds: * Note that we return 0 here unconditionally. If the mapping * fails then we just fall back to using the mnt_uid/mnt_gid. */ +got_valid_id: if (sidtype == SIDOWNER) fattr->cf_uid = fuid; else diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 14ae4b8..15261ba 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -64,15 +64,15 @@ unsigned int global_secflags = CIFSSEC_DEF; unsigned int sign_CIFS_PDUs = 1; static const struct super_operations cifs_super_ops; unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; -module_param(CIFSMaxBufSize, uint, 0); +module_param(CIFSMaxBufSize, uint, 0444); MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). " "Default: 16384 Range: 8192 to 130048"); unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL; -module_param(cifs_min_rcv, uint, 0); +module_param(cifs_min_rcv, uint, 0444); MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: " "1 to 64"); unsigned int cifs_min_small = 30; -module_param(cifs_min_small, uint, 0); +module_param(cifs_min_small, uint, 0444); MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 " "Range: 2 to 256"); unsigned int cifs_max_pending = CIFS_MAX_REQ; @@ -271,7 +271,7 @@ cifs_alloc_inode(struct super_block *sb) cifs_inode->createtime = 0; cifs_inode->epoch = 0; #ifdef CONFIG_CIFS_SMB2 - get_random_bytes(cifs_inode->lease_key, SMB2_LEASE_KEY_SIZE); + generate_random_uuid(cifs_inode->lease_key); #endif /* * Can not set i_flags here - they get immediately overwritten to zero @@ -469,6 +469,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",posixpaths"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) seq_puts(s, ",setuids"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL) + seq_puts(s, ",idsfromsid"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) seq_puts(s, ",serverino"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) @@ -896,35 +898,26 @@ const struct inode_operations cifs_dir_inode_ops = { .link = cifs_hardlink, .mkdir = cifs_mkdir, .rmdir = cifs_rmdir, - .rename2 = cifs_rename2, + .rename = cifs_rename2, .permission = cifs_permission, .setattr = cifs_setattr, .symlink = cifs_symlink, .mknod = cifs_mknod, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = cifs_listxattr, - .removexattr = generic_removexattr, }; const struct inode_operations cifs_file_inode_ops = { .setattr = cifs_setattr, .getattr = cifs_getattr, .permission = cifs_permission, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = cifs_listxattr, - .removexattr = generic_removexattr, }; const struct inode_operations cifs_symlink_inode_ops = { .readlink = generic_readlink, .get_link = cifs_get_link, .permission = cifs_permission, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = cifs_listxattr, - .removexattr = generic_removexattr, }; static int cifs_clone_file_range(struct file *src_file, loff_t off, @@ -1271,7 +1264,6 @@ init_cifs(void) GlobalTotalActiveXid = 0; GlobalMaxActiveXid = 0; spin_lock_init(&cifs_tcp_ses_lock); - spin_lock_init(&cifs_file_list_lock); spin_lock_init(&GlobalMid_Lock); get_random_bytes(&cifs_lock_secret, sizeof(cifs_lock_secret)); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 9dcf974..c9c00a8 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -41,6 +41,16 @@ cifs_uniqueid_to_ino_t(u64 fileid) } +static inline void cifs_set_time(struct dentry *dentry, unsigned long time) +{ + dentry->d_fsdata = (void *) time; +} + +static inline unsigned long cifs_get_time(struct dentry *dentry) +{ + return (unsigned long) dentry->d_fsdata; +} + extern struct file_system_type cifs_fs_type; extern const struct address_space_operations cifs_addr_ops; extern const struct address_space_operations cifs_addr_ops_smallbuf; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 8f1d8c1..1f17f6b 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -75,6 +75,18 @@ #define SMB_ECHO_INTERVAL_MAX 600 #define SMB_ECHO_INTERVAL_DEFAULT 60 +/* + * Default number of credits to keep available for SMB3. + * This value is chosen somewhat arbitrarily. The Windows client + * defaults to 128 credits, the Windows server allows clients up to + * 512 credits (or 8K for later versions), and the NetApp server + * does not limit clients at all. Choose a high enough default value + * such that the client shouldn't limit performance, but allow mount + * to override (until you approach 64K, where we limit credits to 65000 + * to reduce possibility of seeing more server credit overflow bugs. + */ +#define SMB2_MAX_CREDITS_AVAILABLE 32000 + #include "cifspdu.h" #ifndef XATTR_DOS_ATTRIB @@ -376,6 +388,8 @@ struct smb_version_operations { int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *); int (*set_integrity)(const unsigned int, struct cifs_tcon *tcon, struct cifsFileInfo *src_file); + int (*enum_snapshots)(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *src_file, void __user *); int (*query_mf_symlink)(unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const unsigned char *, char *, unsigned int *); @@ -464,6 +478,7 @@ struct smb_vol { bool retry:1; bool intr:1; bool setuids:1; + bool setuidfromacl:1; bool override_uid:1; bool override_gid:1; bool dynperm:1; @@ -510,6 +525,7 @@ struct smb_vol { struct sockaddr_storage srcaddr; /* allow binding to a local IP */ struct nls_table *local_nls; unsigned int echo_interval; /* echo interval in secs */ + unsigned int max_credits; /* smb3 max_credits 10 < credits < 60000 */ }; #define CIFS_MOUNT_MASK (CIFS_MOUNT_NO_PERM | CIFS_MOUNT_SET_UID | \ @@ -567,7 +583,8 @@ struct TCP_Server_Info { bool noblocksnd; /* use blocking sendmsg */ bool noautotune; /* do not autotune send buf sizes */ bool tcp_nodelay; - int credits; /* send no more requests at once */ + unsigned int credits; /* send no more requests at once */ + unsigned int max_credits; /* can override large 32000 default at mnt */ unsigned int in_flight; /* number of requests on the wire to server */ spinlock_t req_lock; /* protect the two values above */ struct mutex srv_mutex; @@ -833,6 +850,7 @@ struct cifs_tcon { struct list_head tcon_list; int tc_count; struct list_head openFileList; + spinlock_t open_file_lock; /* protects list above */ struct cifs_ses *ses; /* pointer to session associated with */ char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */ char *nativeFileSystem; @@ -889,7 +907,7 @@ struct cifs_tcon { #endif /* CONFIG_CIFS_STATS2 */ __u64 bytes_read; __u64 bytes_written; - spinlock_t stat_lock; + spinlock_t stat_lock; /* protects the two fields above */ #endif /* CONFIG_CIFS_STATS */ FILE_SYSTEM_DEVICE_INFO fsDevInfo; FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* ok if fs name truncated */ @@ -1040,20 +1058,24 @@ struct cifs_fid_locks { }; struct cifsFileInfo { + /* following two lists are protected by tcon->open_file_lock */ struct list_head tlist; /* pointer to next fid owned by tcon */ struct list_head flist; /* next fid (file instance) for this inode */ + /* lock list below protected by cifsi->lock_sem */ struct cifs_fid_locks *llist; /* brlocks held by this fid */ kuid_t uid; /* allows finding which FileInfo structure */ __u32 pid; /* process id who opened file */ struct cifs_fid fid; /* file id from remote */ + struct list_head rlist; /* reconnect list */ /* BB add lock scope info here if needed */ ; /* lock scope id (0 if none) */ struct dentry *dentry; - unsigned int f_flags; struct tcon_link *tlink; + unsigned int f_flags; bool invalidHandle:1; /* file closed via session abend */ bool oplock_break_cancelled:1; - int count; /* refcount protected by cifs_file_list_lock */ + int count; + spinlock_t file_info_lock; /* protects four flag/count fields above */ struct mutex fh_mutex; /* prevents reopen race after dead ses*/ struct cifs_search_info srch_inf; struct work_struct oplock_break; /* work for oplock breaks */ @@ -1120,7 +1142,7 @@ struct cifs_writedata { /* * Take a reference on the file private data. Must be called with - * cifs_file_list_lock held. + * cfile->file_info_lock held. */ static inline void cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file) @@ -1514,8 +1536,10 @@ require use of the stronger protocol */ * GlobalMid_Lock protects: * list operations on pending_mid_q and oplockQ * updates to XID counters, multiplex id and SMB sequence numbers - * cifs_file_list_lock protects: - * list operations on tcp and SMB session lists and tCon lists + * tcp_ses_lock protects: + * list operations on tcp and SMB session lists + * tcon->open_file_lock protects the list of open files hanging off the tcon + * cfile->file_info_lock protects counters and fields in cifs file struct * f_owner.lock protects certain per file struct operations * mapping->page_lock protects certain per page operations * @@ -1547,18 +1571,12 @@ GLOBAL_EXTERN struct list_head cifs_tcp_ses_list; * tcp session, and the list of tcon's per smb session. It also protects * the reference counters for the server, smb session, and tcon. Finally, * changes to the tcon->tidStatus should be done while holding this lock. + * generally the locks should be taken in order tcp_ses_lock before + * tcon->open_file_lock and that before file->file_info_lock since the + * structure order is cifs_socket-->cifs_ses-->cifs_tcon-->cifs_file */ GLOBAL_EXTERN spinlock_t cifs_tcp_ses_lock; -/* - * This lock protects the cifs_file->llist and cifs_file->flist - * list operations, and updates to some flags (cifs_file->invalidHandle) - * It will be moved to either use the tcon->stat_lock or equivalent later. - * If cifs_tcp_ses_lock and the lock below are both needed to be held, then - * the cifs_tcp_ses_lock must be grabbed first and released last. - */ -GLOBAL_EXTERN spinlock_t cifs_file_list_lock; - #ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ /* Outstanding dir notify requests */ GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 95dab43..ced0e42 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -193,6 +193,8 @@ extern struct smb_vol *cifs_get_volume_info(char *mount_data, extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *); extern void cifs_umount(struct cifs_sb_info *); extern void cifs_mark_open_files_invalid(struct cifs_tcon *tcon); +extern void cifs_reopen_persistent_handles(struct cifs_tcon *tcon); + extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length, __u8 type, struct cifsLockInfo **conf_lock, @@ -392,8 +394,7 @@ extern int CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, char **buf, int *return_buf_type); extern int CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms, - unsigned int *nbytes, const char *buf, - const char __user *ubuf, const int long_op); + unsigned int *nbytes, const char *buf); extern int CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, struct kvec *iov, const int nvec); extern int CIFSGetSrvInodeNumber(const unsigned int xid, struct cifs_tcon *tcon, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index d47197e..3f3185f 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -98,13 +98,13 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) struct list_head *tmp1; /* list all files open on tree connection and mark them invalid */ - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); list_for_each_safe(tmp, tmp1, &tcon->openFileList) { open_file = list_entry(tmp, struct cifsFileInfo, tlist); open_file->invalidHandle = true; open_file->oplock_break_cancelled = true; } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); /* * BB Add call to invalidate_inodes(sb) for all superblocks mounted * to this tcon. @@ -1228,7 +1228,6 @@ OldOpenRetry: inc_rfc1001_len(pSMB, count); pSMB->ByteCount = cpu_to_le16(count); - /* long_op set to 1 to allow for oplock break timeouts */ rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *)pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_opens); @@ -1768,8 +1767,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms, int CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms, - unsigned int *nbytes, const char *buf, - const char __user *ubuf, const int long_op) + unsigned int *nbytes, const char *buf) { int rc = -EACCES; WRITE_REQ *pSMB = NULL; @@ -1838,12 +1836,7 @@ CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms, cpu_to_le16(offsetof(struct smb_com_write_req, Data) - 4); if (buf) memcpy(pSMB->Data, buf, bytes_sent); - else if (ubuf) { - if (copy_from_user(pSMB->Data, ubuf, bytes_sent)) { - cifs_buf_release(pSMB); - return -EFAULT; - } - } else if (count != 0) { + else if (count != 0) { /* No buffer */ cifs_buf_release(pSMB); return -EINVAL; @@ -1867,7 +1860,7 @@ CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms, } rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, - (struct smb_hdr *) pSMBr, &bytes_returned, long_op); + (struct smb_hdr *) pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_writes); if (rc) { cifs_dbg(FYI, "Send error in write = %d\n", rc); @@ -3334,7 +3327,7 @@ CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon, #ifdef CONFIG_CIFS_POSIX /*Convert an Access Control Entry from wire format to local POSIX xattr format*/ -static void cifs_convert_ace(posix_acl_xattr_entry *ace, +static void cifs_convert_ace(struct posix_acl_xattr_entry *ace, struct cifs_posix_ace *cifs_ace) { /* u8 cifs fields do not need le conversion */ @@ -3358,7 +3351,7 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen, __u16 count; struct cifs_posix_ace *pACE; struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)src; - posix_acl_xattr_header *local_acl = (posix_acl_xattr_header *)trgt; + struct posix_acl_xattr_header *local_acl = (void *)trgt; if (le16_to_cpu(cifs_acl->version) != CIFS_ACL_VERSION) return -EOPNOTSUPP; @@ -3396,9 +3389,11 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen, } else if (size > buflen) { return -ERANGE; } else /* buffer big enough */ { + struct posix_acl_xattr_entry *ace = (void *)(local_acl + 1); + local_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); for (i = 0; i < count ; i++) { - cifs_convert_ace(&local_acl->a_entries[i], pACE); + cifs_convert_ace(&ace[i], pACE); pACE++; } } @@ -3406,7 +3401,7 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen, } static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace, - const posix_acl_xattr_entry *local_ace) + const struct posix_acl_xattr_entry *local_ace) { __u16 rc = 0; /* 0 = ACL converted ok */ @@ -3431,7 +3426,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, { __u16 rc = 0; struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)parm_data; - posix_acl_xattr_header *local_acl = (posix_acl_xattr_header *)pACL; + struct posix_acl_xattr_header *local_acl = (void *)pACL; int count; int i; @@ -3459,7 +3454,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, } for (i = 0; i < count; i++) { rc = convert_ace_to_cifs_ace(&cifs_acl->ace_array[i], - &local_acl->a_entries[i]); + (struct posix_acl_xattr_entry *)(local_acl + 1)); if (rc != 0) { /* ACE not converted */ break; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 2e4f4ba..aab5227 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -63,7 +63,6 @@ extern mempool_t *cifs_req_poolp; #define TLINK_IDLE_EXPIRE (600 * HZ) enum { - /* Mount options that take no arguments */ Opt_user_xattr, Opt_nouser_xattr, Opt_forceuid, Opt_noforceuid, @@ -76,7 +75,7 @@ enum { Opt_noposixpaths, Opt_nounix, Opt_nocase, Opt_brl, Opt_nobrl, - Opt_forcemandatorylock, Opt_setuids, + Opt_forcemandatorylock, Opt_setuidfromacl, Opt_setuids, Opt_nosetuids, Opt_dynperm, Opt_nodynperm, Opt_nohard, Opt_nosoft, Opt_nointr, Opt_intr, @@ -95,7 +94,7 @@ enum { Opt_cruid, Opt_gid, Opt_file_mode, Opt_dirmode, Opt_port, Opt_rsize, Opt_wsize, Opt_actimeo, - Opt_echo_interval, + Opt_echo_interval, Opt_max_credits, /* Mount options which take string value */ Opt_user, Opt_pass, Opt_ip, @@ -148,6 +147,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_forcemandatorylock, "forcemand" }, { Opt_setuids, "setuids" }, { Opt_nosetuids, "nosetuids" }, + { Opt_setuidfromacl, "idsfromsid" }, { Opt_dynperm, "dynperm" }, { Opt_nodynperm, "nodynperm" }, { Opt_nohard, "nohard" }, @@ -190,6 +190,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_wsize, "wsize=%s" }, { Opt_actimeo, "actimeo=%s" }, { Opt_echo_interval, "echo_interval=%s" }, + { Opt_max_credits, "max_credits=%s" }, { Opt_blank_user, "user=" }, { Opt_blank_user, "username=" }, @@ -1376,6 +1377,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, case Opt_nosetuids: vol->setuids = 0; break; + case Opt_setuidfromacl: + vol->setuidfromacl = 1; + break; case Opt_dynperm: vol->dynperm = true; break; @@ -1586,6 +1590,15 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } vol->echo_interval = option; break; + case Opt_max_credits: + if (get_option_ul(args, &option) || (option < 20) || + (option > 60000)) { + cifs_dbg(VFS, "%s: Invalid max_credits value\n", + __func__); + goto cifs_parse_mount_err; + } + vol->max_credits = option; + break; /* String Arguments */ @@ -2163,7 +2176,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr, sizeof(tcp_ses->dstaddr)); #ifdef CONFIG_CIFS_SMB2 - get_random_bytes(tcp_ses->client_guid, SMB2_CLIENT_GUID_SIZE); + generate_random_uuid(tcp_ses->client_guid); #endif /* * at this point we are the only ones with the pointer @@ -3270,6 +3283,8 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info, cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; if (pvolume_info->setuids) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SET_UID; + if (pvolume_info->setuidfromacl) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UID_FROM_ACL; if (pvolume_info->server_ino) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM; if (pvolume_info->remap) @@ -3598,7 +3613,11 @@ try_mount_again: bdi_destroy(&cifs_sb->bdi); goto out; } - + if ((volume_info->max_credits < 20) || + (volume_info->max_credits > 60000)) + server->max_credits = SMB2_MAX_CREDITS_AVAILABLE; + else + server->max_credits = volume_info->max_credits; /* get a reference to a SMB session */ ses = cifs_get_smb_ses(server, volume_info); if (IS_ERR(ses)) { @@ -3688,14 +3707,16 @@ remote_path_check: goto mount_fail_check; } - rc = cifs_are_all_path_components_accessible(server, + if (rc != -EREMOTE) { + rc = cifs_are_all_path_components_accessible(server, xid, tcon, cifs_sb, full_path); - if (rc != 0) { - cifs_dbg(VFS, "cannot query dirs between root and final path, " - "enabling CIFS_MOUNT_USE_PREFIX_PATH\n"); - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; - rc = 0; + if (rc != 0) { + cifs_dbg(VFS, "cannot query dirs between root and final path, " + "enabling CIFS_MOUNT_USE_PREFIX_PATH\n"); + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; + rc = 0; + } } kfree(full_path); } diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 4716c54..789ff1d 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -40,7 +40,7 @@ renew_parental_timestamps(struct dentry *direntry) /* BB check if there is a way to get the kernel to do this or if we really need this */ do { - direntry->d_time = jiffies; + cifs_set_time(direntry, jiffies); direntry = direntry->d_parent; } while (!IS_ROOT(direntry)); } @@ -802,7 +802,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, } else if (rc == -ENOENT) { rc = 0; - direntry->d_time = jiffies; + cifs_set_time(direntry, jiffies); d_add(direntry, NULL); /* if it was once a directory (but how can we tell?) we could do shrink_dcache_parent(direntry); */ @@ -862,7 +862,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags) if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) return 0; - if (time_after(jiffies, direntry->d_time + HZ) || !lookupCacheEnabled) + if (time_after(jiffies, cifs_get_time(direntry) + HZ) || !lookupCacheEnabled) return 0; return 1; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 579e41b..7f5f617 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -305,6 +305,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, cfile->tlink = cifs_get_tlink(tlink); INIT_WORK(&cfile->oplock_break, cifs_oplock_break); mutex_init(&cfile->fh_mutex); + spin_lock_init(&cfile->file_info_lock); cifs_sb_active(inode->i_sb); @@ -317,7 +318,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, oplock = 0; } - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE && oplock) oplock = fid->pending_open->oplock; list_del(&fid->pending_open->olist); @@ -326,12 +327,13 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, server->ops->set_fid(cfile, fid, oplock); list_add(&cfile->tlist, &tcon->openFileList); + /* if readable file instance put first in list*/ if (file->f_mode & FMODE_READ) list_add(&cfile->flist, &cinode->openFileList); else list_add_tail(&cfile->flist, &cinode->openFileList); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); if (fid->purge_cache) cifs_zap_mapping(inode); @@ -343,16 +345,16 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct cifsFileInfo * cifsFileInfo_get(struct cifsFileInfo *cifs_file) { - spin_lock(&cifs_file_list_lock); + spin_lock(&cifs_file->file_info_lock); cifsFileInfo_get_locked(cifs_file); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cifs_file->file_info_lock); return cifs_file; } /* * Release a reference on the file private data. This may involve closing * the filehandle out on the server. Must be called without holding - * cifs_file_list_lock. + * tcon->open_file_lock and cifs_file->file_info_lock. */ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) { @@ -367,11 +369,15 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) struct cifs_pending_open open; bool oplock_break_cancelled; - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); + + spin_lock(&cifs_file->file_info_lock); if (--cifs_file->count > 0) { - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cifs_file->file_info_lock); + spin_unlock(&tcon->open_file_lock); return; } + spin_unlock(&cifs_file->file_info_lock); if (server->ops->get_lease_key) server->ops->get_lease_key(inode, &fid); @@ -395,7 +401,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) set_bit(CIFS_INO_INVALID_MAPPING, &cifsi->flags); cifs_set_oplock_level(cifsi, 0); } - spin_unlock(&cifs_file_list_lock); + + spin_unlock(&tcon->open_file_lock); oplock_break_cancelled = cancel_work_sync(&cifs_file->oplock_break); @@ -732,6 +739,15 @@ reopen_success: * to the server to get the new inode info. */ + /* + * If the server returned a read oplock and we have mandatory brlocks, + * set oplock level to None. + */ + if (server->ops->is_read_op(oplock) && cifs_has_mand_locks(cinode)) { + cifs_dbg(FYI, "Reset oplock val from read to None due to mand locks\n"); + oplock = 0; + } + server->ops->set_fid(cfile, &cfile->fid, oplock); if (oparms.reconnect) cifs_relock_file(cfile); @@ -753,6 +769,36 @@ int cifs_close(struct inode *inode, struct file *file) return 0; } +void +cifs_reopen_persistent_handles(struct cifs_tcon *tcon) +{ + struct cifsFileInfo *open_file; + struct list_head *tmp; + struct list_head *tmp1; + struct list_head tmp_list; + + cifs_dbg(FYI, "Reopen persistent handles"); + INIT_LIST_HEAD(&tmp_list); + + /* list all files open on tree connection, reopen resilient handles */ + spin_lock(&tcon->open_file_lock); + list_for_each(tmp, &tcon->openFileList) { + open_file = list_entry(tmp, struct cifsFileInfo, tlist); + if (!open_file->invalidHandle) + continue; + cifsFileInfo_get(open_file); + list_add_tail(&open_file->rlist, &tmp_list); + } + spin_unlock(&tcon->open_file_lock); + + list_for_each_safe(tmp, tmp1, &tmp_list) { + open_file = list_entry(tmp, struct cifsFileInfo, rlist); + cifs_reopen_file(open_file, false /* do not flush */); + list_del_init(&open_file->rlist); + cifsFileInfo_put(open_file); + } +} + int cifs_closedir(struct inode *inode, struct file *file) { int rc = 0; @@ -772,10 +818,10 @@ int cifs_closedir(struct inode *inode, struct file *file) server = tcon->ses->server; cifs_dbg(FYI, "Freeing private data in close dir\n"); - spin_lock(&cifs_file_list_lock); + spin_lock(&cfile->file_info_lock); if (server->ops->dir_needs_close(cfile)) { cfile->invalidHandle = true; - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cfile->file_info_lock); if (server->ops->close_dir) rc = server->ops->close_dir(xid, tcon, &cfile->fid); else @@ -784,7 +830,7 @@ int cifs_closedir(struct inode *inode, struct file *file) /* not much we can do if it fails anyway, ignore rc */ rc = 0; } else - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cfile->file_info_lock); buf = cfile->srch_inf.ntwrk_buf_start; if (buf) { @@ -1728,12 +1774,13 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, { struct cifsFileInfo *open_file = NULL; struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) fsuid_only = false; - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); /* we could simply get the first_list_entry since write-only entries are always at the end of the list but since the first entry might have a close pending, we go through the whole list */ @@ -1744,8 +1791,8 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, if (!open_file->invalidHandle) { /* found a good file */ /* lock it so it will not be closed on us */ - cifsFileInfo_get_locked(open_file); - spin_unlock(&cifs_file_list_lock); + cifsFileInfo_get(open_file); + spin_unlock(&tcon->open_file_lock); return open_file; } /* else might as well continue, and look for another, or simply have the caller reopen it @@ -1753,7 +1800,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, } else /* write only file */ break; /* write only files are last so must be done */ } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); return NULL; } @@ -1762,6 +1809,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, { struct cifsFileInfo *open_file, *inv_file = NULL; struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; bool any_available = false; int rc; unsigned int refind = 0; @@ -1777,15 +1825,16 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, } cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + tcon = cifs_sb_master_tcon(cifs_sb); /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) fsuid_only = false; - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); refind_writable: if (refind > MAX_REOPEN_ATT) { - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); return NULL; } list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { @@ -1796,8 +1845,8 @@ refind_writable: if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { if (!open_file->invalidHandle) { /* found a good writable file */ - cifsFileInfo_get_locked(open_file); - spin_unlock(&cifs_file_list_lock); + cifsFileInfo_get(open_file); + spin_unlock(&tcon->open_file_lock); return open_file; } else { if (!inv_file) @@ -1813,24 +1862,24 @@ refind_writable: if (inv_file) { any_available = false; - cifsFileInfo_get_locked(inv_file); + cifsFileInfo_get(inv_file); } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); if (inv_file) { rc = cifs_reopen_file(inv_file, false); if (!rc) return inv_file; else { - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); list_move_tail(&inv_file->flist, &cifs_inode->openFileList); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); cifsFileInfo_put(inv_file); - spin_lock(&cifs_file_list_lock); ++refind; inv_file = NULL; + spin_lock(&tcon->open_file_lock); goto refind_writable; } } @@ -1878,7 +1927,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) write_data, to - from, &offset); cifsFileInfo_put(open_file); /* Does mm or vfs already set times? */ - inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb); + inode->i_atime = inode->i_mtime = current_time(inode); if ((bytes_written > 0) && (offset)) rc = 0; else if (bytes_written < 0) @@ -2478,7 +2527,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, size_t cur_len; unsigned long nr_pages, num_pages, i; struct cifs_writedata *wdata; - struct iov_iter saved_from; + struct iov_iter saved_from = *from; loff_t saved_offset = offset; pid_t pid; struct TCP_Server_Info *server; @@ -2489,7 +2538,6 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, pid = current->tgid; server = tlink_tcon(open_file->tlink)->ses->server; - memcpy(&saved_from, from, sizeof(struct iov_iter)); do { unsigned int wsize, credits; @@ -2551,8 +2599,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, kref_put(&wdata->refcount, cifs_uncached_writedata_release); if (rc == -EAGAIN) { - memcpy(from, &saved_from, - sizeof(struct iov_iter)); + *from = saved_from; iov_iter_advance(from, offset - saved_offset); continue; } @@ -2576,7 +2623,7 @@ ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) struct cifs_sb_info *cifs_sb; struct cifs_writedata *wdata, *tmp; struct list_head wdata_list; - struct iov_iter saved_from; + struct iov_iter saved_from = *from; int rc; /* @@ -2597,8 +2644,6 @@ ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) if (!tcon->ses->server->ops->async_writev) return -ENOSYS; - memcpy(&saved_from, from, sizeof(struct iov_iter)); - rc = cifs_write_from_iter(iocb->ki_pos, iov_iter_count(from), from, open_file, cifs_sb, &wdata_list); @@ -2631,13 +2676,11 @@ restart_loop: /* resend call if it's a retryable error */ if (rc == -EAGAIN) { struct list_head tmp_list; - struct iov_iter tmp_from; + struct iov_iter tmp_from = saved_from; INIT_LIST_HEAD(&tmp_list); list_del_init(&wdata->list); - memcpy(&tmp_from, &saved_from, - sizeof(struct iov_iter)); iov_iter_advance(&tmp_from, wdata->offset - iocb->ki_pos); @@ -3571,7 +3614,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page, cifs_dbg(FYI, "Bytes read %d\n", rc); file_inode(file)->i_atime = - current_fs_time(file_inode(file)->i_sb); + current_time(file_inode(file)); if (PAGE_SIZE > rc) memset(read_data + rc, 0, PAGE_SIZE - rc); @@ -3618,15 +3661,17 @@ static int cifs_readpage(struct file *file, struct page *page) static int is_inode_writable(struct cifsInodeInfo *cifs_inode) { struct cifsFileInfo *open_file; + struct cifs_tcon *tcon = + cifs_sb_master_tcon(CIFS_SB(cifs_inode->vfs_inode.i_sb)); - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); return 1; } } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); return 0; } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index b87efd0..7ab5be7 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1951,7 +1951,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry) cifs_dbg(FYI, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time %ld jiffies %ld\n", full_path, inode, inode->i_count.counter, - dentry, dentry->d_time, jiffies); + dentry, cifs_get_time(dentry), jiffies); if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext) rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid); @@ -2154,7 +2154,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) attrs->ia_valid |= ATTR_FORCE; - rc = inode_change_ok(inode, attrs); + rc = setattr_prepare(direntry, attrs); if (rc < 0) goto out; @@ -2294,7 +2294,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) attrs->ia_valid |= ATTR_FORCE; - rc = inode_change_ok(inode, attrs); + rc = setattr_prepare(direntry, attrs); if (rc < 0) { free_xid(xid); return rc; diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 7a3b84e..9f51b81 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -189,7 +189,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) xid = get_xid(); cifs_sb = CIFS_SB(inode->i_sb); - + cifs_dbg(VFS, "cifs ioctl 0x%x\n", command); switch (command) { case FS_IOC_GETFLAGS: if (pSMBFile == NULL) @@ -267,11 +267,23 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) tcon = tlink_tcon(pSMBFile->tlink); rc = smb_mnt_get_fsinfo(xid, tcon, (void __user *)arg); break; + case CIFS_ENUMERATE_SNAPSHOTS: + if (arg == 0) { + rc = -EINVAL; + goto cifs_ioc_exit; + } + tcon = tlink_tcon(pSMBFile->tlink); + if (tcon->ses->server->ops->enum_snapshots) + rc = tcon->ses->server->ops->enum_snapshots(xid, tcon, + pSMBFile, (void __user *)arg); + else + rc = -EOPNOTSUPP; + break; default: cifs_dbg(FYI, "unsupported ioctl\n"); break; } - +cifs_ioc_exit: free_xid(xid); return rc; } diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 062c237..d031af8 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -399,7 +399,7 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, io_parms.offset = 0; io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; - rc = CIFSSMBWrite(xid, &io_parms, pbytes_written, pbuf, NULL, 0); + rc = CIFSSMBWrite(xid, &io_parms, pbytes_written, pbuf); CIFSSMBClose(xid, tcon, fid.netfid); return rc; } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 813fe13..c672915 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -120,6 +120,7 @@ tconInfoAlloc(void) ++ret_buf->tc_count; INIT_LIST_HEAD(&ret_buf->openFileList); INIT_LIST_HEAD(&ret_buf->tcon_list); + spin_lock_init(&ret_buf->open_file_lock); #ifdef CONFIG_CIFS_STATS spin_lock_init(&ret_buf->stat_lock); #endif @@ -465,7 +466,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) continue; cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks); - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); list_for_each(tmp2, &tcon->openFileList) { netfile = list_entry(tmp2, struct cifsFileInfo, tlist); @@ -495,11 +496,11 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) &netfile->oplock_break); netfile->oplock_break_cancelled = false; - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); return true; } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); cifs_dbg(FYI, "No matching file for oplock break\n"); return true; @@ -613,9 +614,9 @@ backup_cred(struct cifs_sb_info *cifs_sb) void cifs_del_pending_open(struct cifs_pending_open *open) { - spin_lock(&cifs_file_list_lock); + spin_lock(&tlink_tcon(open->tlink)->open_file_lock); list_del(&open->olist); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tlink_tcon(open->tlink)->open_file_lock); } void @@ -635,7 +636,7 @@ void cifs_add_pending_open(struct cifs_fid *fid, struct tcon_link *tlink, struct cifs_pending_open *open) { - spin_lock(&cifs_file_list_lock); + spin_lock(&tlink_tcon(tlink)->open_file_lock); cifs_add_pending_open_locked(fid, tlink, open); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tlink_tcon(open->tlink)->open_file_lock); } diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 65cf85d..8f6a2a5 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -597,14 +597,14 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, is_dir_changed(file)) || (index_to_find < first_entry_in_buffer)) { /* close and restart search */ cifs_dbg(FYI, "search backing up - close and restart search\n"); - spin_lock(&cifs_file_list_lock); + spin_lock(&cfile->file_info_lock); if (server->ops->dir_needs_close(cfile)) { cfile->invalidHandle = true; - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cfile->file_info_lock); if (server->ops->close_dir) server->ops->close_dir(xid, tcon, &cfile->fid); } else - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cfile->file_info_lock); if (cfile->srch_inf.ntwrk_buf_start) { cifs_dbg(FYI, "freeing SMB ff cache buf on search rewind\n"); if (cfile->srch_inf.smallBuf) diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 4f0231e..1238cd3 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -266,9 +266,15 @@ smb2_set_file_info(struct inode *inode, const char *full_path, struct tcon_link *tlink; int rc; + if ((buf->CreationTime == 0) && (buf->LastAccessTime == 0) && + (buf->LastWriteTime == 0) && (buf->ChangeTime) && + (buf->Attributes == 0)) + return 0; /* would be a no op, no sense sending this */ + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); + rc = smb2_open_op_close(xid, tlink_tcon(tlink), cifs_sb, full_path, FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf, SMB2_OP_SET_INFO); diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 389fb9f..3d38348 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -549,19 +549,19 @@ smb2_is_valid_lease_break(char *buffer) list_for_each(tmp1, &server->smb_ses_list) { ses = list_entry(tmp1, struct cifs_ses, smb_ses_list); - spin_lock(&cifs_file_list_lock); list_for_each(tmp2, &ses->tcon_list) { tcon = list_entry(tmp2, struct cifs_tcon, tcon_list); + spin_lock(&tcon->open_file_lock); cifs_stats_inc( &tcon->stats.cifs_stats.num_oplock_brks); if (smb2_tcon_has_lease(tcon, rsp, lw)) { - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); return true; } + spin_unlock(&tcon->open_file_lock); } - spin_unlock(&cifs_file_list_lock); } } spin_unlock(&cifs_tcp_ses_lock); @@ -603,7 +603,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks); - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); list_for_each(tmp2, &tcon->openFileList) { cfile = list_entry(tmp2, struct cifsFileInfo, tlist); @@ -615,7 +615,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) cifs_dbg(FYI, "file id match, oplock break\n"); cinode = CIFS_I(d_inode(cfile->dentry)); - + spin_lock(&cfile->file_info_lock); if (!CIFS_CACHE_WRITE(cinode) && rsp->OplockLevel == SMB2_OPLOCK_LEVEL_NONE) cfile->oplock_break_cancelled = true; @@ -637,14 +637,14 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) clear_bit( CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags); - + spin_unlock(&cfile->file_info_lock); queue_work(cifsiod_wq, &cfile->oplock_break); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); return true; } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); cifs_dbg(FYI, "No matching file for oplock break\n"); return true; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index d203c03..5d456eb 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -28,6 +28,7 @@ #include "cifs_unicode.h" #include "smb2status.h" #include "smb2glob.h" +#include "cifs_ioctl.h" static int change_conf(struct TCP_Server_Info *server) @@ -70,6 +71,10 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add, spin_lock(&server->req_lock); val = server->ops->get_credits_field(server, optype); *val += add; + if (*val > 65000) { + *val = 65000; /* Don't get near 64K credits, avoid srv bugs */ + printk_once(KERN_WARNING "server overflowed SMB3 credits\n"); + } server->in_flight--; if (server->in_flight == 0 && (optype & CIFS_OP_MASK) != CIFS_NEG_OP) rc = change_conf(server); @@ -287,7 +292,7 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) cifs_dbg(FYI, "Link Speed %lld\n", le64_to_cpu(out_buf->LinkSpeed)); } - + kfree(out_buf); return rc; } #endif /* STATS2 */ @@ -541,6 +546,7 @@ smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock) server->ops->set_oplock_level(cinode, oplock, fid->epoch, &fid->purge_cache); cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode); + memcpy(cfile->fid.create_guid, fid->create_guid, 16); } static void @@ -699,6 +705,7 @@ smb2_clone_range(const unsigned int xid, cchunk_out: kfree(pcchunk); + kfree(retbuf); return rc; } @@ -823,7 +830,6 @@ smb2_duplicate_extents(const unsigned int xid, { int rc; unsigned int ret_data_len; - char *retbuf = NULL; struct duplicate_extents_to_file dup_ext_buf; struct cifs_tcon *tcon = tlink_tcon(trgtfile->tlink); @@ -849,7 +855,7 @@ smb2_duplicate_extents(const unsigned int xid, FSCTL_DUPLICATE_EXTENTS_TO_FILE, true /* is_fsctl */, (char *)&dup_ext_buf, sizeof(struct duplicate_extents_to_file), - (char **)&retbuf, + NULL, &ret_data_len); if (ret_data_len > 0) @@ -872,7 +878,6 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon, struct cifsFileInfo *cfile) { struct fsctl_set_integrity_information_req integr_info; - char *retbuf = NULL; unsigned int ret_data_len; integr_info.ChecksumAlgorithm = cpu_to_le16(CHECKSUM_TYPE_UNCHANGED); @@ -884,9 +889,53 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon, FSCTL_SET_INTEGRITY_INFORMATION, true /* is_fsctl */, (char *)&integr_info, sizeof(struct fsctl_set_integrity_information_req), + NULL, + &ret_data_len); + +} + +static int +smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, void __user *ioc_buf) +{ + char *retbuf = NULL; + unsigned int ret_data_len = 0; + int rc; + struct smb_snapshot_array snapshot_in; + + rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + FSCTL_SRV_ENUMERATE_SNAPSHOTS, + true /* is_fsctl */, NULL, 0 /* no input data */, (char **)&retbuf, &ret_data_len); + cifs_dbg(FYI, "enum snaphots ioctl returned %d and ret buflen is %d\n", + rc, ret_data_len); + if (rc) + return rc; + if (ret_data_len && (ioc_buf != NULL) && (retbuf != NULL)) { + /* Fixup buffer */ + if (copy_from_user(&snapshot_in, ioc_buf, + sizeof(struct smb_snapshot_array))) { + rc = -EFAULT; + kfree(retbuf); + return rc; + } + if (snapshot_in.snapshot_array_size < sizeof(struct smb_snapshot_array)) { + rc = -ERANGE; + return rc; + } + + if (ret_data_len > snapshot_in.snapshot_array_size) + ret_data_len = snapshot_in.snapshot_array_size; + + if (copy_to_user(ioc_buf, retbuf, ret_data_len)) + rc = -EFAULT; + } + + kfree(retbuf); + return rc; } static int @@ -1041,7 +1090,7 @@ smb2_set_lease_key(struct inode *inode, struct cifs_fid *fid) static void smb2_new_lease_key(struct cifs_fid *fid) { - get_random_bytes(fid->lease_key, SMB2_LEASE_KEY_SIZE); + generate_random_uuid(fid->lease_key); } #define SMB2_SYMLINK_STRUCT_SIZE \ @@ -1654,6 +1703,7 @@ struct smb_version_operations smb21_operations = { .clone_range = smb2_clone_range, .wp_retry_size = smb2_wp_retry_size, .dir_needs_close = smb2_dir_needs_close, + .enum_snapshots = smb3_enum_snapshots, }; struct smb_version_operations smb30_operations = { @@ -1740,6 +1790,7 @@ struct smb_version_operations smb30_operations = { .wp_retry_size = smb2_wp_retry_size, .dir_needs_close = smb2_dir_needs_close, .fallocate = smb3_fallocate, + .enum_snapshots = smb3_enum_snapshots, }; #ifdef CONFIG_CIFS_SMB311 @@ -1827,6 +1878,7 @@ struct smb_version_operations smb311_operations = { .wp_retry_size = smb2_wp_retry_size, .dir_needs_close = smb2_dir_needs_close, .fallocate = smb3_fallocate, + .enum_snapshots = smb3_enum_snapshots, }; #endif /* CIFS_SMB311 */ diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 29e06db..5ca5ea46 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -100,7 +100,21 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ , hdr->ProtocolId = SMB2_PROTO_NUMBER; hdr->StructureSize = cpu_to_le16(64); hdr->Command = smb2_cmd; - hdr->CreditRequest = cpu_to_le16(2); /* BB make this dynamic */ + if (tcon && tcon->ses && tcon->ses->server) { + struct TCP_Server_Info *server = tcon->ses->server; + + spin_lock(&server->req_lock); + /* Request up to 2 credits but don't go over the limit. */ + if (server->credits >= server->max_credits) + hdr->CreditRequest = cpu_to_le16(0); + else + hdr->CreditRequest = cpu_to_le16( + min_t(int, server->max_credits - + server->credits, 2)); + spin_unlock(&server->req_lock); + } else { + hdr->CreditRequest = cpu_to_le16(2); + } hdr->ProcessId = cpu_to_le32((__u16)current->tgid); if (!tcon) @@ -236,8 +250,13 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) } cifs_mark_open_files_invalid(tcon); + rc = SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nls_codepage); mutex_unlock(&tcon->ses->session_mutex); + + if (tcon->use_persistent) + cifs_reopen_persistent_handles(tcon); + cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); if (rc) goto out; @@ -574,59 +593,42 @@ vneg_out: return -EIO; } -int -SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, - const struct nls_table *nls_cp) +struct SMB2_sess_data { + unsigned int xid; + struct cifs_ses *ses; + struct nls_table *nls_cp; + void (*func)(struct SMB2_sess_data *); + int result; + u64 previous_session; + + /* we will send the SMB in three pieces: + * a fixed length beginning part, an optional + * SPNEGO blob (which can be zero length), and a + * last part which will include the strings + * and rest of bcc area. This allows us to avoid + * a large buffer 17K allocation + */ + int buf0_type; + struct kvec iov[2]; +}; + +static int +SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) { + int rc; + struct cifs_ses *ses = sess_data->ses; struct smb2_sess_setup_req *req; - struct smb2_sess_setup_rsp *rsp = NULL; - struct kvec iov[2]; - int rc = 0; - int resp_buftype = CIFS_NO_BUFFER; - __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ struct TCP_Server_Info *server = ses->server; - u16 blob_length = 0; - struct key *spnego_key = NULL; - char *security_blob = NULL; - unsigned char *ntlmssp_blob = NULL; - bool use_spnego = false; /* else use raw ntlmssp */ - - cifs_dbg(FYI, "Session Setup\n"); - - if (!server) { - WARN(1, "%s: server is NULL!\n", __func__); - return -EIO; - } - - /* - * If we are here due to reconnect, free per-smb session key - * in case signing was required. - */ - kfree(ses->auth_key.response); - ses->auth_key.response = NULL; - - /* - * If memory allocation is successful, caller of this function - * frees it. - */ - ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); - if (!ses->ntlmssp) - return -ENOMEM; - ses->ntlmssp->sesskey_per_smbsess = true; - - /* FIXME: allow for other auth types besides NTLMSSP (e.g. krb5) */ - if (ses->sectype != Kerberos && ses->sectype != RawNTLMSSP) - ses->sectype = RawNTLMSSP; - -ssetup_ntlmssp_authenticate: - if (phase == NtLmChallenge) - phase = NtLmAuthenticate; /* if ntlmssp, now final phase */ rc = small_smb2_init(SMB2_SESSION_SETUP, NULL, (void **) &req); if (rc) return rc; req->hdr.SessionId = 0; /* First session, not a reauthenticate */ + + /* if reconnect, we need to send previous sess id, otherwise it is 0 */ + req->PreviousSessionId = sess_data->previous_session; + req->Flags = 0; /* MBZ */ /* to enable echos and oplocks */ req->hdr.CreditRequest = cpu_to_le16(3); @@ -642,199 +644,368 @@ ssetup_ntlmssp_authenticate: req->Capabilities = 0; req->Channel = 0; /* MBZ */ - iov[0].iov_base = (char *)req; + sess_data->iov[0].iov_base = (char *)req; /* 4 for rfc1002 length field and 1 for pad */ - iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; + sess_data->iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; + /* + * This variable will be used to clear the buffer + * allocated above in case of any error in the calling function. + */ + sess_data->buf0_type = CIFS_SMALL_BUFFER; - if (ses->sectype == Kerberos) { -#ifdef CONFIG_CIFS_UPCALL - struct cifs_spnego_msg *msg; + return 0; +} - spnego_key = cifs_get_spnego_key(ses); - if (IS_ERR(spnego_key)) { - rc = PTR_ERR(spnego_key); - spnego_key = NULL; - goto ssetup_exit; - } +static void +SMB2_sess_free_buffer(struct SMB2_sess_data *sess_data) +{ + free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base); + sess_data->buf0_type = CIFS_NO_BUFFER; +} - msg = spnego_key->payload.data[0]; - /* - * check version field to make sure that cifs.upcall is - * sending us a response in an expected form - */ - if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { - cifs_dbg(VFS, - "bad cifs.upcall version. Expected %d got %d", - CIFS_SPNEGO_UPCALL_VERSION, msg->version); - rc = -EKEYREJECTED; - goto ssetup_exit; - } - ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, - GFP_KERNEL); - if (!ses->auth_key.response) { - cifs_dbg(VFS, - "Kerberos can't allocate (%u bytes) memory", - msg->sesskey_len); - rc = -ENOMEM; - goto ssetup_exit; - } - ses->auth_key.len = msg->sesskey_len; - blob_length = msg->secblob_len; - iov[1].iov_base = msg->data + msg->sesskey_len; - iov[1].iov_len = blob_length; -#else - rc = -EOPNOTSUPP; - goto ssetup_exit; -#endif /* CONFIG_CIFS_UPCALL */ - } else if (phase == NtLmNegotiate) { /* if not krb5 must be ntlmssp */ - ntlmssp_blob = kmalloc(sizeof(struct _NEGOTIATE_MESSAGE), - GFP_KERNEL); - if (ntlmssp_blob == NULL) { - rc = -ENOMEM; - goto ssetup_exit; - } - build_ntlmssp_negotiate_blob(ntlmssp_blob, ses); - if (use_spnego) { - /* blob_length = build_spnego_ntlmssp_blob( - &security_blob, - sizeof(struct _NEGOTIATE_MESSAGE), - ntlmssp_blob); */ - /* BB eventually need to add this */ - cifs_dbg(VFS, "spnego not supported for SMB2 yet\n"); - rc = -EOPNOTSUPP; - kfree(ntlmssp_blob); - goto ssetup_exit; - } else { - blob_length = sizeof(struct _NEGOTIATE_MESSAGE); - /* with raw NTLMSSP we don't encapsulate in SPNEGO */ - security_blob = ntlmssp_blob; - } - iov[1].iov_base = security_blob; - iov[1].iov_len = blob_length; - } else if (phase == NtLmAuthenticate) { - req->hdr.SessionId = ses->Suid; - rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, ses, - nls_cp); - if (rc) { - cifs_dbg(FYI, "build_ntlmssp_auth_blob failed %d\n", - rc); - goto ssetup_exit; /* BB double check error handling */ - } - if (use_spnego) { - /* blob_length = build_spnego_ntlmssp_blob( - &security_blob, - blob_length, - ntlmssp_blob); */ - cifs_dbg(VFS, "spnego not supported for SMB2 yet\n"); - rc = -EOPNOTSUPP; - kfree(ntlmssp_blob); - goto ssetup_exit; - } else { - security_blob = ntlmssp_blob; - } - iov[1].iov_base = security_blob; - iov[1].iov_len = blob_length; - } else { - cifs_dbg(VFS, "illegal ntlmssp phase\n"); - rc = -EIO; - goto ssetup_exit; - } +static int +SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data) +{ + int rc; + struct smb2_sess_setup_req *req = sess_data->iov[0].iov_base; /* Testing shows that buffer offset must be at location of Buffer[0] */ req->SecurityBufferOffset = - cpu_to_le16(sizeof(struct smb2_sess_setup_req) - - 1 /* pad */ - 4 /* rfc1001 len */); - req->SecurityBufferLength = cpu_to_le16(blob_length); + cpu_to_le16(sizeof(struct smb2_sess_setup_req) - + 1 /* pad */ - 4 /* rfc1001 len */); + req->SecurityBufferLength = cpu_to_le16(sess_data->iov[1].iov_len); - inc_rfc1001_len(req, blob_length - 1 /* pad */); + inc_rfc1001_len(req, sess_data->iov[1].iov_len - 1 /* pad */); /* BB add code to build os and lm fields */ - rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, - CIFS_LOG_ERROR | CIFS_NEG_OP); + rc = SendReceive2(sess_data->xid, sess_data->ses, + sess_data->iov, 2, + &sess_data->buf0_type, + CIFS_LOG_ERROR | CIFS_NEG_OP); - kfree(security_blob); - rsp = (struct smb2_sess_setup_rsp *)iov[0].iov_base; - ses->Suid = rsp->hdr.SessionId; - if (resp_buftype != CIFS_NO_BUFFER && - rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) { - if (phase != NtLmNegotiate) { - cifs_dbg(VFS, "Unexpected more processing error\n"); - goto ssetup_exit; - } - if (offsetof(struct smb2_sess_setup_rsp, Buffer) - 4 != - le16_to_cpu(rsp->SecurityBufferOffset)) { - cifs_dbg(VFS, "Invalid security buffer offset %d\n", - le16_to_cpu(rsp->SecurityBufferOffset)); - rc = -EIO; - goto ssetup_exit; + return rc; +} + +static int +SMB2_sess_establish_session(struct SMB2_sess_data *sess_data) +{ + int rc = 0; + struct cifs_ses *ses = sess_data->ses; + + mutex_lock(&ses->server->srv_mutex); + if (ses->server->sign && ses->server->ops->generate_signingkey) { + rc = ses->server->ops->generate_signingkey(ses); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + if (rc) { + cifs_dbg(FYI, + "SMB3 session key generation failed\n"); + mutex_unlock(&ses->server->srv_mutex); + goto keygen_exit; } + } + if (!ses->server->session_estab) { + ses->server->sequence_number = 0x2; + ses->server->session_estab = true; + } + mutex_unlock(&ses->server->srv_mutex); + + cifs_dbg(FYI, "SMB2/3 session established successfully\n"); + spin_lock(&GlobalMid_Lock); + ses->status = CifsGood; + ses->need_reconnect = false; + spin_unlock(&GlobalMid_Lock); - /* NTLMSSP Negotiate sent now processing challenge (response) */ - phase = NtLmChallenge; /* process ntlmssp challenge */ - rc = 0; /* MORE_PROCESSING is not an error here but expected */ - rc = decode_ntlmssp_challenge(rsp->Buffer, - le16_to_cpu(rsp->SecurityBufferLength), ses); +keygen_exit: + if (!ses->server->sign) { + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + } + return rc; +} + +#ifdef CONFIG_CIFS_UPCALL +static void +SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) +{ + int rc; + struct cifs_ses *ses = sess_data->ses; + struct cifs_spnego_msg *msg; + struct key *spnego_key = NULL; + struct smb2_sess_setup_rsp *rsp = NULL; + + rc = SMB2_sess_alloc_buffer(sess_data); + if (rc) + goto out; + + spnego_key = cifs_get_spnego_key(ses); + if (IS_ERR(spnego_key)) { + rc = PTR_ERR(spnego_key); + spnego_key = NULL; + goto out; } + msg = spnego_key->payload.data[0]; /* - * BB eventually add code for SPNEGO decoding of NtlmChallenge blob, - * but at least the raw NTLMSSP case works. + * check version field to make sure that cifs.upcall is + * sending us a response in an expected form */ + if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { + cifs_dbg(VFS, + "bad cifs.upcall version. Expected %d got %d", + CIFS_SPNEGO_UPCALL_VERSION, msg->version); + rc = -EKEYREJECTED; + goto out_put_spnego_key; + } + + ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, + GFP_KERNEL); + if (!ses->auth_key.response) { + cifs_dbg(VFS, + "Kerberos can't allocate (%u bytes) memory", + msg->sesskey_len); + rc = -ENOMEM; + goto out_put_spnego_key; + } + ses->auth_key.len = msg->sesskey_len; + + sess_data->iov[1].iov_base = msg->data + msg->sesskey_len; + sess_data->iov[1].iov_len = msg->secblob_len; + + rc = SMB2_sess_sendreceive(sess_data); + if (rc) + goto out_put_spnego_key; + + rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; + ses->Suid = rsp->hdr.SessionId; + + ses->session_flags = le16_to_cpu(rsp->SessionFlags); + if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) + cifs_dbg(VFS, "SMB3 encryption not supported yet\n"); + + rc = SMB2_sess_establish_session(sess_data); +out_put_spnego_key: + key_invalidate(spnego_key); + key_put(spnego_key); +out: + sess_data->result = rc; + sess_data->func = NULL; + SMB2_sess_free_buffer(sess_data); +} +#else +static void +SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) +{ + cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n"); + sess_data->result = -EOPNOTSUPP; + sess_data->func = NULL; +} +#endif + +static void +SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data); + +static void +SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) +{ + int rc; + struct cifs_ses *ses = sess_data->ses; + struct smb2_sess_setup_rsp *rsp = NULL; + char *ntlmssp_blob = NULL; + bool use_spnego = false; /* else use raw ntlmssp */ + u16 blob_length = 0; + /* - * No tcon so can't do - * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]); + * If memory allocation is successful, caller of this function + * frees it. */ - if (rc != 0) - goto ssetup_exit; + ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); + if (!ses->ntlmssp) { + rc = -ENOMEM; + goto out_err; + } + ses->ntlmssp->sesskey_per_smbsess = true; + + rc = SMB2_sess_alloc_buffer(sess_data); + if (rc) + goto out_err; + + ntlmssp_blob = kmalloc(sizeof(struct _NEGOTIATE_MESSAGE), + GFP_KERNEL); + if (ntlmssp_blob == NULL) { + rc = -ENOMEM; + goto out; + } + + build_ntlmssp_negotiate_blob(ntlmssp_blob, ses); + if (use_spnego) { + /* BB eventually need to add this */ + cifs_dbg(VFS, "spnego not supported for SMB2 yet\n"); + rc = -EOPNOTSUPP; + goto out; + } else { + blob_length = sizeof(struct _NEGOTIATE_MESSAGE); + /* with raw NTLMSSP we don't encapsulate in SPNEGO */ + } + sess_data->iov[1].iov_base = ntlmssp_blob; + sess_data->iov[1].iov_len = blob_length; + + rc = SMB2_sess_sendreceive(sess_data); + rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; + + /* If true, rc here is expected and not an error */ + if (sess_data->buf0_type != CIFS_NO_BUFFER && + rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) + rc = 0; + + if (rc) + goto out; + + if (offsetof(struct smb2_sess_setup_rsp, Buffer) - 4 != + le16_to_cpu(rsp->SecurityBufferOffset)) { + cifs_dbg(VFS, "Invalid security buffer offset %d\n", + le16_to_cpu(rsp->SecurityBufferOffset)); + rc = -EIO; + goto out; + } + rc = decode_ntlmssp_challenge(rsp->Buffer, + le16_to_cpu(rsp->SecurityBufferLength), ses); + if (rc) + goto out; + + cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n"); + + ses->Suid = rsp->hdr.SessionId; ses->session_flags = le16_to_cpu(rsp->SessionFlags); if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) cifs_dbg(VFS, "SMB3 encryption not supported yet\n"); -ssetup_exit: - free_rsp_buf(resp_buftype, rsp); - - /* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */ - if ((phase == NtLmChallenge) && (rc == 0)) - goto ssetup_ntlmssp_authenticate; +out: + kfree(ntlmssp_blob); + SMB2_sess_free_buffer(sess_data); if (!rc) { - mutex_lock(&server->srv_mutex); - if (server->sign && server->ops->generate_signingkey) { - rc = server->ops->generate_signingkey(ses); - kfree(ses->auth_key.response); - ses->auth_key.response = NULL; - if (rc) { - cifs_dbg(FYI, - "SMB3 session key generation failed\n"); - mutex_unlock(&server->srv_mutex); - goto keygen_exit; - } - } - if (!server->session_estab) { - server->sequence_number = 0x2; - server->session_estab = true; - } - mutex_unlock(&server->srv_mutex); - - cifs_dbg(FYI, "SMB2/3 session established successfully\n"); - spin_lock(&GlobalMid_Lock); - ses->status = CifsGood; - ses->need_reconnect = false; - spin_unlock(&GlobalMid_Lock); + sess_data->result = 0; + sess_data->func = SMB2_sess_auth_rawntlmssp_authenticate; + return; } +out_err: + kfree(ses->ntlmssp); + ses->ntlmssp = NULL; + sess_data->result = rc; + sess_data->func = NULL; +} -keygen_exit: - if (!server->sign) { - kfree(ses->auth_key.response); - ses->auth_key.response = NULL; +static void +SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) +{ + int rc; + struct cifs_ses *ses = sess_data->ses; + struct smb2_sess_setup_req *req; + struct smb2_sess_setup_rsp *rsp = NULL; + unsigned char *ntlmssp_blob = NULL; + bool use_spnego = false; /* else use raw ntlmssp */ + u16 blob_length = 0; + + rc = SMB2_sess_alloc_buffer(sess_data); + if (rc) + goto out; + + req = (struct smb2_sess_setup_req *) sess_data->iov[0].iov_base; + req->hdr.SessionId = ses->Suid; + + rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, ses, + sess_data->nls_cp); + if (rc) { + cifs_dbg(FYI, "build_ntlmssp_auth_blob failed %d\n", rc); + goto out; } - if (spnego_key) { - key_invalidate(spnego_key); - key_put(spnego_key); + + if (use_spnego) { + /* BB eventually need to add this */ + cifs_dbg(VFS, "spnego not supported for SMB2 yet\n"); + rc = -EOPNOTSUPP; + goto out; } + sess_data->iov[1].iov_base = ntlmssp_blob; + sess_data->iov[1].iov_len = blob_length; + + rc = SMB2_sess_sendreceive(sess_data); + if (rc) + goto out; + + rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; + + ses->Suid = rsp->hdr.SessionId; + ses->session_flags = le16_to_cpu(rsp->SessionFlags); + if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) + cifs_dbg(VFS, "SMB3 encryption not supported yet\n"); + + rc = SMB2_sess_establish_session(sess_data); +out: + kfree(ntlmssp_blob); + SMB2_sess_free_buffer(sess_data); kfree(ses->ntlmssp); + ses->ntlmssp = NULL; + sess_data->result = rc; + sess_data->func = NULL; +} +static int +SMB2_select_sec(struct cifs_ses *ses, struct SMB2_sess_data *sess_data) +{ + if (ses->sectype != Kerberos && ses->sectype != RawNTLMSSP) + ses->sectype = RawNTLMSSP; + + switch (ses->sectype) { + case Kerberos: + sess_data->func = SMB2_auth_kerberos; + break; + case RawNTLMSSP: + sess_data->func = SMB2_sess_auth_rawntlmssp_negotiate; + break; + default: + cifs_dbg(VFS, "secType %d not supported!\n", ses->sectype); + return -EOPNOTSUPP; + } + + return 0; +} + +int +SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + int rc = 0; + struct TCP_Server_Info *server = ses->server; + struct SMB2_sess_data *sess_data; + + cifs_dbg(FYI, "Session Setup\n"); + + if (!server) { + WARN(1, "%s: server is NULL!\n", __func__); + return -EIO; + } + + sess_data = kzalloc(sizeof(struct SMB2_sess_data), GFP_KERNEL); + if (!sess_data) + return -ENOMEM; + + rc = SMB2_select_sec(ses, sess_data); + if (rc) + goto out; + sess_data->xid = xid; + sess_data->ses = ses; + sess_data->buf0_type = CIFS_NO_BUFFER; + sess_data->nls_cp = (struct nls_table *) nls_cp; + + while (sess_data->func) + sess_data->func(sess_data); + + rc = sess_data->result; +out: + kfree(sess_data); return rc; } @@ -1164,7 +1335,7 @@ create_durable_v2_buf(struct cifs_fid *pfid) buf->dcontext.Timeout = 0; /* Should this be configurable by workload */ buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT); - get_random_bytes(buf->dcontext.CreateGuid, 16); + generate_random_uuid(buf->dcontext.CreateGuid); memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16); /* SMB2_CREATE_DURABLE_HANDLE_REQUEST is "DH2Q" */ @@ -2057,6 +2228,7 @@ smb2_async_readv(struct cifs_readdata *rdata) if (rdata->credits) { buf->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes, SMB2_MAX_BUFFER_SIZE)); + buf->CreditRequest = buf->CreditCharge; spin_lock(&server->req_lock); server->credits += rdata->credits - le16_to_cpu(buf->CreditCharge); @@ -2243,6 +2415,7 @@ smb2_async_writev(struct cifs_writedata *wdata, if (wdata->credits) { req->hdr.CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes, SMB2_MAX_BUFFER_SIZE)); + req->hdr.CreditRequest = req->hdr.CreditCharge; spin_lock(&server->req_lock); server->credits += wdata->credits - le16_to_cpu(req->hdr.CreditCharge); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index ff88d9f..fd3709e 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -276,7 +276,7 @@ struct smb2_sess_setup_req { __le32 Channel; __le16 SecurityBufferOffset; __le16 SecurityBufferLength; - __le64 PreviousSessionId; + __u64 PreviousSessionId; __u8 Buffer[1]; /* variable length GSS security buffer */ } __packed; diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 5e23f64..20af518 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -33,7 +33,8 @@ #define MAX_EA_VALUE_SIZE 65535 #define CIFS_XATTR_CIFS_ACL "system.cifs_acl" - +#define CIFS_XATTR_ATTRIB "cifs.dosattrib" /* full name: user.cifs.dosattrib */ +#define CIFS_XATTR_CREATETIME "cifs.creationtime" /* user.cifs.creationtime */ /* BB need to add server (Samba e.g) support for security and trusted prefix */ enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT }; @@ -144,6 +145,54 @@ out: return rc; } +static int cifs_attrib_get(struct dentry *dentry, + struct inode *inode, void *value, + size_t size) +{ + ssize_t rc; + __u32 *pattribute; + + rc = cifs_revalidate_dentry_attr(dentry); + + if (rc) + return rc; + + if ((value == NULL) || (size == 0)) + return sizeof(__u32); + else if (size < sizeof(__u32)) + return -ERANGE; + + /* return dos attributes as pseudo xattr */ + pattribute = (__u32 *)value; + *pattribute = CIFS_I(inode)->cifsAttrs; + + return sizeof(__u32); +} + +static int cifs_creation_time_get(struct dentry *dentry, struct inode *inode, + void *value, size_t size) +{ + ssize_t rc; + __u64 * pcreatetime; + + rc = cifs_revalidate_dentry_attr(dentry); + if (rc) + return rc; + + if ((value == NULL) || (size == 0)) + return sizeof(__u64); + else if (size < sizeof(__u64)) + return -ERANGE; + + /* return dos attributes as pseudo xattr */ + pcreatetime = (__u64 *)value; + *pcreatetime = CIFS_I(inode)->createtime; + return sizeof(__u64); + + return rc; +} + + static int cifs_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) @@ -168,10 +217,19 @@ static int cifs_xattr_get(const struct xattr_handler *handler, rc = -ENOMEM; goto out; } - /* return dos attributes as pseudo xattr */ + /* return alt name if available as pseudo attr */ switch (handler->flags) { case XATTR_USER: + cifs_dbg(FYI, "%s:querying user xattr %s\n", __func__, name); + if (strcmp(name, CIFS_XATTR_ATTRIB) == 0) { + rc = cifs_attrib_get(dentry, inode, value, size); + break; + } else if (strcmp(name, CIFS_XATTR_CREATETIME) == 0) { + rc = cifs_creation_time_get(dentry, inode, value, size); + break; + } + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto out; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 6fb8672..c0474ac 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -109,7 +109,7 @@ static inline void coda_dir_update_mtime(struct inode *dir) /* optimistically we can also act as if our nose bleeds. The * granularity of the mtime is coarse anyways so we might actually be * right most of the time. Note: we only do this for directories. */ - dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + dir->i_mtime = dir->i_ctime = current_time(dir); #endif } @@ -291,7 +291,8 @@ static int coda_rmdir(struct inode *dir, struct dentry *de) /* rename */ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { const char *old_name = old_dentry->d_name.name; const char *new_name = new_dentry->d_name.name; @@ -299,6 +300,9 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry, int new_length = new_dentry->d_name.len; int error; + if (flags) + return -EINVAL; + error = venus_rename(old_dir->i_sb, coda_i2f(old_dir), coda_i2f(new_dir), old_length, new_length, (const char *) old_name, (const char *)new_name); diff --git a/fs/coda/file.c b/fs/coda/file.c index f47c748..6e0154e 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -38,27 +38,6 @@ coda_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } static ssize_t -coda_file_splice_read(struct file *coda_file, loff_t *ppos, - struct pipe_inode_info *pipe, size_t count, - unsigned int flags) -{ - ssize_t (*splice_read)(struct file *, loff_t *, - struct pipe_inode_info *, size_t, unsigned int); - struct coda_file_info *cfi; - struct file *host_file; - - cfi = CODA_FTOC(coda_file); - BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); - host_file = cfi->cfi_container; - - splice_read = host_file->f_op->splice_read; - if (!splice_read) - splice_read = default_file_splice_read; - - return splice_read(host_file, ppos, pipe, count, flags); -} - -static ssize_t coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *coda_file = iocb->ki_filp; @@ -75,7 +54,7 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos); coda_inode->i_size = file_inode(host_file)->i_size; coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; - coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC; + coda_inode->i_mtime = coda_inode->i_ctime = current_time(coda_inode); inode_unlock(coda_inode); file_end_write(host_file); return ret; @@ -225,6 +204,6 @@ const struct file_operations coda_file_operations = { .open = coda_open, .release = coda_release, .fsync = coda_fsync, - .splice_read = coda_file_splice_read, + .splice_read = generic_file_splice_read, }; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 57e81cb..71dbe7e 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -271,7 +271,7 @@ int coda_setattr(struct dentry *de, struct iattr *iattr) memset(&vattr, 0, sizeof(vattr)); - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); coda_iattr_to_vattr(iattr, &vattr); vattr.va_type = C_VNON; /* cannot set type */ diff --git a/fs/compat.c b/fs/compat.c index be6e48b..bd064a2 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -54,20 +54,6 @@ #include <asm/ioctls.h> #include "internal.h" -int compat_log = 1; - -int compat_printk(const char *fmt, ...) -{ - va_list ap; - int ret; - if (!compat_log) - return 0; - va_start(ap, fmt); - ret = vprintk(fmt, ap); - va_end(ap); - return ret; -} - /* * Not all architectures have sys_utime, so implement this in terms * of sys_utimes. @@ -562,7 +548,7 @@ ssize_t compat_rw_copy_check_uvector(int type, goto out; ret = -EINVAL; - if (nr_segs > UIO_MAXIOV || nr_segs < 0) + if (nr_segs > UIO_MAXIOV) goto out; if (nr_segs > fast_segs) { ret = -ENOMEM; diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index c1e9f29..f2d7402 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1209,6 +1209,8 @@ COMPATIBLE_IOCTL(WDIOC_SETOPTIONS) COMPATIBLE_IOCTL(WDIOC_KEEPALIVE) COMPATIBLE_IOCTL(WDIOC_SETTIMEOUT) COMPATIBLE_IOCTL(WDIOC_GETTIMEOUT) +COMPATIBLE_IOCTL(WDIOC_SETPRETIMEOUT) +COMPATIBLE_IOCTL(WDIOC_GETPRETIMEOUT) /* Big R */ COMPATIBLE_IOCTL(RNDGETENTCNT) COMPATIBLE_IOCTL(RNDADDTOENTCNT) diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 0387968..ad718e5 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -76,7 +76,7 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr) sd_iattr->ia_uid = GLOBAL_ROOT_UID; sd_iattr->ia_gid = GLOBAL_ROOT_GID; sd_iattr->ia_atime = sd_iattr->ia_mtime = - sd_iattr->ia_ctime = current_fs_time(inode->i_sb); + sd_iattr->ia_ctime = current_time(inode); sd->s_iattr = sd_iattr; } /* attributes were changed atleast once in past */ @@ -113,7 +113,7 @@ static inline void set_default_inode_attr(struct inode * inode, umode_t mode) { inode->i_mode = mode; inode->i_atime = inode->i_mtime = - inode->i_ctime = current_fs_time(inode->i_sb); + inode->i_ctime = current_time(inode); } static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) @@ -197,7 +197,7 @@ int configfs_create(struct dentry * dentry, umode_t mode, void (*init)(struct in return -ENOMEM; p_inode = d_inode(dentry->d_parent); - p_inode->i_mtime = p_inode->i_ctime = current_fs_time(p_inode->i_sb); + p_inode->i_mtime = p_inode->i_ctime = current_time(p_inode); configfs_set_inode_lock_class(sd, inode); init(inode); diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index c502c11..98f87fe 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -28,7 +28,6 @@ #include <linux/dcache.h> #include <linux/namei.h> #include <linux/fscrypto.h> -#include <linux/ecryptfs.h> static unsigned int num_prealloc_crypto_pages = 32; static unsigned int num_prealloc_crypto_ctxs = 128; @@ -128,11 +127,11 @@ struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) EXPORT_SYMBOL(fscrypt_get_ctx); /** - * fscrypt_complete() - The completion callback for page encryption - * @req: The asynchronous encryption request context - * @res: The result of the encryption operation + * page_crypt_complete() - completion callback for page crypto + * @req: The asynchronous cipher request context + * @res: The result of the cipher operation */ -static void fscrypt_complete(struct crypto_async_request *req, int res) +static void page_crypt_complete(struct crypto_async_request *req, int res) { struct fscrypt_completion_result *ecr = req->data; @@ -152,7 +151,10 @@ static int do_page_crypto(struct inode *inode, struct page *src_page, struct page *dest_page, gfp_t gfp_flags) { - u8 xts_tweak[FS_XTS_TWEAK_SIZE]; + struct { + __le64 index; + u8 padding[FS_XTS_TWEAK_SIZE - sizeof(__le64)]; + } xts_tweak; struct skcipher_request *req = NULL; DECLARE_FS_COMPLETION_RESULT(ecr); struct scatterlist dst, src; @@ -170,19 +172,17 @@ static int do_page_crypto(struct inode *inode, skcipher_request_set_callback( req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - fscrypt_complete, &ecr); + page_crypt_complete, &ecr); - BUILD_BUG_ON(FS_XTS_TWEAK_SIZE < sizeof(index)); - memcpy(xts_tweak, &index, sizeof(index)); - memset(&xts_tweak[sizeof(index)], 0, - FS_XTS_TWEAK_SIZE - sizeof(index)); + BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE); + xts_tweak.index = cpu_to_le64(index); + memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding)); sg_init_table(&dst, 1); sg_set_page(&dst, dest_page, PAGE_SIZE, 0); sg_init_table(&src, 1); sg_set_page(&src, src_page, PAGE_SIZE, 0); - skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, - xts_tweak); + skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, &xts_tweak); if (rw == FS_DECRYPT) res = crypto_skcipher_decrypt(req); else diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 5d6d491..9a28133 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -10,21 +10,16 @@ * This has not yet undergone a rigorous security audit. */ -#include <keys/encrypted-type.h> -#include <keys/user-type.h> #include <linux/scatterlist.h> #include <linux/ratelimit.h> #include <linux/fscrypto.h> -static u32 size_round_up(size_t size, size_t blksize) -{ - return ((size + blksize - 1) / blksize) * blksize; -} - /** - * dir_crypt_complete() - + * fname_crypt_complete() - completion callback for filename crypto + * @req: The asynchronous cipher request context + * @res: The result of the cipher operation */ -static void dir_crypt_complete(struct crypto_async_request *req, int res) +static void fname_crypt_complete(struct crypto_async_request *req, int res) { struct fscrypt_completion_result *ecr = req->data; @@ -35,11 +30,11 @@ static void dir_crypt_complete(struct crypto_async_request *req, int res) } /** - * fname_encrypt() - + * fname_encrypt() - encrypt a filename * - * This function encrypts the input filename, and returns the length of the - * ciphertext. Errors are returned as negative numbers. We trust the caller to - * allocate sufficient memory to oname string. + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ static int fname_encrypt(struct inode *inode, const struct qstr *iname, struct fscrypt_str *oname) @@ -60,10 +55,9 @@ static int fname_encrypt(struct inode *inode, if (iname->len <= 0 || iname->len > lim) return -EIO; - ciphertext_len = (iname->len < FS_CRYPTO_BLOCK_SIZE) ? - FS_CRYPTO_BLOCK_SIZE : iname->len; - ciphertext_len = size_round_up(ciphertext_len, padding); - ciphertext_len = (ciphertext_len > lim) ? lim : ciphertext_len; + ciphertext_len = max(iname->len, (u32)FS_CRYPTO_BLOCK_SIZE); + ciphertext_len = round_up(ciphertext_len, padding); + ciphertext_len = min(ciphertext_len, lim); if (ciphertext_len <= sizeof(buf)) { workbuf = buf; @@ -84,7 +78,7 @@ static int fname_encrypt(struct inode *inode, } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - dir_crypt_complete, &ecr); + fname_crypt_complete, &ecr); /* Copy the input */ memcpy(workbuf, iname->name, iname->len); @@ -105,20 +99,22 @@ static int fname_encrypt(struct inode *inode, } kfree(alloc_buf); skcipher_request_free(req); - if (res < 0) + if (res < 0) { printk_ratelimited(KERN_ERR "%s: Error (error code %d)\n", __func__, res); + return res; + } oname->len = ciphertext_len; - return res; + return 0; } -/* - * fname_decrypt() - * This function decrypts the input filename, and returns - * the length of the plaintext. - * Errors are returned as negative numbers. - * We trust the caller to allocate sufficient memory to oname string. +/** + * fname_decrypt() - decrypt a filename + * + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ static int fname_decrypt(struct inode *inode, const struct fscrypt_str *iname, @@ -146,7 +142,7 @@ static int fname_decrypt(struct inode *inode, } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - dir_crypt_complete, &ecr); + fname_crypt_complete, &ecr); /* Initialize IV */ memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); @@ -168,7 +164,7 @@ static int fname_decrypt(struct inode *inode, } oname->len = strnlen(oname->name, iname->len); - return oname->len; + return 0; } static const char *lookup_table = @@ -231,9 +227,8 @@ u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen) if (ci) padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); - if (ilen < FS_CRYPTO_BLOCK_SIZE) - ilen = FS_CRYPTO_BLOCK_SIZE; - return size_round_up(ilen, padding); + ilen = max(ilen, (u32)FS_CRYPTO_BLOCK_SIZE); + return round_up(ilen, padding); } EXPORT_SYMBOL(fscrypt_fname_encrypted_size); @@ -279,6 +274,10 @@ EXPORT_SYMBOL(fscrypt_fname_free_buffer); /** * fscrypt_fname_disk_to_usr() - converts a filename from disk space to user * space + * + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ int fscrypt_fname_disk_to_usr(struct inode *inode, u32 hash, u32 minor_hash, @@ -287,13 +286,12 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, { const struct qstr qname = FSTR_TO_QSTR(iname); char buf[24]; - int ret; if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; - return oname->len; + return 0; } if (iname->len < FS_CRYPTO_BLOCK_SIZE) @@ -303,9 +301,9 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, return fname_decrypt(inode, iname, oname); if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) { - ret = digest_encode(iname->name, iname->len, oname->name); - oname->len = ret; - return ret; + oname->len = digest_encode(iname->name, iname->len, + oname->name); + return 0; } if (hash) { memcpy(buf, &hash, 4); @@ -315,15 +313,18 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, } memcpy(buf + 8, iname->name + iname->len - 16, 16); oname->name[0] = '_'; - ret = digest_encode(buf, 24, oname->name + 1); - oname->len = ret + 1; - return ret + 1; + oname->len = 1 + digest_encode(buf, 24, oname->name + 1); + return 0; } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); /** * fscrypt_fname_usr_to_disk() - converts a filename from user space to disk * space + * + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ int fscrypt_fname_usr_to_disk(struct inode *inode, const struct qstr *iname, @@ -333,7 +334,7 @@ int fscrypt_fname_usr_to_disk(struct inode *inode, oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; - return oname->len; + return 0; } if (inode->i_crypt_info) return fname_encrypt(inode, iname, oname); @@ -367,10 +368,10 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, if (dir->i_crypt_info) { ret = fscrypt_fname_alloc_buffer(dir, iname->len, &fname->crypto_buf); - if (ret < 0) + if (ret) return ret; ret = fname_encrypt(dir, iname, &fname->crypto_buf); - if (ret < 0) + if (ret) goto errout; fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 1ac263e..82f0285 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -8,11 +8,8 @@ * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. */ -#include <keys/encrypted-type.h> #include <keys/user-type.h> -#include <linux/random.h> #include <linux/scatterlist.h> -#include <uapi/linux/keyctl.h> #include <linux/fscrypto.h> static void derive_crypt_complete(struct crypto_async_request *req, int rc) @@ -139,6 +136,38 @@ out: return res; } +static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode, + const char **cipher_str_ret, int *keysize_ret) +{ + if (S_ISREG(inode->i_mode)) { + if (ci->ci_data_mode == FS_ENCRYPTION_MODE_AES_256_XTS) { + *cipher_str_ret = "xts(aes)"; + *keysize_ret = FS_AES_256_XTS_KEY_SIZE; + return 0; + } + pr_warn_once("fscrypto: unsupported contents encryption mode " + "%d for inode %lu\n", + ci->ci_data_mode, inode->i_ino); + return -ENOKEY; + } + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { + if (ci->ci_filename_mode == FS_ENCRYPTION_MODE_AES_256_CTS) { + *cipher_str_ret = "cts(cbc(aes))"; + *keysize_ret = FS_AES_256_CTS_KEY_SIZE; + return 0; + } + pr_warn_once("fscrypto: unsupported filenames encryption mode " + "%d for inode %lu\n", + ci->ci_filename_mode, inode->i_ino); + return -ENOKEY; + } + + pr_warn_once("fscrypto: unsupported file type %d for inode %lu\n", + (inode->i_mode & S_IFMT), inode->i_ino); + return -ENOKEY; +} + static void put_crypt_info(struct fscrypt_info *ci) { if (!ci) @@ -155,8 +184,8 @@ int get_crypt_info(struct inode *inode) struct fscrypt_context ctx; struct crypto_skcipher *ctfm; const char *cipher_str; + int keysize; u8 raw_key[FS_MAX_KEY_SIZE]; - u8 mode; int res; res = fscrypt_initialize(); @@ -179,13 +208,19 @@ retry: if (res < 0) { if (!fscrypt_dummy_context_enabled(inode)) return res; + ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; ctx.flags = 0; } else if (res != sizeof(ctx)) { return -EINVAL; } - res = 0; + + if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) + return -EINVAL; + + if (ctx.flags & ~FS_POLICY_FLAGS_VALID) + return -EINVAL; crypt_info = kmem_cache_alloc(fscrypt_info_cachep, GFP_NOFS); if (!crypt_info) @@ -198,27 +233,11 @@ retry: crypt_info->ci_keyring_key = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); - if (S_ISREG(inode->i_mode)) - mode = crypt_info->ci_data_mode; - else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - mode = crypt_info->ci_filename_mode; - else - BUG(); - - switch (mode) { - case FS_ENCRYPTION_MODE_AES_256_XTS: - cipher_str = "xts(aes)"; - break; - case FS_ENCRYPTION_MODE_AES_256_CTS: - cipher_str = "cts(cbc(aes))"; - break; - default: - printk_once(KERN_WARNING - "%s: unsupported key mode %d (ino %u)\n", - __func__, mode, (unsigned) inode->i_ino); - res = -ENOKEY; + + res = determine_cipher_type(crypt_info, inode, &cipher_str, &keysize); + if (res) goto out; - } + if (fscrypt_dummy_context_enabled(inode)) { memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); goto got_key; @@ -253,7 +272,7 @@ got_key: crypt_info->ci_ctfm = ctfm; crypto_skcipher_clear_flags(ctfm, ~0); crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); - res = crypto_skcipher_setkey(ctfm, raw_key, fscrypt_key_size(mode)); + res = crypto_skcipher_setkey(ctfm, raw_key, keysize); if (res) goto out; diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index ed115ac..6865663 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -109,6 +109,8 @@ int fscrypt_process_policy(struct file *filp, if (ret) return ret; + inode_lock(inode); + if (!inode_has_encryption_context(inode)) { if (!S_ISDIR(inode->i_mode)) ret = -EINVAL; @@ -127,6 +129,8 @@ int fscrypt_process_policy(struct file *filp, ret = -EINVAL; } + inode_unlock(inode); + mnt_drop_write_file(filp); return ret; } @@ -31,6 +31,8 @@ #include <linux/vmstat.h> #include <linux/pfn_t.h> #include <linux/sizes.h> +#include <linux/iomap.h> +#include "internal.h" /* * We use lowest available bit in exceptional entry for locking, other two @@ -580,14 +582,13 @@ static int dax_load_hole(struct address_space *mapping, void *entry, return VM_FAULT_LOCKED; } -static int copy_user_bh(struct page *to, struct inode *inode, - struct buffer_head *bh, unsigned long vaddr) +static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size, + struct page *to, unsigned long vaddr) { struct blk_dax_ctl dax = { - .sector = to_sector(bh, inode), - .size = bh->b_size, + .sector = sector, + .size = size, }; - struct block_device *bdev = bh->b_bdev; void *vto; if (dax_map_atomic(bdev, &dax) < 0) @@ -790,14 +791,13 @@ int dax_writeback_mapping_range(struct address_space *mapping, EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); static int dax_insert_mapping(struct address_space *mapping, - struct buffer_head *bh, void **entryp, - struct vm_area_struct *vma, struct vm_fault *vmf) + struct block_device *bdev, sector_t sector, size_t size, + void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = (unsigned long)vmf->virtual_address; - struct block_device *bdev = bh->b_bdev; struct blk_dax_ctl dax = { - .sector = to_sector(bh, mapping->host), - .size = bh->b_size, + .sector = sector, + .size = size, }; void *ret; void *entry = *entryp; @@ -868,7 +868,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (vmf->cow_page) { struct page *new_page = vmf->cow_page; if (buffer_written(&bh)) - error = copy_user_bh(new_page, inode, &bh, vaddr); + error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode), + bh.b_size, new_page, vaddr); else clear_user_highpage(new_page, vaddr); if (error) @@ -898,7 +899,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, /* Filesystem should not return unwritten buffers to us! */ WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); - error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf); + error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode), + bh.b_size, &entry, vma, vmf); unlock_entry: put_locked_mapping_entry(mapping, vmf->pgoff, entry); out: @@ -1034,7 +1036,7 @@ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, if (!write && !buffer_mapped(&bh)) { spinlock_t *ptl; pmd_t entry; - struct page *zero_page = get_huge_zero_page(); + struct page *zero_page = mm_get_huge_zero_page(vma->vm_mm); if (unlikely(!zero_page)) { dax_pmd_dbg(&bh, address, "no zero page"); @@ -1241,3 +1243,229 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) return dax_zero_page_range(inode, from, length, get_block); } EXPORT_SYMBOL_GPL(dax_truncate_page); + +#ifdef CONFIG_FS_IOMAP +static loff_t +iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data, + struct iomap *iomap) +{ + struct iov_iter *iter = data; + loff_t end = pos + length, done = 0; + ssize_t ret = 0; + + if (iov_iter_rw(iter) == READ) { + end = min(end, i_size_read(inode)); + if (pos >= end) + return 0; + + if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) + return iov_iter_zero(min(length, end - pos), iter); + } + + if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED)) + return -EIO; + + while (pos < end) { + unsigned offset = pos & (PAGE_SIZE - 1); + struct blk_dax_ctl dax = { 0 }; + ssize_t map_len; + + dax.sector = iomap->blkno + + (((pos & PAGE_MASK) - iomap->offset) >> 9); + dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK; + map_len = dax_map_atomic(iomap->bdev, &dax); + if (map_len < 0) { + ret = map_len; + break; + } + + dax.addr += offset; + map_len -= offset; + if (map_len > end - pos) + map_len = end - pos; + + if (iov_iter_rw(iter) == WRITE) + map_len = copy_from_iter_pmem(dax.addr, map_len, iter); + else + map_len = copy_to_iter(dax.addr, map_len, iter); + dax_unmap_atomic(iomap->bdev, &dax); + if (map_len <= 0) { + ret = map_len ? map_len : -EFAULT; + break; + } + + pos += map_len; + length -= map_len; + done += map_len; + } + + return done ? done : ret; +} + +/** + * iomap_dax_rw - Perform I/O to a DAX file + * @iocb: The control block for this I/O + * @iter: The addresses to do I/O from or to + * @ops: iomap ops passed from the file system + * + * This function performs read and write operations to directly mapped + * persistent memory. The callers needs to take care of read/write exclusion + * and evicting any page cache pages in the region under I/O. + */ +ssize_t +iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, + struct iomap_ops *ops) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; + loff_t pos = iocb->ki_pos, ret = 0, done = 0; + unsigned flags = 0; + + if (iov_iter_rw(iter) == WRITE) + flags |= IOMAP_WRITE; + + /* + * Yes, even DAX files can have page cache attached to them: A zeroed + * page is inserted into the pagecache when we have to serve a write + * fault on a hole. It should never be dirtied and can simply be + * dropped from the pagecache once we get real data for the page. + * + * XXX: This is racy against mmap, and there's nothing we can do about + * it. We'll eventually need to shift this down even further so that + * we can check if we allocated blocks over a hole first. + */ + if (mapping->nrpages) { + ret = invalidate_inode_pages2_range(mapping, + pos >> PAGE_SHIFT, + (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT); + WARN_ON_ONCE(ret); + } + + while (iov_iter_count(iter)) { + ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops, + iter, iomap_dax_actor); + if (ret <= 0) + break; + pos += ret; + done += ret; + } + + iocb->ki_pos += done; + return done ? done : ret; +} +EXPORT_SYMBOL_GPL(iomap_dax_rw); + +/** + * iomap_dax_fault - handle a page fault on a DAX file + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault + * @ops: iomap ops passed from the file system + * + * When a page fault occurs, filesystems may call this helper in their fault + * or mkwrite handler for DAX files. Assumes the caller has done all the + * necessary locking for the page fault to proceed successfully. + */ +int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + struct iomap_ops *ops) +{ + struct address_space *mapping = vma->vm_file->f_mapping; + struct inode *inode = mapping->host; + unsigned long vaddr = (unsigned long)vmf->virtual_address; + loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; + sector_t sector; + struct iomap iomap = { 0 }; + unsigned flags = 0; + int error, major = 0; + void *entry; + + /* + * Check whether offset isn't beyond end of file now. Caller is supposed + * to hold locks serializing us with truncate / punch hole so this is + * a reliable test. + */ + if (pos >= i_size_read(inode)) + return VM_FAULT_SIGBUS; + + entry = grab_mapping_entry(mapping, vmf->pgoff); + if (IS_ERR(entry)) { + error = PTR_ERR(entry); + goto out; + } + + if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) + flags |= IOMAP_WRITE; + + /* + * Note that we don't bother to use iomap_apply here: DAX required + * the file system block size to be equal the page size, which means + * that we never have to deal with more than a single extent here. + */ + error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); + if (error) + goto unlock_entry; + if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { + error = -EIO; /* fs corruption? */ + goto unlock_entry; + } + + sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9); + + if (vmf->cow_page) { + switch (iomap.type) { + case IOMAP_HOLE: + case IOMAP_UNWRITTEN: + clear_user_highpage(vmf->cow_page, vaddr); + break; + case IOMAP_MAPPED: + error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE, + vmf->cow_page, vaddr); + break; + default: + WARN_ON_ONCE(1); + error = -EIO; + break; + } + + if (error) + goto unlock_entry; + if (!radix_tree_exceptional_entry(entry)) { + vmf->page = entry; + return VM_FAULT_LOCKED; + } + vmf->entry = entry; + return VM_FAULT_DAX_LOCKED; + } + + switch (iomap.type) { + case IOMAP_MAPPED: + if (iomap.flags & IOMAP_F_NEW) { + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + major = VM_FAULT_MAJOR; + } + error = dax_insert_mapping(mapping, iomap.bdev, sector, + PAGE_SIZE, &entry, vma, vmf); + break; + case IOMAP_UNWRITTEN: + case IOMAP_HOLE: + if (!(vmf->flags & FAULT_FLAG_WRITE)) + return dax_load_hole(mapping, entry, vmf); + /*FALLTHRU*/ + default: + WARN_ON_ONCE(1); + error = -EIO; + break; + } + + unlock_entry: + put_locked_mapping_entry(mapping, vmf->pgoff, entry); + out: + if (error == -ENOMEM) + return VM_FAULT_OOM | major; + /* -EBUSY is fine, somebody else faulted on the same PTE */ + if (error < 0 && error != -EBUSY) + return VM_FAULT_SIGBUS | major; + return VM_FAULT_NOPAGE | major; +} +EXPORT_SYMBOL_GPL(iomap_dax_fault); +#endif /* CONFIG_FS_IOMAP */ diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 592059f..354e2ab 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -97,9 +97,6 @@ EXPORT_SYMBOL_GPL(debugfs_use_file_finish); #define F_DENTRY(filp) ((filp)->f_path.dentry) -#define REAL_FOPS_DEREF(dentry) \ - ((const struct file_operations *)(dentry)->d_fsdata) - static int open_proxy_open(struct inode *inode, struct file *filp) { const struct dentry *dentry = F_DENTRY(filp); @@ -112,7 +109,7 @@ static int open_proxy_open(struct inode *inode, struct file *filp) goto out; } - real_fops = REAL_FOPS_DEREF(dentry); + real_fops = debugfs_real_fops(filp); real_fops = fops_get(real_fops); if (!real_fops) { /* Huh? Module did not clean up after itself at exit? */ @@ -143,7 +140,7 @@ static ret_type full_proxy_ ## name(proto) \ { \ const struct dentry *dentry = F_DENTRY(filp); \ const struct file_operations *real_fops = \ - REAL_FOPS_DEREF(dentry); \ + debugfs_real_fops(filp); \ int srcu_idx; \ ret_type r; \ \ @@ -176,7 +173,7 @@ static unsigned int full_proxy_poll(struct file *filp, struct poll_table_struct *wait) { const struct dentry *dentry = F_DENTRY(filp); - const struct file_operations *real_fops = REAL_FOPS_DEREF(dentry); + const struct file_operations *real_fops = debugfs_real_fops(filp); int srcu_idx; unsigned int r = 0; @@ -193,7 +190,7 @@ static unsigned int full_proxy_poll(struct file *filp, static int full_proxy_release(struct inode *inode, struct file *filp) { const struct dentry *dentry = F_DENTRY(filp); - const struct file_operations *real_fops = REAL_FOPS_DEREF(dentry); + const struct file_operations *real_fops = debugfs_real_fops(filp); const struct file_operations *proxy_fops = filp->f_op; int r = 0; @@ -209,7 +206,7 @@ static int full_proxy_release(struct inode *inode, struct file *filp) replace_fops(filp, d_inode(dentry)->i_fop); kfree((void *)proxy_fops); fops_put(real_fops); - return 0; + return r; } static void __full_proxy_fops_init(struct file_operations *proxy_fops, @@ -241,7 +238,7 @@ static int full_proxy_open(struct inode *inode, struct file *filp) goto out; } - real_fops = REAL_FOPS_DEREF(dentry); + real_fops = debugfs_real_fops(filp); real_fops = fops_get(real_fops); if (!real_fops) { /* Huh? Module did not cleanup after itself at exit? */ diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 72361ba..f17fcf8 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -45,7 +45,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb) if (inode) { inode->i_ino = get_next_ino(); inode->i_atime = inode->i_mtime = - inode->i_ctime = current_fs_time(sb); + inode->i_ctime = current_time(inode); } return inode; } @@ -748,7 +748,7 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, old_name = fsnotify_oldname_init(old_dentry->d_name.name); error = simple_rename(d_inode(old_dir), old_dentry, d_inode(new_dir), - dentry); + dentry, 0); if (error) { fsnotify_oldname_free(old_name); goto exit; diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h index bba5263..b3e8443 100644 --- a/fs/debugfs/internal.h +++ b/fs/debugfs/internal.h @@ -19,8 +19,4 @@ extern const struct file_operations debugfs_noop_file_operations; extern const struct file_operations debugfs_open_proxy_file_operations; extern const struct file_operations debugfs_full_proxy_file_operations; -struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode, - struct dentry *parent, void *data, - const struct file_operations *fops); - #endif /* _DEBUGFS_INTERNAL_H_ */ diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 79a5941..108df2e 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -272,13 +272,8 @@ static int mknod_ptmx(struct super_block *sb) struct dentry *root = sb->s_root; struct pts_fs_info *fsi = DEVPTS_SB(sb); struct pts_mount_opts *opts = &fsi->mount_opts; - kuid_t root_uid; - kgid_t root_gid; - - root_uid = make_kuid(current_user_ns(), 0); - root_gid = make_kgid(current_user_ns(), 0); - if (!uid_valid(root_uid) || !gid_valid(root_gid)) - return -EINVAL; + kuid_t ptmx_uid = current_fsuid(); + kgid_t ptmx_gid = current_fsgid(); inode_lock(d_inode(root)); @@ -305,12 +300,12 @@ static int mknod_ptmx(struct super_block *sb) } inode->i_ino = 2; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); mode = S_IFCHR|opts->ptmxmode; init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2)); - inode->i_uid = root_uid; - inode->i_gid = root_gid; + inode->i_uid = ptmx_uid; + inode->i_gid = ptmx_gid; d_add(dentry, inode); @@ -336,7 +331,6 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data) struct pts_fs_info *fsi = DEVPTS_SB(sb); struct pts_mount_opts *opts = &fsi->mount_opts; - sync_filesystem(sb); err = parse_mount_options(data, PARSE_REMOUNT, opts); /* @@ -395,6 +389,7 @@ static int devpts_fill_super(struct super_block *s, void *data, int silent) { struct inode *inode; + int error; s->s_iflags &= ~SB_I_NODEV; s->s_blocksize = 1024; @@ -403,28 +398,42 @@ devpts_fill_super(struct super_block *s, void *data, int silent) s->s_op = &devpts_sops; s->s_time_gran = 1; + error = -ENOMEM; s->s_fs_info = new_pts_fs_info(s); if (!s->s_fs_info) goto fail; + error = parse_mount_options(data, PARSE_MOUNT, &DEVPTS_SB(s)->mount_opts); + if (error) + goto fail; + + error = -ENOMEM; inode = new_inode(s); if (!inode) goto fail; inode->i_ino = 1; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; set_nlink(inode, 2); s->s_root = d_make_root(inode); - if (s->s_root) - return 0; + if (!s->s_root) { + pr_err("get root dentry failed\n"); + goto fail; + } - pr_err("get root dentry failed\n"); + error = mknod_ptmx(s); + if (error) + goto fail_dput; + return 0; +fail_dput: + dput(s->s_root); + s->s_root = NULL; fail: - return -ENOMEM; + return error; } /* @@ -436,43 +445,15 @@ fail: static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - int error; - struct pts_mount_opts opts; - struct super_block *s; - - error = parse_mount_options(data, PARSE_MOUNT, &opts); - if (error) - return ERR_PTR(error); - - s = sget(fs_type, NULL, set_anon_super, flags, NULL); - if (IS_ERR(s)) - return ERR_CAST(s); - - if (!s->s_root) { - error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0); - if (error) - goto out_undo_sget; - s->s_flags |= MS_ACTIVE; - } - - memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts)); - - error = mknod_ptmx(s); - if (error) - goto out_undo_sget; - - return dget(s->s_root); - -out_undo_sget: - deactivate_locked_super(s); - return ERR_PTR(error); + return mount_nodev(fs_type, flags, data, devpts_fill_super); } static void devpts_kill_sb(struct super_block *sb) { struct pts_fs_info *fsi = DEVPTS_SB(sb); - ida_destroy(&fsi->allocated_ptys); + if (fsi) + ida_destroy(&fsi->allocated_ptys); kfree(fsi); kill_litter_super(sb); } @@ -559,7 +540,7 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) inode->i_ino = index + 3; inode->i_uid = opts->setuid ? opts->uid : current_fsuid(); inode->i_gid = opts->setgid ? opts->gid : current_fsgid(); - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); init_special_inode(inode, S_IFCHR|opts->mode, MKDEV(UNIX98_PTY_SLAVE_MAJOR, index)); sprintf(s, "%d", index); diff --git a/fs/direct-io.c b/fs/direct-io.c index 7c3ce73..fb9aa16 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -246,6 +246,9 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) if ((dio->op == REQ_OP_READ) && ((offset + transferred) > dio->i_size)) transferred = dio->i_size - offset; + /* ignore EFAULT if some IO has been done */ + if (unlikely(ret == -EFAULT) && transferred) + ret = 0; } if (ret == 0) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 963016c..609998d 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1656,16 +1656,12 @@ void dlm_lowcomms_stop(void) mutex_lock(&connections_lock); dlm_allow_conn = 0; foreach_conn(stop_conn); + clean_writequeues(); + foreach_conn(free_conn); mutex_unlock(&connections_lock); work_stop(); - mutex_lock(&connections_lock); - clean_writequeues(); - - foreach_conn(free_conn); - - mutex_unlock(&connections_lock); kmem_cache_destroy(con_cache); } diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 4ba1547..599a292 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -715,4 +715,6 @@ int ecryptfs_set_f_namelen(long *namelen, long lower_namelen, int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, loff_t offset); +extern const struct xattr_handler *ecryptfs_xattr_handlers[]; + #endif /* #ifndef ECRYPTFS_KERNEL_H */ diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 9d153b6..cf390dc 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -577,7 +577,8 @@ out: static int ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { int rc; struct dentry *lower_old_dentry; @@ -587,6 +588,9 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct dentry *trap = NULL; struct inode *target_inode; + if (flags) + return -EINVAL; + lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); dget(lower_old_dentry); @@ -927,7 +931,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) } mutex_unlock(&crypt_stat->cs_mutex); - rc = inode_change_ok(inode, ia); + rc = setattr_prepare(dentry, ia); if (rc) goto out; if (ia->ia_valid & ATTR_SIZE) { @@ -1005,15 +1009,14 @@ ecryptfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) { - int rc = 0; + int rc; struct dentry *lower_dentry; lower_dentry = ecryptfs_dentry_to_lower(dentry); - if (!d_inode(lower_dentry)->i_op->setxattr) { + if (!(d_inode(lower_dentry)->i_opflags & IOP_XATTR)) { rc = -EOPNOTSUPP; goto out; } - rc = vfs_setxattr(lower_dentry, name, value, size, flags); if (!rc && inode) fsstack_copy_attr_all(inode, d_inode(lower_dentry)); @@ -1025,15 +1028,14 @@ ssize_t ecryptfs_getxattr_lower(struct dentry *lower_dentry, struct inode *lower_inode, const char *name, void *value, size_t size) { - int rc = 0; + int rc; - if (!lower_inode->i_op->getxattr) { + if (!(lower_inode->i_opflags & IOP_XATTR)) { rc = -EOPNOTSUPP; goto out; } inode_lock(lower_inode); - rc = lower_inode->i_op->getxattr(lower_dentry, lower_inode, - name, value, size); + rc = __vfs_getxattr(lower_dentry, lower_inode, name, value, size); inode_unlock(lower_inode); out: return rc; @@ -1066,19 +1068,22 @@ out: return rc; } -static int ecryptfs_removexattr(struct dentry *dentry, const char *name) +static int ecryptfs_removexattr(struct dentry *dentry, struct inode *inode, + const char *name) { - int rc = 0; + int rc; struct dentry *lower_dentry; + struct inode *lower_inode; lower_dentry = ecryptfs_dentry_to_lower(dentry); - if (!d_inode(lower_dentry)->i_op->removexattr) { + lower_inode = ecryptfs_inode_to_lower(inode); + if (!(lower_inode->i_opflags & IOP_XATTR)) { rc = -EOPNOTSUPP; goto out; } - inode_lock(d_inode(lower_dentry)); - rc = d_inode(lower_dentry)->i_op->removexattr(lower_dentry, name); - inode_unlock(d_inode(lower_dentry)); + inode_lock(lower_inode); + rc = __vfs_removexattr(lower_dentry, name); + inode_unlock(lower_inode); out: return rc; } @@ -1089,10 +1094,7 @@ const struct inode_operations ecryptfs_symlink_iops = { .permission = ecryptfs_permission, .setattr = ecryptfs_setattr, .getattr = ecryptfs_getattr_link, - .setxattr = ecryptfs_setxattr, - .getxattr = ecryptfs_getxattr, .listxattr = ecryptfs_listxattr, - .removexattr = ecryptfs_removexattr }; const struct inode_operations ecryptfs_dir_iops = { @@ -1107,18 +1109,43 @@ const struct inode_operations ecryptfs_dir_iops = { .rename = ecryptfs_rename, .permission = ecryptfs_permission, .setattr = ecryptfs_setattr, - .setxattr = ecryptfs_setxattr, - .getxattr = ecryptfs_getxattr, .listxattr = ecryptfs_listxattr, - .removexattr = ecryptfs_removexattr }; const struct inode_operations ecryptfs_main_iops = { .permission = ecryptfs_permission, .setattr = ecryptfs_setattr, .getattr = ecryptfs_getattr, - .setxattr = ecryptfs_setxattr, - .getxattr = ecryptfs_getxattr, .listxattr = ecryptfs_listxattr, - .removexattr = ecryptfs_removexattr +}; + +static int ecryptfs_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + return ecryptfs_getxattr(dentry, inode, name, buffer, size); +} + +static int ecryptfs_xattr_set(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, + int flags) +{ + if (value) + return ecryptfs_setxattr(dentry, inode, name, value, size, flags); + else { + BUG_ON(flags != XATTR_REPLACE); + return ecryptfs_removexattr(dentry, inode, name); + } +} + +const struct xattr_handler ecryptfs_xattr_handler = { + .prefix = "", /* match anything */ + .get = ecryptfs_xattr_get, + .set = ecryptfs_xattr_set, +}; + +const struct xattr_handler *ecryptfs_xattr_handlers[] = { + &ecryptfs_xattr_handler, + NULL }; diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 6120044..151872d 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -529,6 +529,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags /* ->kill_sb() will take care of sbi after that point */ sbi = NULL; s->s_op = &ecryptfs_sops; + s->s_xattr = ecryptfs_xattr_handlers; s->s_d_op = &ecryptfs_dops; err = "Reading sb failed"; diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 9c3437c..1f0c471 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -32,6 +32,7 @@ #include <linux/file.h> #include <linux/scatterlist.h> #include <linux/slab.h> +#include <linux/xattr.h> #include <asm/unaligned.h> #include "ecryptfs_kernel.h" @@ -422,7 +423,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) struct inode *lower_inode = d_inode(lower_dentry); int rc; - if (!lower_inode->i_op->getxattr || !lower_inode->i_op->setxattr) { + if (!(lower_inode->i_opflags & IOP_XATTR)) { printk(KERN_WARNING "No support for setting xattr in lower filesystem\n"); rc = -ENOSYS; @@ -436,15 +437,13 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) goto out; } inode_lock(lower_inode); - size = lower_inode->i_op->getxattr(lower_dentry, lower_inode, - ECRYPTFS_XATTR_NAME, - xattr_virt, PAGE_SIZE); + size = __vfs_getxattr(lower_dentry, lower_inode, ECRYPTFS_XATTR_NAME, + xattr_virt, PAGE_SIZE); if (size < 0) size = 8; put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt); - rc = lower_inode->i_op->setxattr(lower_dentry, lower_inode, - ECRYPTFS_XATTR_NAME, - xattr_virt, size, 0); + rc = __vfs_setxattr(lower_dentry, lower_inode, ECRYPTFS_XATTR_NAME, + xattr_virt, size, 0); inode_unlock(lower_inode); if (rc) printk(KERN_ERR "Error whilst attempting to write inode size " diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 1d73fc6..71fcccc 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -24,7 +24,7 @@ struct inode *efivarfs_get_inode(struct super_block *sb, if (inode) { inode->i_ino = get_next_ino(); inode->i_mode = mode; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inode->i_flags = is_removable ? 0 : S_IMMUTABLE; switch (mode & S_IFMT) { case S_IFREG: @@ -105,7 +105,10 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry, inode->i_private = var; - efivar_entry_add(var, &efivarfs_list); + err = efivar_entry_add(var, &efivarfs_list); + if (err) + goto out; + d_instantiate(dentry, inode); dget(dentry); out: diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 688ccc1..d7a7c53 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -157,12 +157,14 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, goto fail_inode; } + efivar_entry_size(entry, &size); + err = efivar_entry_add(entry, &efivarfs_list); + if (err) + goto fail_inode; + /* copied by the above to local storage in the dentry. */ kfree(name); - efivar_entry_size(entry, &size); - efivar_entry_add(entry, &efivarfs_list); - inode_lock(inode); inode->i_private = entry; i_size_write(inode, size + sizeof(entry->var.Attributes)); @@ -182,7 +184,10 @@ fail: static int efivarfs_destroy(struct efivar_entry *entry, void *data) { - efivar_entry_remove(entry); + int err = efivar_entry_remove(entry); + + if (err) + return err; kfree(entry); return 0; } @@ -191,6 +191,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, { struct page *page; int ret; + unsigned int gup_flags = FOLL_FORCE; #ifdef CONFIG_STACK_GROWSUP if (write) { @@ -199,12 +200,16 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, return NULL; } #endif + + if (write) + gup_flags |= FOLL_WRITE; + /* * We are doing an exec(). 'current' is the process * doing the exec and bprm->mm is the new process's mm. */ - ret = get_user_pages_remote(current, bprm->mm, pos, 1, write, - 1, &page, NULL); + ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags, + &page, NULL); if (ret <= 0) return NULL; diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index f69a1b5..42f9a0a 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -137,7 +137,7 @@ Espan: bad_entry: EXOFS_ERR( "ERROR [exofs_check_page]: bad entry in directory(0x%lx): %s - " - "offset=%lu, inode=0x%llu, rec_len=%d, name_len=%d\n", + "offset=%lu, inode=0x%llx, rec_len=%d, name_len=%d\n", dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs, _LLU(le64_to_cpu(p->inode_no)), rec_len, p->name_len); @@ -416,7 +416,7 @@ int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de, if (likely(!err)) err = exofs_commit_chunk(page, pos, len); exofs_put_page(page); - dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->i_mtime = dir->i_ctime = current_time(dir); mark_inode_dirty(dir); return err; } @@ -503,7 +503,7 @@ got_it: de->inode_no = cpu_to_le64(inode->i_ino); exofs_set_de_type(de, inode); err = exofs_commit_chunk(page, pos, rec_len); - dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->i_mtime = dir->i_ctime = current_time(dir); mark_inode_dirty(dir); sbi->s_numfiles++; @@ -554,7 +554,7 @@ int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page) dir->inode_no = 0; if (likely(!err)) err = exofs_commit_chunk(page, pos, to - from); - inode->i_ctime = inode->i_mtime = CURRENT_TIME; + inode->i_ctime = inode->i_mtime = current_time(inode); mark_inode_dirty(inode); sbi->s_numfiles--; out: diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 9dc4c6d..d8072bc 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -778,7 +778,7 @@ try_again: fail: EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n", inode->i_ino, page->index, ret); - set_bit(AS_EIO, &page->mapping->flags); + mapping_set_error(page->mapping, -EIO); unlock_page(page); return ret; } @@ -1007,7 +1007,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize) struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; int ret; - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize); if (likely(!ret)) @@ -1034,7 +1034,7 @@ int exofs_setattr(struct dentry *dentry, struct iattr *iattr) if (unlikely(error)) return error; - error = inode_change_ok(inode, iattr); + error = setattr_prepare(dentry, iattr); if (unlikely(error)) return error; @@ -1313,7 +1313,7 @@ struct inode *exofs_new_inode(struct inode *dir, umode_t mode) inode_init_owner(inode, dir, mode); inode->i_ino = sbi->s_nextid++; inode->i_blkbits = EXOFS_BLKSHIFT; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); oi->i_commit_size = inode->i_size = 0; spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index 622a686..7295cd7 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -142,7 +142,7 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir, { struct inode *inode = d_inode(old_dentry); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); inode_inc_link_count(inode); ihold(inode); @@ -227,7 +227,8 @@ static int exofs_rmdir(struct inode *dir, struct dentry *dentry) } static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); @@ -237,6 +238,9 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry, struct exofs_dir_entry *old_de; int err = -ENOENT; + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + old_de = exofs_find_entry(old_dir, old_dentry, &old_page); if (!old_de) goto out; @@ -261,7 +265,7 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!new_de) goto out_dir; err = exofs_set_link(new_dir, new_de, new_page, old_inode); - new_inode->i_ctime = CURRENT_TIME; + new_inode->i_ctime = current_time(new_inode); if (dir_de) drop_nlink(new_inode); inode_dec_link_count(new_inode); @@ -275,7 +279,7 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry, inode_inc_link_count(new_dir); } - old_inode->i_ctime = CURRENT_TIME; + old_inode->i_ctime = current_time(old_inode); exofs_delete_entry(old_de, old_page); mark_inode_dirty(old_inode); @@ -310,7 +314,7 @@ const struct inode_operations exofs_dir_inode_operations = { .mkdir = exofs_mkdir, .rmdir = exofs_rmdir, .mknod = exofs_mknod, - .rename = exofs_rename, + .rename = exofs_rename, .setattr = exofs_setattr, }; diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 207ba8d..a4b531b 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -428,10 +428,10 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, if (!nop || !nop->fh_to_dentry) return ERR_PTR(-ESTALE); result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type); - if (!result) - result = ERR_PTR(-ESTALE); - if (IS_ERR(result)) - return result; + if (PTR_ERR(result) == -ENOMEM) + return ERR_CAST(result); + if (IS_ERR_OR_NULL(result)) + return ERR_PTR(-ESTALE); if (d_is_dir(result)) { /* @@ -541,6 +541,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, err_result: dput(result); + if (err != -ENOMEM) + err = -ESTALE; return ERR_PTR(err); } EXPORT_SYMBOL_GPL(exportfs_decode_fh); diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig index c634874e..36bea5a 100644 --- a/fs/ext2/Kconfig +++ b/fs/ext2/Kconfig @@ -1,5 +1,6 @@ config EXT2_FS tristate "Second extended fs support" + select FS_IOMAP if FS_DAX help Ext2 is a standard Linux file system for hard disks. diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 42f1d18..79dafa7 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -190,15 +190,11 @@ ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) case ACL_TYPE_ACCESS: name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; - else { - inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); - if (error == 0) - acl = NULL; - } + inode->i_ctime = current_time(inode); + mark_inode_dirty(inode); } break; diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 61ad490..d9650c9 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -471,7 +471,7 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, err = ext2_commit_chunk(page, pos, len); ext2_put_page(page); if (update_times) - dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + dir->i_mtime = dir->i_ctime = current_time(dir); EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(dir); } @@ -561,7 +561,7 @@ got_it: de->inode = cpu_to_le32(inode->i_ino); ext2_set_de_type (de, inode); err = ext2_commit_chunk(page, pos, rec_len); - dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + dir->i_mtime = dir->i_ctime = current_time(dir); EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(dir); /* OFFSET_CACHE */ @@ -610,7 +610,7 @@ int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page * page ) pde->rec_len = ext2_rec_len_to_disk(to - from); dir->inode = 0; err = ext2_commit_chunk(page, pos, to - from); - inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + inode->i_ctime = inode->i_mtime = current_time(inode); EXT2_I(inode)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(inode); out: diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 06af2f9..37e2be7 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -814,6 +814,7 @@ extern const struct file_operations ext2_file_operations; /* inode.c */ extern const struct address_space_operations ext2_aops; extern const struct address_space_operations ext2_nobh_aops; +extern struct iomap_ops ext2_iomap_ops; /* namei.c */ extern const struct inode_operations ext2_dir_inode_operations; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 5efeefe..a0e1478 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -22,11 +22,59 @@ #include <linux/pagemap.h> #include <linux/dax.h> #include <linux/quotaops.h> +#include <linux/iomap.h> +#include <linux/uio.h> #include "ext2.h" #include "xattr.h" #include "acl.h" #ifdef CONFIG_FS_DAX +static ssize_t ext2_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = iocb->ki_filp->f_mapping->host; + ssize_t ret; + + if (!iov_iter_count(to)) + return 0; /* skip atime */ + + inode_lock_shared(inode); + ret = iomap_dax_rw(iocb, to, &ext2_iomap_ops); + inode_unlock_shared(inode); + + file_accessed(iocb->ki_filp); + return ret; +} + +static ssize_t ext2_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + + inode_lock(inode); + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto out_unlock; + ret = file_remove_privs(file); + if (ret) + goto out_unlock; + ret = file_update_time(file); + if (ret) + goto out_unlock; + + ret = iomap_dax_rw(iocb, from, &ext2_iomap_ops); + if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { + i_size_write(inode, iocb->ki_pos); + mark_inode_dirty(inode); + } + +out_unlock: + inode_unlock(inode); + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; +} + /* * The lock ordering for ext2 DAX fault paths is: * @@ -51,7 +99,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } down_read(&ei->dax_sem); - ret = dax_fault(vma, vmf, ext2_get_block); + ret = iomap_dax_fault(vma, vmf, &ext2_iomap_ops); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) @@ -156,14 +204,28 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync) return ret; } -/* - * We have mostly NULL's here: the current defaults are ok for - * the ext2 filesystem. - */ +static ssize_t ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ +#ifdef CONFIG_FS_DAX + if (IS_DAX(iocb->ki_filp->f_mapping->host)) + return ext2_dax_read_iter(iocb, to); +#endif + return generic_file_read_iter(iocb, to); +} + +static ssize_t ext2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ +#ifdef CONFIG_FS_DAX + if (IS_DAX(iocb->ki_filp->f_mapping->host)) + return ext2_dax_write_iter(iocb, from); +#endif + return generic_file_write_iter(iocb, from); +} + const struct file_operations ext2_file_operations = { .llseek = generic_file_llseek, - .read_iter = generic_file_read_iter, - .write_iter = generic_file_write_iter, + .read_iter = ext2_file_read_iter, + .write_iter = ext2_file_write_iter, .unlocked_ioctl = ext2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, @@ -172,16 +234,14 @@ const struct file_operations ext2_file_operations = { .open = dquot_file_open, .release = ext2_release_file, .fsync = ext2_fsync, + .get_unmapped_area = thp_get_unmapped_area, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, }; const struct inode_operations ext2_file_inode_operations = { #ifdef CONFIG_EXT2_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = ext2_listxattr, - .removexattr = generic_removexattr, #endif .setattr = ext2_setattr, .get_acl = ext2_get_acl, diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index efe5fb2..395fc074c 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -465,6 +465,11 @@ struct inode *ext2_new_inode(struct inode *dir, umode_t mode, for (i = 0; i < sbi->s_groups_count; i++) { gdp = ext2_get_group_desc(sb, group, &bh2); + if (!gdp) { + if (++group == sbi->s_groups_count) + group = 0; + continue; + } brelse(bitmap_bh); bitmap_bh = read_inode_bitmap(sb, group); if (!bitmap_bh) { @@ -551,7 +556,7 @@ got: inode->i_ino = ino; inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); memset(ei->i_data, 0, sizeof(ei->i_data)); ei->i_flags = ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index d5c7d09..41b8b44 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -32,6 +32,7 @@ #include <linux/buffer_head.h> #include <linux/mpage.h> #include <linux/fiemap.h> +#include <linux/iomap.h> #include <linux/namei.h> #include <linux/uio.h> #include "ext2.h" @@ -594,7 +595,7 @@ static void ext2_splice_branch(struct inode *inode, if (where->bh) mark_buffer_dirty_inode(where->bh, inode); - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); mark_inode_dirty(inode); } @@ -618,10 +619,10 @@ static void ext2_splice_branch(struct inode *inode, */ static int ext2_get_blocks(struct inode *inode, sector_t iblock, unsigned long maxblocks, - struct buffer_head *bh_result, + u32 *bno, bool *new, bool *boundary, int create) { - int err = -EIO; + int err; int offsets[4]; Indirect chain[4]; Indirect *partial; @@ -638,13 +639,12 @@ static int ext2_get_blocks(struct inode *inode, depth = ext2_block_to_path(inode,iblock,offsets,&blocks_to_boundary); if (depth == 0) - return (err); + return -EIO; partial = ext2_get_branch(inode, depth, offsets, chain, &err); /* Simplest case - block found, no allocation needed */ if (!partial) { first_block = le32_to_cpu(chain[depth - 1].key); - clear_buffer_new(bh_result); /* What's this do? */ count++; /*map more blocks*/ while (count < maxblocks && count <= blocks_to_boundary) { @@ -699,7 +699,6 @@ static int ext2_get_blocks(struct inode *inode, mutex_unlock(&ei->truncate_mutex); if (err) goto cleanup; - clear_buffer_new(bh_result); goto got_it; } } @@ -733,6 +732,16 @@ static int ext2_get_blocks(struct inode *inode, } if (IS_DAX(inode)) { + int i; + + /* + * We must unmap blocks before zeroing so that writeback cannot + * overwrite zeros with stale data from block device page cache. + */ + for (i = 0; i < count; i++) { + unmap_underlying_metadata(inode->i_sb->s_bdev, + le32_to_cpu(chain[depth-1].key) + i); + } /* * block must be initialised before we put it in the tree * so that it's not found by another thread before it's @@ -745,15 +754,15 @@ static int ext2_get_blocks(struct inode *inode, mutex_unlock(&ei->truncate_mutex); goto cleanup; } - } else - set_buffer_new(bh_result); + } else { + *new = true; + } ext2_splice_branch(inode, iblock, partial, indirect_blks, count); mutex_unlock(&ei->truncate_mutex); got_it: - map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); if (count > blocks_to_boundary) - set_buffer_boundary(bh_result); + *boundary = true; err = count; /* Clean up and exit */ partial = chain + depth - 1; /* the whole chain */ @@ -762,22 +771,87 @@ cleanup: brelse(partial->bh); partial--; } + if (err > 0) + *bno = le32_to_cpu(chain[depth-1].key); return err; } -int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) +int ext2_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) { unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; - int ret = ext2_get_blocks(inode, iblock, max_blocks, - bh_result, create); - if (ret > 0) { - bh_result->b_size = (ret << inode->i_blkbits); - ret = 0; + bool new = false, boundary = false; + u32 bno; + int ret; + + ret = ext2_get_blocks(inode, iblock, max_blocks, &bno, &new, &boundary, + create); + if (ret <= 0) + return ret; + + map_bh(bh_result, inode->i_sb, bno); + bh_result->b_size = (ret << inode->i_blkbits); + if (new) + set_buffer_new(bh_result); + if (boundary) + set_buffer_boundary(bh_result); + return 0; + +} + +#ifdef CONFIG_FS_DAX +static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned flags, struct iomap *iomap) +{ + unsigned int blkbits = inode->i_blkbits; + unsigned long first_block = offset >> blkbits; + unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits; + bool new = false, boundary = false; + u32 bno; + int ret; + + ret = ext2_get_blocks(inode, first_block, max_blocks, + &bno, &new, &boundary, flags & IOMAP_WRITE); + if (ret < 0) + return ret; + + iomap->flags = 0; + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = (u64)first_block << blkbits; + + if (ret == 0) { + iomap->type = IOMAP_HOLE; + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->length = 1 << blkbits; + } else { + iomap->type = IOMAP_MAPPED; + iomap->blkno = (sector_t)bno << (blkbits - 9); + iomap->length = (u64)ret << blkbits; + iomap->flags |= IOMAP_F_MERGED; } - return ret; + if (new) + iomap->flags |= IOMAP_F_NEW; + return 0; } +static int +ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length, + ssize_t written, unsigned flags, struct iomap *iomap) +{ + if (iomap->type == IOMAP_MAPPED && + written < length && + (flags & IOMAP_WRITE)) + ext2_write_failed(inode->i_mapping, offset + length); + return 0; +} + +struct iomap_ops ext2_iomap_ops = { + .iomap_begin = ext2_iomap_begin, + .iomap_end = ext2_iomap_end, +}; +#endif /* CONFIG_FS_DAX */ + int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { @@ -863,11 +937,10 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) loff_t offset = iocb->ki_pos; ssize_t ret; - if (IS_DAX(inode)) - ret = dax_do_io(iocb, inode, iter, ext2_get_block, NULL, - DIO_LOCKING); - else - ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block); + if (WARN_ON_ONCE(IS_DAX(inode))) + return -EIO; + + ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block); if (ret < 0 && iov_iter_rw(iter) == WRITE) ext2_write_failed(mapping, offset + count); return ret; @@ -1236,7 +1309,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) __ext2_truncate_blocks(inode, newsize); dax_sem_up_write(EXT2_I(inode)); - inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_mtime = inode->i_ctime = current_time(inode); if (inode_needs_sync(inode)) { sync_mapping_buffers(inode->i_mapping); sync_inode_metadata(inode, 1); @@ -1580,7 +1653,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr) struct inode *inode = d_inode(dentry); int error; - error = inode_change_ok(inode, iattr); + error = setattr_prepare(dentry, iattr); if (error) return error; diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index b386af2..9d61742 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -79,7 +79,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) ei->i_flags = flags; ext2_set_inode_flags(inode); - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); inode_unlock(inode); mark_inode_dirty(inode); @@ -103,7 +103,7 @@ setflags_out: } inode_lock(inode); - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); inode->i_generation = generation; inode_unlock(inode); diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index d446203..814e405 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -221,7 +221,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, if (err) return err; - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); inode_inc_link_count(inode); ihold(inode); @@ -328,7 +328,8 @@ static int ext2_rmdir (struct inode * dir, struct dentry *dentry) } static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, - struct inode * new_dir, struct dentry * new_dentry ) + struct inode * new_dir, struct dentry * new_dentry, + unsigned int flags) { struct inode * old_inode = d_inode(old_dentry); struct inode * new_inode = d_inode(new_dentry); @@ -338,6 +339,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, struct ext2_dir_entry_2 * old_de; int err; + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + err = dquot_initialize(old_dir); if (err) goto out; @@ -372,7 +376,7 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, if (!new_de) goto out_dir; ext2_set_link(new_dir, new_de, new_page, old_inode, 1); - new_inode->i_ctime = CURRENT_TIME_SEC; + new_inode->i_ctime = current_time(new_inode); if (dir_de) drop_nlink(new_inode); inode_dec_link_count(new_inode); @@ -388,7 +392,7 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old_inode->i_ctime = CURRENT_TIME_SEC; + old_inode->i_ctime = current_time(old_inode); mark_inode_dirty(old_inode); ext2_delete_entry (old_de, old_page); @@ -428,10 +432,7 @@ const struct inode_operations ext2_dir_inode_operations = { .mknod = ext2_mknod, .rename = ext2_rename, #ifdef CONFIG_EXT2_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = ext2_listxattr, - .removexattr = generic_removexattr, #endif .setattr = ext2_setattr, .get_acl = ext2_get_acl, @@ -441,10 +442,7 @@ const struct inode_operations ext2_dir_inode_operations = { const struct inode_operations ext2_special_inode_operations = { #ifdef CONFIG_EXT2_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = ext2_listxattr, - .removexattr = generic_removexattr, #endif .setattr = ext2_setattr, .get_acl = ext2_get_acl, diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 1d93795..6cb042b 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1543,7 +1543,7 @@ out: if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); inode->i_version++; - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); mark_inode_dirty(inode); return len - towrite; } diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c index 3495d8a..8437b19 100644 --- a/fs/ext2/symlink.c +++ b/fs/ext2/symlink.c @@ -25,10 +25,7 @@ const struct inode_operations ext2_symlink_inode_operations = { .get_link = page_get_link, .setattr = ext2_setattr, #ifdef CONFIG_EXT2_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = ext2_listxattr, - .removexattr = generic_removexattr, #endif }; @@ -37,9 +34,6 @@ const struct inode_operations ext2_fast_symlink_inode_operations = { .get_link = simple_get_link, .setattr = ext2_setattr, #ifdef CONFIG_EXT2_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = ext2_listxattr, - .removexattr = generic_removexattr, #endif }; diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index b7f896f..fbdb8f1 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -691,7 +691,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, /* Update the inode. */ EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); if (IS_SYNC(inode)) { error = sync_inode_metadata(inode, 1); /* In case sync failed due to ENOSPC the inode was actually diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index c6601a4..dfa5199 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -193,15 +193,11 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; - else { - inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - if (error == 0) - acl = NULL; - } + inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); } break; diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 02ddec6..fdb1954 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -128,12 +128,12 @@ static void debug_print_tree(struct ext4_sb_info *sbi) node = rb_first(&sbi->system_blks); while (node) { entry = rb_entry(node, struct ext4_system_zone, node); - printk("%s%llu-%llu", first ? "" : ", ", + printk(KERN_CONT "%s%llu-%llu", first ? "" : ", ", entry->start_blk, entry->start_blk + entry->count - 1); first = 0; node = rb_next(node); } - printk("\n"); + printk(KERN_CONT "\n"); } int ext4_setup_system_zone(struct super_block *sb) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 67415e0..e8b3650 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -260,11 +260,12 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) /* Directory is encrypted */ err = fscrypt_fname_disk_to_usr(inode, 0, 0, &de_name, &fstr); + de_name = fstr; fstr.len = save_len; - if (err < 0) + if (err) goto errout; if (!dir_emit(ctx, - fstr.name, err, + de_name.name, de_name.len, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto done; @@ -627,7 +628,7 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size) { struct ext4_dir_entry_2 *de; - int nlen, rlen; + int rlen; unsigned int offset = 0; char *top; @@ -637,7 +638,6 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, if (ext4_check_dir_entry(dir, NULL, de, bh, buf, buf_size, offset)) return -EFSCORRUPTED; - nlen = EXT4_DIR_REC_LEN(de->name_len); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); de = (struct ext4_dir_entry_2 *)((char *)de + rlen); offset += rlen; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ea31931..282a51b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -262,6 +262,9 @@ struct ext4_io_submit { (s)->s_first_ino) #endif #define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) +#define EXT4_MAX_BLOCKS(size, offset, blkbits) \ + ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \ + blkbits)) /* Translate a block number to a cluster number */ #define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) @@ -1117,9 +1120,15 @@ struct ext4_inode_info { #define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ #define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ #define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ -#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ -#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ -#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +#define EXT4_MOUNT_QUOTA 0x40000 /* Some quota option set */ +#define EXT4_MOUNT_USRQUOTA 0x80000 /* "old" user quota, + * enable enforcement for hidden + * quota files */ +#define EXT4_MOUNT_GRPQUOTA 0x100000 /* "old" group quota, enable + * enforcement for hidden quota + * files */ +#define EXT4_MOUNT_PRJQUOTA 0x200000 /* Enable project quota + * enforcement */ #define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ @@ -1636,26 +1645,6 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) * Feature set definitions */ -/* Use the ext4_{has,set,clear}_feature_* helpers; these will be removed */ -#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ - ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0) -#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ - ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0) -#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ - ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0) -#define EXT4_SET_COMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) -#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) -#define EXT4_SET_INCOMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) -#define EXT4_CLEAR_COMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) -#define EXT4_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) -#define EXT4_CLEAR_INCOMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) - #define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 #define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 #define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d7ccb7f..c930a01 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4679,6 +4679,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, unsigned int credits; loff_t epos; + BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)); map.m_lblk = offset; map.m_len = len; /* @@ -4693,13 +4694,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, * credits to insert 1 extent into extent tree */ credits = ext4_chunk_trans_blocks(inode, len); - /* - * We can only call ext_depth() on extent based inodes - */ - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - depth = ext_depth(inode); - else - depth = -1; + depth = ext_depth(inode); retry: while (ret >= 0 && len) { @@ -4966,13 +4961,8 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) trace_ext4_fallocate_enter(inode, offset, len, mode); lblk = offset >> blkbits; - /* - * We can't just convert len to max_blocks because - * If blocksize = 4096 offset = 3072 and len = 2048 - */ - max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - - lblk; + max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; @@ -5035,12 +5025,8 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, unsigned int credits, blkbits = inode->i_blkbits; map.m_lblk = offset >> blkbits; - /* - * We can't just convert len to max_blocks because - * If blocksize = 4096 offset = 3072 and len = 2048 - */ - max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - - map.m_lblk); + max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); + /* * This is somewhat ugly but the idea is clear: When transaction is * reserved, everything goes into it. Otherwise we rather start several @@ -5734,6 +5720,9 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } + } else { + ext4_ext_drop_refs(path); + kfree(path); } ret = ext4_es_remove_extent(inode, offset_lblk, diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 261ac37..2a822d3 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -91,9 +91,7 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; struct inode *inode = file_inode(iocb->ki_filp); - struct blk_plug plug; int o_direct = iocb->ki_flags & IOCB_DIRECT; int unaligned_aio = 0; int overwrite = 0; @@ -134,18 +132,16 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (o_direct) { size_t length = iov_iter_count(from); loff_t pos = iocb->ki_pos; - blk_start_plug(&plug); /* check whether we do a DIO overwrite or not */ if (ext4_should_dioread_nolock(inode) && !unaligned_aio && - !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) { + pos + length <= i_size_read(inode)) { struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; int err, len; map.m_lblk = pos >> blkbits; - map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits) - - map.m_lblk; + map.m_len = EXT4_MAX_BLOCKS(length, pos, blkbits); len = map.m_len; err = ext4_map_blocks(NULL, inode, &map, 0); @@ -171,8 +167,6 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret > 0) ret = generic_write_sync(iocb, ret); - if (o_direct) - blk_finish_plug(&plug); return ret; @@ -703,6 +697,7 @@ const struct file_operations ext4_file_operations = { .open = ext4_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, + .get_unmapped_area = thp_get_unmapped_area, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, @@ -711,10 +706,7 @@ const struct file_operations ext4_file_operations = { const struct inode_operations ext4_file_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = ext4_listxattr, - .removexattr = generic_removexattr, .get_acl = ext4_get_acl, .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 5c43725..88effb1 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -61,6 +61,13 @@ static int ext4_sync_parent(struct inode *inode) break; iput(inode); inode = next; + /* + * The directory inode may have gone through rmdir by now. But + * the inode itself and its blocks are still allocated (we hold + * a reference to the inode so it didn't go through + * ext4_evict_inode()) and so we are safe to flush metadata + * blocks and the inode. + */ ret = sync_mapping_buffers(inode->i_mapping); if (ret) break; @@ -107,7 +114,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (!journal) { ret = __generic_file_fsync(file, start, end, datasync); - if (!ret && !hlist_empty(&inode->i_dentry)) + if (!ret) ret = ext4_sync_parent(inode); if (test_opt(inode->i_sb, BARRIER)) goto issue_flush; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 9e66cd1..170421e 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -802,7 +802,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, } else inode_init_owner(inode, dir, mode); - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) && + if (ext4_has_feature_project(sb) && ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) ei->i_projid = EXT4_I(dir)->i_projid; else diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c6ea25a..9c06472 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -647,11 +647,19 @@ found: /* * We have to zeroout blocks before inserting them into extent * status tree. Otherwise someone could look them up there and - * use them before they are really zeroed. + * use them before they are really zeroed. We also have to + * unmap metadata before zeroing as otherwise writeback can + * overwrite zeros with stale data from block device. */ if (flags & EXT4_GET_BLOCKS_ZERO && map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { + ext4_lblk_t i; + + for (i = 0; i < map->m_len; i++) { + unmap_underlying_metadata(inode->i_sb->s_bdev, + map->m_pblk + i); + } ret = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, map->m_len); if (ret) { @@ -1649,6 +1657,8 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); if (invalidate) { + if (page_mapped(page)) + clear_page_dirty_for_io(page); block_invalidatepage(page, 0, PAGE_SIZE); ClearPageUptodate(page); } @@ -3526,35 +3536,31 @@ out: static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter) { - int unlocked = 0; - struct inode *inode = iocb->ki_filp->f_mapping->host; + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; ssize_t ret; - if (ext4_should_dioread_nolock(inode)) { - /* - * Nolock dioread optimization may be dynamically disabled - * via ext4_inode_block_unlocked_dio(). Check inode's state - * while holding extra i_dio_count ref. - */ - inode_dio_begin(inode); - smp_mb(); - if (unlikely(ext4_test_inode_state(inode, - EXT4_STATE_DIOREAD_LOCK))) - inode_dio_end(inode); - else - unlocked = 1; - } + /* + * Shared inode_lock is enough for us - it protects against concurrent + * writes & truncates and since we take care of writing back page cache, + * we are protected against page writeback as well. + */ + inode_lock_shared(inode); if (IS_DAX(inode)) { - ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block, - NULL, unlocked ? 0 : DIO_LOCKING); + ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block, NULL, 0); } else { + size_t count = iov_iter_count(iter); + + ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, + iocb->ki_pos + count); + if (ret) + goto out_unlock; ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, ext4_dio_get_block, - NULL, NULL, - unlocked ? 0 : DIO_LOCKING); + NULL, NULL, 0); } - if (unlocked) - inode_dio_end(inode); +out_unlock: + inode_unlock_shared(inode); return ret; } @@ -3890,7 +3896,7 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, } /* - * ext4_punch_hole: punches a hole in a file by releaseing the blocks + * ext4_punch_hole: punches a hole in a file by releasing the blocks * associated with the given offset and length * * @inode: File inode @@ -3919,7 +3925,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) * Write out all dirty pages to avoid race conditions * Then release them. */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { ret = filemap_write_and_wait_range(mapping, offset, offset + length - 1); if (ret) @@ -4414,7 +4420,7 @@ static inline void ext4_iget_extra_inode(struct inode *inode, int ext4_get_projid(struct inode *inode, kprojid_t *projid) { - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_PROJECT)) + if (!ext4_has_feature_project(inode->i_sb)) return -EOPNOTSUPP; *projid = EXT4_I(inode)->i_projid; return 0; @@ -4481,7 +4487,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_mode = le16_to_cpu(raw_inode->i_mode); i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) && + if (ext4_has_feature_project(sb) && EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid); @@ -4814,14 +4820,14 @@ static int ext4_do_update_inode(handle_t *handle, * Fix up interoperability with old kernels. Otherwise, old inodes get * re-used with the upper 16 bits of the uid/gid intact */ - if (!ei->i_dtime) { + if (ei->i_dtime && list_empty(&ei->i_orphan)) { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } else { raw_inode->i_uid_high = cpu_to_le16(high_16_bits(i_uid)); raw_inode->i_gid_high = cpu_to_le16(high_16_bits(i_gid)); - } else { - raw_inode->i_uid_high = 0; - raw_inode->i_gid_high = 0; } } else { raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); @@ -4885,8 +4891,7 @@ static int ext4_do_update_inode(handle_t *handle, } } - BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_PROJECT) && + BUG_ON(!ext4_has_feature_project(inode->i_sb) && i_projid != EXT4_DEF_PROJID); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && @@ -5073,7 +5078,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) int orphan = 0; const unsigned int ia_valid = attr->ia_valid; - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); if (error) return error; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 1bb7df5..bf5ae8e 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -19,8 +19,6 @@ #include "ext4_jbd2.h" #include "ext4.h" -#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) - /** * Swap memory between @a and @b for @len bytes. * @@ -310,8 +308,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) struct ext4_inode *raw_inode; struct dquot *transfer_to[MAXQUOTAS] = { }; - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_PROJECT)) { + if (!ext4_has_feature_project(sb)) { if (projid != EXT4_DEF_PROJID) return -EOPNOTSUPP; else @@ -772,6 +769,9 @@ resizefs_out: #ifdef CONFIG_EXT4_FS_ENCRYPTION struct fscrypt_policy policy; + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, sizeof(policy))) @@ -842,8 +842,7 @@ resizefs_out: ext4_get_inode_flags(ei); fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE); - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_PROJECT)) { + if (ext4_has_feature_project(inode->i_sb)) { fa.fsx_projid = (__u32)from_kprojid(&init_user_ns, EXT4_I(inode)->i_projid); } diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 3ef1df6..1aba469 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -27,16 +27,15 @@ #ifdef CONFIG_EXT4_DEBUG extern ushort ext4_mballoc_debug; -#define mb_debug(n, fmt, a...) \ - do { \ - if ((n) <= ext4_mballoc_debug) { \ - printk(KERN_DEBUG "(%s, %d): %s: ", \ - __FILE__, __LINE__, __func__); \ - printk(fmt, ## a); \ - } \ - } while (0) +#define mb_debug(n, fmt, ...) \ +do { \ + if ((n) <= ext4_mballoc_debug) { \ + printk(KERN_DEBUG "(%s, %d): %s: " fmt, \ + __FILE__, __LINE__, __func__, ##__VA_ARGS__); \ + } \ +} while (0) #else -#define mb_debug(n, fmt, a...) no_printk(fmt, ## a) +#define mb_debug(n, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif #define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index a920c5d..6fc14de 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -598,6 +598,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, return -EOPNOTSUPP; } + if (ext4_encrypted_inode(orig_inode) || + ext4_encrypted_inode(donor_inode)) { + ext4_msg(orig_inode->i_sb, KERN_ERR, + "Online defrag not supported for encrypted files"); + return -EOPNOTSUPP; + } + /* Protect orig and donor inodes against a truncate */ lock_two_nondirectories(orig_inode, donor_inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 34c0142..104f8bf 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -577,12 +577,13 @@ static inline unsigned dx_node_limit(struct inode *dir) static void dx_show_index(char * label, struct dx_entry *entries) { int i, n = dx_get_count (entries); - printk(KERN_DEBUG "%s index ", label); + printk(KERN_DEBUG "%s index", label); for (i = 0; i < n; i++) { - printk("%x->%lu ", i ? dx_get_hash(entries + i) : - 0, (unsigned long)dx_get_block(entries + i)); + printk(KERN_CONT " %x->%lu", + i ? dx_get_hash(entries + i) : 0, + (unsigned long)dx_get_block(entries + i)); } - printk("\n"); + printk(KERN_CONT "\n"); } struct stats @@ -639,7 +640,7 @@ static struct stats dx_show_leaf(struct inode *dir, res = fscrypt_fname_alloc_buffer( dir, len, &fname_crypto_str); - if (res < 0) + if (res) printk(KERN_WARNING "Error " "allocating crypto " "buffer--skipping " @@ -647,7 +648,7 @@ static struct stats dx_show_leaf(struct inode *dir, res = fscrypt_fname_disk_to_usr(dir, 0, 0, &de_name, &fname_crypto_str); - if (res < 0) { + if (res) { printk(KERN_WARNING "Error " "converting filename " "from disk to usr" @@ -679,7 +680,7 @@ static struct stats dx_show_leaf(struct inode *dir, } de = ext4_next_entry(de, size); } - printk("(%i)\n", names); + printk(KERN_CONT "(%i)\n", names); return (struct stats) { names, space, 1 }; } @@ -798,7 +799,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, q = entries + count - 1; while (p <= q) { m = p + (q - p) / 2; - dxtrace(printk(".")); + dxtrace(printk(KERN_CONT ".")); if (dx_get_hash(m) > hash) q = m - 1; else @@ -810,7 +811,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, at = entries; while (n--) { - dxtrace(printk(",")); + dxtrace(printk(KERN_CONT ",")); if (dx_get_hash(++at) > hash) { at--; @@ -821,7 +822,8 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, } at = p - 1; - dxtrace(printk(" %x->%u\n", at == entries ? 0 : dx_get_hash(at), + dxtrace(printk(KERN_CONT " %x->%u\n", + at == entries ? 0 : dx_get_hash(at), dx_get_block(at))); frame->entries = entries; frame->at = at; @@ -1011,7 +1013,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, err = fscrypt_fname_disk_to_usr(dir, hinfo->hash, hinfo->minor_hash, &de_name, &fname_crypto_str); - if (err < 0) { + if (err) { count = err; goto errout; } @@ -2044,33 +2046,31 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, frame->entries = entries; frame->at = entries; frame->bh = bh; - bh = bh2; retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (retval) goto out_frames; - retval = ext4_handle_dirty_dirent_node(handle, dir, bh); + retval = ext4_handle_dirty_dirent_node(handle, dir, bh2); if (retval) goto out_frames; - de = do_split(handle,dir, &bh, frame, &fname->hinfo); + de = do_split(handle,dir, &bh2, frame, &fname->hinfo); if (IS_ERR(de)) { retval = PTR_ERR(de); goto out_frames; } - dx_release(frames); - retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh); - brelse(bh); - return retval; + retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh2); out_frames: /* * Even if the block split failed, we have to properly write * out all the changes we did so far. Otherwise we can end up * with corrupted filesystem. */ - ext4_mark_inode_dirty(handle, dir); + if (retval) + ext4_mark_inode_dirty(handle, dir); dx_release(frames); + brelse(bh2); return retval; } @@ -3144,7 +3144,7 @@ static int ext4_symlink(struct inode *dir, istr.name = (const unsigned char *) symname; istr.len = len; err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); - if (err < 0) + if (err) goto err_drop_inode; sd->len = cpu_to_le16(ostr.len); disk_link.name = (char *) sd; @@ -3880,12 +3880,9 @@ const struct inode_operations ext4_dir_inode_operations = { .rmdir = ext4_rmdir, .mknod = ext4_mknod, .tmpfile = ext4_tmpfile, - .rename2 = ext4_rename2, + .rename = ext4_rename2, .setattr = ext4_setattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = ext4_listxattr, - .removexattr = generic_removexattr, .get_acl = ext4_get_acl, .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, @@ -3893,10 +3890,7 @@ const struct inode_operations ext4_dir_inode_operations = { const struct inode_operations ext4_special_inode_operations = { .setattr = ext4_setattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = ext4_listxattr, - .removexattr = generic_removexattr, .get_acl = ext4_get_acl, .set_acl = ext4_set_acl, }; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index a6132a7..0094923 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -88,7 +88,7 @@ static void ext4_finish_bio(struct bio *bio) if (bio->bi_error) { SetPageError(page); - set_bit(AS_EIO, &page->mapping->flags); + mapping_set_error(page->mapping, -EIO); } bh = head = page_buffers(page); /* @@ -405,14 +405,12 @@ int ext4_bio_write_page(struct ext4_io_submit *io, { struct page *data_page = NULL; struct inode *inode = page->mapping->host; - unsigned block_start, blocksize; + unsigned block_start; struct buffer_head *bh, *head; int ret = 0; int nr_submitted = 0; int nr_to_submit = 0; - blocksize = 1 << inode->i_blkbits; - BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3ec8708..20da99d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -78,6 +78,8 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly); static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); +static struct inode *ext4_get_journal_inode(struct super_block *sb, + unsigned int journal_inum); /* * Lock ordering @@ -595,14 +597,15 @@ void __ext4_std_error(struct super_block *sb, const char *function, void __ext4_abort(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) { + struct va_format vaf; va_list args; save_error_info(sb, function, line); va_start(args, fmt); - printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: ", sb->s_id, - function, line); - vprintk(fmt, args); - printk("\n"); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: %pV\n", + sb->s_id, function, line, &vaf); va_end(args); if ((sb->s_flags & MS_RDONLY) == 0) { @@ -1267,7 +1270,7 @@ enum { Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, - Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax, + Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, Opt_lazytime, Opt_nolazytime, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, @@ -1327,6 +1330,7 @@ static const match_table_t tokens = { {Opt_noquota, "noquota"}, {Opt_quota, "quota"}, {Opt_usrquota, "usrquota"}, + {Opt_prjquota, "prjquota"}, {Opt_barrier, "barrier=%u"}, {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, @@ -1546,8 +1550,11 @@ static const struct mount_opts { MOPT_SET | MOPT_Q}, {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA, MOPT_SET | MOPT_Q}, + {Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA, + MOPT_SET | MOPT_Q}, {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | - EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q}, + EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA), + MOPT_CLEAR | MOPT_Q}, {Opt_usrjquota, 0, MOPT_Q}, {Opt_grpjquota, 0, MOPT_Q}, {Opt_offusrjquota, 0, MOPT_Q}, @@ -1836,13 +1843,17 @@ static int parse_options(char *options, struct super_block *sb, return 0; } #ifdef CONFIG_QUOTA - if (ext4_has_feature_quota(sb) && - (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) { - ext4_msg(sb, KERN_INFO, "Quota feature enabled, usrquota and grpquota " - "mount options ignored."); - clear_opt(sb, USRQUOTA); - clear_opt(sb, GRPQUOTA); - } else if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { + /* + * We do the test below only for project quotas. 'usrquota' and + * 'grpquota' mount options are allowed even without quota feature + * to support legacy quotas in quota files. + */ + if (test_opt(sb, PRJQUOTA) && !ext4_has_feature_project(sb)) { + ext4_msg(sb, KERN_ERR, "Project quota feature not enabled. " + "Cannot enable project quota enforcement."); + return 0; + } + if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) clear_opt(sb, USRQUOTA); @@ -2705,12 +2716,12 @@ static void print_daily_error_info(unsigned long arg) es->s_first_error_func, le32_to_cpu(es->s_first_error_line)); if (es->s_first_error_ino) - printk(": inode %u", + printk(KERN_CONT ": inode %u", le32_to_cpu(es->s_first_error_ino)); if (es->s_first_error_block) - printk(": block %llu", (unsigned long long) + printk(KERN_CONT ": block %llu", (unsigned long long) le64_to_cpu(es->s_first_error_block)); - printk("\n"); + printk(KERN_CONT "\n"); } if (es->s_last_error_time) { printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d", @@ -2719,12 +2730,12 @@ static void print_daily_error_info(unsigned long arg) es->s_last_error_func, le32_to_cpu(es->s_last_error_line)); if (es->s_last_error_ino) - printk(": inode %u", + printk(KERN_CONT ": inode %u", le32_to_cpu(es->s_last_error_ino)); if (es->s_last_error_block) - printk(": block %llu", (unsigned long long) + printk(KERN_CONT ": block %llu", (unsigned long long) le64_to_cpu(es->s_last_error_block)); - printk("\n"); + printk(KERN_CONT "\n"); } mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ } @@ -2741,7 +2752,6 @@ static int ext4_run_li_request(struct ext4_li_request *elr) sb = elr->lr_super; ngroups = EXT4_SB(sb)->s_groups_count; - sb_start_write(sb); for (group = elr->lr_next_group; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) { @@ -2768,8 +2778,6 @@ static int ext4_run_li_request(struct ext4_li_request *elr) elr->lr_next_sched = jiffies + elr->lr_timeout; elr->lr_next_group = group + 1; } - sb_end_write(sb); - return ret; } @@ -2834,19 +2842,43 @@ cont_thread: mutex_unlock(&eli->li_list_mtx); goto exit_thread; } - list_for_each_safe(pos, n, &eli->li_request_list) { + int err = 0; + int progress = 0; elr = list_entry(pos, struct ext4_li_request, lr_request); - if (time_after_eq(jiffies, elr->lr_next_sched)) { - if (ext4_run_li_request(elr) != 0) { - /* error, remove the lazy_init job */ - ext4_remove_li_request(elr); - continue; + if (time_before(jiffies, elr->lr_next_sched)) { + if (time_before(elr->lr_next_sched, next_wakeup)) + next_wakeup = elr->lr_next_sched; + continue; + } + if (down_read_trylock(&elr->lr_super->s_umount)) { + if (sb_start_write_trylock(elr->lr_super)) { + progress = 1; + /* + * We hold sb->s_umount, sb can not + * be removed from the list, it is + * now safe to drop li_list_mtx + */ + mutex_unlock(&eli->li_list_mtx); + err = ext4_run_li_request(elr); + sb_end_write(elr->lr_super); + mutex_lock(&eli->li_list_mtx); + n = pos->next; } + up_read((&elr->lr_super->s_umount)); + } + /* error, remove the lazy_init job */ + if (err) { + ext4_remove_li_request(elr); + continue; + } + if (!progress) { + elr->lr_next_sched = jiffies + + (prandom_u32() + % (EXT4_DEF_LI_MAX_START_DELAY * HZ)); } - if (time_before(elr->lr_next_sched, next_wakeup)) next_wakeup = elr->lr_next_sched; } @@ -3179,6 +3211,8 @@ int ext4_calculate_overhead(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; + struct inode *j_inode; + unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum); ext4_group_t i, ngroups = ext4_get_groups_count(sb); ext4_fsblk_t overhead = 0; char *buf = (char *) get_zeroed_page(GFP_NOFS); @@ -3209,10 +3243,23 @@ int ext4_calculate_overhead(struct super_block *sb) memset(buf, 0, PAGE_SIZE); cond_resched(); } - /* Add the internal journal blocks as well */ + + /* + * Add the internal journal blocks whether the journal has been + * loaded or not + */ if (sbi->s_journal && !sbi->journal_bdev) overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); - + else if (ext4_has_feature_journal(sb) && !sbi->s_journal) { + j_inode = ext4_get_journal_inode(sb, j_inum); + if (j_inode) { + j_blocks = j_inode->i_size >> sb->s_blocksize_bits; + overhead += EXT4_NUM_B2C(sbi, j_blocks); + iput(j_inode); + } else { + ext4_msg(sb, KERN_ERR, "can't get journal size"); + } + } sbi->s_overhead = overhead; smp_wmb(); free_page((unsigned long) buf); @@ -4208,18 +4255,16 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) write_unlock(&journal->j_state_lock); } -static journal_t *ext4_get_journal(struct super_block *sb, - unsigned int journal_inum) +static struct inode *ext4_get_journal_inode(struct super_block *sb, + unsigned int journal_inum) { struct inode *journal_inode; - journal_t *journal; - - BUG_ON(!ext4_has_feature_journal(sb)); - - /* First, test for the existence of a valid inode on disk. Bad - * things happen if we iget() an unused inode, as the subsequent - * iput() will try to delete it. */ + /* + * Test for the existence of a valid inode on disk. Bad things + * happen if we iget() an unused inode, as the subsequent iput() + * will try to delete it. + */ journal_inode = ext4_iget(sb, journal_inum); if (IS_ERR(journal_inode)) { ext4_msg(sb, KERN_ERR, "no journal found"); @@ -4239,6 +4284,20 @@ static journal_t *ext4_get_journal(struct super_block *sb, iput(journal_inode); return NULL; } + return journal_inode; +} + +static journal_t *ext4_get_journal(struct super_block *sb, + unsigned int journal_inum) +{ + struct inode *journal_inode; + journal_t *journal; + + BUG_ON(!ext4_has_feature_journal(sb)); + + journal_inode = ext4_get_journal_inode(sb, journal_inum); + if (!journal_inode) + return NULL; journal = jbd2_journal_init_inode(journal_inode); if (!journal) { @@ -5250,12 +5309,18 @@ static int ext4_enable_quotas(struct super_block *sb) le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) }; + bool quota_mopt[EXT4_MAXQUOTAS] = { + test_opt(sb, USRQUOTA), + test_opt(sb, GRPQUOTA), + test_opt(sb, PRJQUOTA), + }; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; for (type = 0; type < EXT4_MAXQUOTAS; type++) { if (qf_inums[type]) { err = ext4_quota_enable(sb, type, QFMT_VFS_V1, - DQUOT_USAGE_ENABLED); + DQUOT_USAGE_ENABLED | + (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); if (err) { ext4_warning(sb, "Failed to enable quota tracking " diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 4d83d9e..557b3b0 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -30,7 +30,6 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, char *caddr, *paddr = NULL; struct fscrypt_str cstr, pstr; struct fscrypt_symlink_data *sd; - loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); int res; u32 max_size = inode->i_sb->s_blocksize; @@ -49,7 +48,6 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, if (IS_ERR(cpage)) return ERR_CAST(cpage); caddr = page_address(cpage); - caddr[size] = 0; } /* Symlink is encrypted */ @@ -65,16 +63,14 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, res = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr); if (res) goto errout; + paddr = pstr.name; res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); - if (res < 0) + if (res) goto errout; - paddr = pstr.name; - /* Null-terminate the name */ - if (res <= pstr.len) - paddr[res] = '\0'; + paddr[pstr.len] = '\0'; if (cpage) put_page(cpage); set_delayed_call(done, kfree_link, paddr); @@ -90,28 +86,19 @@ const struct inode_operations ext4_encrypted_symlink_inode_operations = { .readlink = generic_readlink, .get_link = ext4_encrypted_get_link, .setattr = ext4_setattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = ext4_listxattr, - .removexattr = generic_removexattr, }; const struct inode_operations ext4_symlink_inode_operations = { .readlink = generic_readlink, .get_link = page_get_link, .setattr = ext4_setattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = ext4_listxattr, - .removexattr = generic_removexattr, }; const struct inode_operations ext4_fast_symlink_inode_operations = { .readlink = generic_readlink, .get_link = simple_get_link, .setattr = ext4_setattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = ext4_listxattr, - .removexattr = generic_removexattr, }; diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 73bcfd4..42145be 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -223,14 +223,18 @@ static struct attribute *ext4_attrs[] = { EXT4_ATTR_FEATURE(lazy_itable_init); EXT4_ATTR_FEATURE(batched_discard); EXT4_ATTR_FEATURE(meta_bg_resize); +#ifdef CONFIG_EXT4_FS_ENCRYPTION EXT4_ATTR_FEATURE(encryption); +#endif EXT4_ATTR_FEATURE(metadata_csum_seed); static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(lazy_itable_init), ATTR_LIST(batched_discard), ATTR_LIST(meta_bg_resize), +#ifdef CONFIG_EXT4_FS_ENCRYPTION ATTR_LIST(encryption), +#endif ATTR_LIST(metadata_csum_seed), NULL, }; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 2eb935c..d77be9e 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -61,18 +61,12 @@ #include "acl.h" #ifdef EXT4_XATTR_DEBUG -# define ea_idebug(inode, f...) do { \ - printk(KERN_DEBUG "inode %s:%lu: ", \ - inode->i_sb->s_id, inode->i_ino); \ - printk(f); \ - printk("\n"); \ - } while (0) -# define ea_bdebug(bh, f...) do { \ - printk(KERN_DEBUG "block %pg:%lu: ", \ - bh->b_bdev, (unsigned long) bh->b_blocknr); \ - printk(f); \ - printk("\n"); \ - } while (0) +# define ea_idebug(inode, fmt, ...) \ + printk(KERN_DEBUG "inode %s:%lu: " fmt "\n", \ + inode->i_sb->s_id, inode->i_ino, ##__VA_ARGS__) +# define ea_bdebug(bh, fmt, ...) \ + printk(KERN_DEBUG "block %pg:%lu: " fmt "\n", \ + bh->b_bdev, (unsigned long)bh->b_blocknr, ##__VA_ARGS__) #else # define ea_idebug(inode, fmt, ...) no_printk(fmt, ##__VA_ARGS__) # define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__) @@ -199,6 +193,8 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, } while (!IS_LAST_ENTRY(entry)) { + if (entry->e_value_block != 0) + return -EFSCORRUPTED; if (entry->e_value_size != 0 && (value_start + le16_to_cpu(entry->e_value_offs) < (void *)e + sizeof(__u32) || @@ -239,7 +235,7 @@ __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, int error = -EFSCORRUPTED; if (((void *) header >= end) || - (header->h_magic != le32_to_cpu(EXT4_XATTR_MAGIC))) + (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC))) goto errout; error = ext4_xattr_check_names(entry, end, entry); errout: @@ -641,7 +637,7 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, size_t *min_offs, void *base, int *total) { for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (!last->e_value_block && last->e_value_size) { + if (last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < *min_offs) *min_offs = offs; @@ -661,7 +657,7 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) /* Compute min_offs and last. */ last = s->first; for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (!last->e_value_block && last->e_value_size) { + if (last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < min_offs) min_offs = offs; @@ -669,7 +665,7 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) } free = min_offs - ((void *)last - s->base) - sizeof(__u32); if (!s->not_found) { - if (!s->here->e_value_block && s->here->e_value_size) { + if (s->here->e_value_size) { size_t size = le32_to_cpu(s->here->e_value_size); free += EXT4_XATTR_SIZE(size); } @@ -691,7 +687,7 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) s->here->e_name_len = name_len; memcpy(s->here->e_name, i->name, name_len); } else { - if (!s->here->e_value_block && s->here->e_value_size) { + if (s->here->e_value_size) { void *first_val = s->base + min_offs; size_t offs = le16_to_cpu(s->here->e_value_offs); void *val = s->base + offs; @@ -725,8 +721,7 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) last = s->first; while (!IS_LAST_ENTRY(last)) { size_t o = le16_to_cpu(last->e_value_offs); - if (!last->e_value_block && - last->e_value_size && o < offs) + if (last->e_value_size && o < offs) last->e_value_offs = cpu_to_le16(o + size); last = EXT4_XATTR_NEXT(last); @@ -1318,18 +1313,19 @@ retry: */ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, int value_offs_shift, void *to, - void *from, size_t n, int blocksize) + void *from, size_t n) { struct ext4_xattr_entry *last = entry; int new_offs; + /* We always shift xattr headers further thus offsets get lower */ + BUG_ON(value_offs_shift > 0); + /* Adjust the value offsets of the entries */ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (!last->e_value_block && last->e_value_size) { + if (last->e_value_size) { new_offs = le16_to_cpu(last->e_value_offs) + value_offs_shift; - BUG_ON(new_offs + le32_to_cpu(last->e_value_size) - > blocksize); last->e_value_offs = cpu_to_le16(new_offs); } } @@ -1338,6 +1334,141 @@ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, } /* + * Move xattr pointed to by 'entry' from inode into external xattr block + */ +static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, + struct ext4_inode *raw_inode, + struct ext4_xattr_entry *entry) +{ + struct ext4_xattr_ibody_find *is = NULL; + struct ext4_xattr_block_find *bs = NULL; + char *buffer = NULL, *b_entry_name = NULL; + size_t value_offs, value_size; + struct ext4_xattr_info i = { + .value = NULL, + .value_len = 0, + .name_index = entry->e_name_index, + }; + struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode); + int error; + + value_offs = le16_to_cpu(entry->e_value_offs); + value_size = le32_to_cpu(entry->e_value_size); + + is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); + bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); + buffer = kmalloc(value_size, GFP_NOFS); + b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); + if (!is || !bs || !buffer || !b_entry_name) { + error = -ENOMEM; + goto out; + } + + is->s.not_found = -ENODATA; + bs->s.not_found = -ENODATA; + is->iloc.bh = NULL; + bs->bh = NULL; + + /* Save the entry name and the entry value */ + memcpy(buffer, (void *)IFIRST(header) + value_offs, value_size); + memcpy(b_entry_name, entry->e_name, entry->e_name_len); + b_entry_name[entry->e_name_len] = '\0'; + i.name = b_entry_name; + + error = ext4_get_inode_loc(inode, &is->iloc); + if (error) + goto out; + + error = ext4_xattr_ibody_find(inode, &i, is); + if (error) + goto out; + + /* Remove the chosen entry from the inode */ + error = ext4_xattr_ibody_set(handle, inode, &i, is); + if (error) + goto out; + + i.name = b_entry_name; + i.value = buffer; + i.value_len = value_size; + error = ext4_xattr_block_find(inode, &i, bs); + if (error) + goto out; + + /* Add entry which was removed from the inode into the block */ + error = ext4_xattr_block_set(handle, inode, &i, bs); + if (error) + goto out; + error = 0; +out: + kfree(b_entry_name); + kfree(buffer); + if (is) + brelse(is->iloc.bh); + kfree(is); + kfree(bs); + + return error; +} + +static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode, + struct ext4_inode *raw_inode, + int isize_diff, size_t ifree, + size_t bfree, int *total_ino) +{ + struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode); + struct ext4_xattr_entry *small_entry; + struct ext4_xattr_entry *entry; + struct ext4_xattr_entry *last; + unsigned int entry_size; /* EA entry size */ + unsigned int total_size; /* EA entry size + value size */ + unsigned int min_total_size; + int error; + + while (isize_diff > ifree) { + entry = NULL; + small_entry = NULL; + min_total_size = ~0U; + last = IFIRST(header); + /* Find the entry best suited to be pushed into EA block */ + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + total_size = + EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + + EXT4_XATTR_LEN(last->e_name_len); + if (total_size <= bfree && + total_size < min_total_size) { + if (total_size + ifree < isize_diff) { + small_entry = last; + } else { + entry = last; + min_total_size = total_size; + } + } + } + + if (entry == NULL) { + if (small_entry == NULL) + return -ENOSPC; + entry = small_entry; + } + + entry_size = EXT4_XATTR_LEN(entry->e_name_len); + total_size = entry_size + + EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)); + error = ext4_xattr_move_to_block(handle, inode, raw_inode, + entry); + if (error) + return error; + + *total_ino -= entry_size; + ifree += total_size; + bfree -= total_size; + } + + return 0; +} + +/* * Expand an inode by new_extra_isize bytes when EAs are present. * Returns 0 on success or negative error number on failure. */ @@ -1345,14 +1476,11 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle) { struct ext4_xattr_ibody_header *header; - struct ext4_xattr_entry *entry, *last, *first; struct buffer_head *bh = NULL; - struct ext4_xattr_ibody_find *is = NULL; - struct ext4_xattr_block_find *bs = NULL; - char *buffer = NULL, *b_entry_name = NULL; - size_t min_offs, free; + size_t min_offs; + size_t ifree, bfree; int total_ino; - void *base, *start, *end; + void *base, *end; int error = 0, tried_min_extra_isize = 0; int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); int isize_diff; /* How much do we need to grow i_extra_isize */ @@ -1368,34 +1496,24 @@ retry: goto out; header = IHDR(inode, raw_inode); - entry = IFIRST(header); /* * Check if enough free space is available in the inode to shift the * entries ahead by new_extra_isize. */ - base = start = entry; + base = IFIRST(header); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; min_offs = end - base; - last = entry; total_ino = sizeof(struct ext4_xattr_ibody_header); error = xattr_check_inode(inode, header, end); if (error) goto cleanup; - free = ext4_xattr_free_space(last, &min_offs, base, &total_ino); - if (free >= isize_diff) { - entry = IFIRST(header); - ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize - - new_extra_isize, (void *)raw_inode + - EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, - (void *)header, total_ino, - inode->i_sb->s_blocksize); - EXT4_I(inode)->i_extra_isize = new_extra_isize; - goto out; - } + ifree = ext4_xattr_free_space(base, &min_offs, base, &total_ino); + if (ifree >= isize_diff) + goto shift; /* * Enough free space isn't available in the inode, check if @@ -1413,146 +1531,44 @@ retry: goto cleanup; } base = BHDR(bh); - first = BFIRST(bh); end = bh->b_data + bh->b_size; min_offs = end - base; - free = ext4_xattr_free_space(first, &min_offs, base, NULL); - if (free < isize_diff) { + bfree = ext4_xattr_free_space(BFIRST(bh), &min_offs, base, + NULL); + if (bfree + ifree < isize_diff) { if (!tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; brelse(bh); goto retry; } - error = -1; + error = -ENOSPC; goto cleanup; } } else { - free = inode->i_sb->s_blocksize; + bfree = inode->i_sb->s_blocksize; } - while (isize_diff > 0) { - size_t offs, size, entry_size; - struct ext4_xattr_entry *small_entry = NULL; - struct ext4_xattr_info i = { - .value = NULL, - .value_len = 0, - }; - unsigned int total_size; /* EA entry size + value size */ - unsigned int shift_bytes; /* No. of bytes to shift EAs by? */ - unsigned int min_total_size = ~0U; - - is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); - bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); - if (!is || !bs) { - error = -ENOMEM; - goto cleanup; - } - - is->s.not_found = -ENODATA; - bs->s.not_found = -ENODATA; - is->iloc.bh = NULL; - bs->bh = NULL; - - last = IFIRST(header); - /* Find the entry best suited to be pushed into EA block */ - entry = NULL; - for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - total_size = - EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + - EXT4_XATTR_LEN(last->e_name_len); - if (total_size <= free && total_size < min_total_size) { - if (total_size < isize_diff) { - small_entry = last; - } else { - entry = last; - min_total_size = total_size; - } - } - } - - if (entry == NULL) { - if (small_entry) { - entry = small_entry; - } else { - if (!tried_min_extra_isize && - s_min_extra_isize) { - tried_min_extra_isize++; - new_extra_isize = s_min_extra_isize; - kfree(is); is = NULL; - kfree(bs); bs = NULL; - brelse(bh); - goto retry; - } - error = -1; - goto cleanup; - } - } - offs = le16_to_cpu(entry->e_value_offs); - size = le32_to_cpu(entry->e_value_size); - entry_size = EXT4_XATTR_LEN(entry->e_name_len); - i.name_index = entry->e_name_index, - buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_NOFS); - b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); - if (!buffer || !b_entry_name) { - error = -ENOMEM; - goto cleanup; + error = ext4_xattr_make_inode_space(handle, inode, raw_inode, + isize_diff, ifree, bfree, + &total_ino); + if (error) { + if (error == -ENOSPC && !tried_min_extra_isize && + s_min_extra_isize) { + tried_min_extra_isize++; + new_extra_isize = s_min_extra_isize; + brelse(bh); + goto retry; } - /* Save the entry name and the entry value */ - memcpy(buffer, (void *)IFIRST(header) + offs, - EXT4_XATTR_SIZE(size)); - memcpy(b_entry_name, entry->e_name, entry->e_name_len); - b_entry_name[entry->e_name_len] = '\0'; - i.name = b_entry_name; - - error = ext4_get_inode_loc(inode, &is->iloc); - if (error) - goto cleanup; - - error = ext4_xattr_ibody_find(inode, &i, is); - if (error) - goto cleanup; - - /* Remove the chosen entry from the inode */ - error = ext4_xattr_ibody_set(handle, inode, &i, is); - if (error) - goto cleanup; - total_ino -= entry_size; - - entry = IFIRST(header); - if (entry_size + EXT4_XATTR_SIZE(size) >= isize_diff) - shift_bytes = isize_diff; - else - shift_bytes = entry_size + EXT4_XATTR_SIZE(size); - /* Adjust the offsets and shift the remaining entries ahead */ - ext4_xattr_shift_entries(entry, -shift_bytes, - (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + - EXT4_I(inode)->i_extra_isize + shift_bytes, - (void *)header, total_ino, inode->i_sb->s_blocksize); - - isize_diff -= shift_bytes; - EXT4_I(inode)->i_extra_isize += shift_bytes; - header = IHDR(inode, raw_inode); - - i.name = b_entry_name; - i.value = buffer; - i.value_len = size; - error = ext4_xattr_block_find(inode, &i, bs); - if (error) - goto cleanup; - - /* Add entry which was removed from the inode into the block */ - error = ext4_xattr_block_set(handle, inode, &i, bs); - if (error) - goto cleanup; - kfree(b_entry_name); - kfree(buffer); - b_entry_name = NULL; - buffer = NULL; - brelse(is->iloc.bh); - kfree(is); - kfree(bs); + goto cleanup; } +shift: + /* Adjust the offsets and shift the remaining entries ahead */ + ext4_xattr_shift_entries(IFIRST(header), EXT4_I(inode)->i_extra_isize + - new_extra_isize, (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, + (void *)header, total_ino); + EXT4_I(inode)->i_extra_isize = new_extra_isize; brelse(bh); out: ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); @@ -1560,12 +1576,6 @@ out: return 0; cleanup: - kfree(b_entry_name); - kfree(buffer); - if (is) - brelse(is->iloc.bh); - kfree(is); - kfree(bs); brelse(bh); /* * We deliberately leave EXT4_STATE_NO_EXPAND set here since inode @@ -1734,7 +1744,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header, *name++; } - if (entry->e_value_block == 0 && entry->e_value_size != 0) { + if (entry->e_value_size != 0) { __le32 *value = (__le32 *)((char *)header + le16_to_cpu(entry->e_value_offs)); for (n = (le32_to_cpu(entry->e_value_size) + diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 4dcc9e2..6fe23af 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -109,14 +109,16 @@ fail: return ERR_PTR(-EINVAL); } -static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size) +static void *f2fs_acl_to_disk(struct f2fs_sb_info *sbi, + const struct posix_acl *acl, size_t *size) { struct f2fs_acl_header *f2fs_acl; struct f2fs_acl_entry *entry; int i; - f2fs_acl = f2fs_kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count * - sizeof(struct f2fs_acl_entry), GFP_NOFS); + f2fs_acl = f2fs_kmalloc(sbi, sizeof(struct f2fs_acl_header) + + acl->a_count * sizeof(struct f2fs_acl_entry), + GFP_NOFS); if (!f2fs_acl) return ERR_PTR(-ENOMEM); @@ -175,7 +177,7 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type, retval = f2fs_getxattr(inode, name_index, "", NULL, 0, dpage); if (retval > 0) { - value = f2fs_kmalloc(retval, GFP_F2FS_ZERO); + value = f2fs_kmalloc(F2FS_I_SB(inode), retval, GFP_F2FS_ZERO); if (!value) return ERR_PTR(-ENOMEM); retval = f2fs_getxattr(inode, name_index, "", value, @@ -210,12 +212,10 @@ static int __f2fs_set_acl(struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; set_acl_inode(inode, inode->i_mode); - if (error == 0) - acl = NULL; } break; @@ -230,7 +230,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, } if (acl) { - value = f2fs_acl_to_disk(acl, &size); + value = f2fs_acl_to_disk(F2FS_I_SB(inode), acl, &size); if (IS_ERR(value)) { clear_inode_flag(inode, FI_ACL_MODE); return (int)PTR_ERR(value); diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index b2334d1..2c68518 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -41,7 +41,6 @@ extern int f2fs_set_acl(struct inode *, struct posix_acl *, int); extern int f2fs_init_acl(struct inode *, struct inode *, struct page *, struct page *); #else -#define f2fs_check_acl NULL #define f2fs_get_acl NULL #define f2fs_set_acl NULL diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f94d01e..7e9b504 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -28,7 +28,7 @@ struct kmem_cache *inode_entry_slab; void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) { - set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); + set_ckpt_flags(sbi, CP_ERROR_FLAG); sbi->sb->s_flags |= MS_RDONLY; if (!end_io) f2fs_flush_merged_bios(sbi); @@ -267,7 +267,6 @@ static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); - struct blk_plug plug; long diff, written; /* collect a number of dirty meta pages and write together */ @@ -280,9 +279,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, /* if mounting is failed, skip writing node pages */ mutex_lock(&sbi->cp_mutex); diff = nr_pages_to_write(sbi, META, wbc); - blk_start_plug(&plug); written = sync_meta_pages(sbi, META, wbc->nr_to_write); - blk_finish_plug(&plug); mutex_unlock(&sbi->cp_mutex); wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; @@ -388,6 +385,9 @@ const struct address_space_operations f2fs_meta_aops = { .set_page_dirty = f2fs_set_meta_page_dirty, .invalidatepage = f2fs_invalidate_page, .releasepage = f2fs_release_page, +#ifdef CONFIG_MIGRATION + .migratepage = f2fs_migrate_page, +#endif }; static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) @@ -491,7 +491,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi) spin_lock(&im->ino_lock); #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(FAULT_ORPHAN)) { + if (time_to_inject(sbi, FAULT_ORPHAN)) { spin_unlock(&im->ino_lock); return -ENOSPC; } @@ -531,8 +531,20 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { struct inode *inode; + struct node_info ni; + int err = acquire_orphan_inode(sbi); + + if (err) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: orphan failed (ino=%x), run fsck to fix.", + __func__, ino); + return err; + } - inode = f2fs_iget(sbi->sb, ino); + __add_ino_entry(sbi, ino, ORPHAN_INO); + + inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) { /* * there should be a bug that we can't find the entry @@ -546,6 +558,18 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) /* truncate all the data during iput */ iput(inode); + + get_node_info(sbi, ino, &ni); + + /* ENOMEM was fully retried in f2fs_evict_inode. */ + if (ni.blk_addr != NULL_ADDR) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: orphan failed (ino=%x), run fsck to fix.", + __func__, ino); + return -EIO; + } + __remove_ino_entry(sbi, ino, ORPHAN_INO); return 0; } @@ -554,7 +578,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) block_t start_blk, orphan_blocks, i, j; int err; - if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG)) + if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); @@ -578,7 +602,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) f2fs_put_page(page, 1); } /* clear Orphan Flag */ - clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG); + clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG); return 0; } @@ -639,45 +663,55 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) } } -static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, - block_t cp_addr, unsigned long long *version) +static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, + struct f2fs_checkpoint **cp_block, struct page **cp_page, + unsigned long long *version) { - struct page *cp_page_1, *cp_page_2 = NULL; unsigned long blk_size = sbi->blocksize; - struct f2fs_checkpoint *cp_block; - unsigned long long cur_version = 0, pre_version = 0; - size_t crc_offset; + size_t crc_offset = 0; __u32 crc = 0; - /* Read the 1st cp block in this CP pack */ - cp_page_1 = get_meta_page(sbi, cp_addr); + *cp_page = get_meta_page(sbi, cp_addr); + *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page); - /* get the version number */ - cp_block = (struct f2fs_checkpoint *)page_address(cp_page_1); - crc_offset = le32_to_cpu(cp_block->checksum_offset); - if (crc_offset >= blk_size) - goto invalid_cp1; + crc_offset = le32_to_cpu((*cp_block)->checksum_offset); + if (crc_offset >= blk_size) { + f2fs_msg(sbi->sb, KERN_WARNING, + "invalid crc_offset: %zu", crc_offset); + return -EINVAL; + } - crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset))); - if (!f2fs_crc_valid(sbi, crc, cp_block, crc_offset)) - goto invalid_cp1; + crc = le32_to_cpu(*((__le32 *)((unsigned char *)*cp_block + + crc_offset))); + if (!f2fs_crc_valid(sbi, crc, *cp_block, crc_offset)) { + f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc value"); + return -EINVAL; + } - pre_version = cur_cp_version(cp_block); + *version = cur_cp_version(*cp_block); + return 0; +} - /* Read the 2nd cp block in this CP pack */ - cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1; - cp_page_2 = get_meta_page(sbi, cp_addr); +static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, + block_t cp_addr, unsigned long long *version) +{ + struct page *cp_page_1 = NULL, *cp_page_2 = NULL; + struct f2fs_checkpoint *cp_block = NULL; + unsigned long long cur_version = 0, pre_version = 0; + int err; - cp_block = (struct f2fs_checkpoint *)page_address(cp_page_2); - crc_offset = le32_to_cpu(cp_block->checksum_offset); - if (crc_offset >= blk_size) - goto invalid_cp2; + err = get_checkpoint_version(sbi, cp_addr, &cp_block, + &cp_page_1, version); + if (err) + goto invalid_cp1; + pre_version = *version; - crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset))); - if (!f2fs_crc_valid(sbi, crc, cp_block, crc_offset)) + cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1; + err = get_checkpoint_version(sbi, cp_addr, &cp_block, + &cp_page_2, version); + if (err) goto invalid_cp2; - - cur_version = cur_cp_version(cp_block); + cur_version = *version; if (cur_version == pre_version) { *version = cur_version; @@ -972,10 +1006,40 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) finish_wait(&sbi->cp_wait, &wait); } +static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + + spin_lock(&sbi->cp_lock); + + if (cpc->reason == CP_UMOUNT) + __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + + if (cpc->reason == CP_FASTBOOT) + __set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); + + if (orphan_num) + __set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); + + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) + __set_ckpt_flags(ckpt, CP_FSCK_FLAG); + + /* set this flag to activate crc|cp_ver for recovery */ + __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG); + + spin_unlock(&sbi->cp_lock); +} + static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; nid_t last_nid = nm_i->next_scan_nid; @@ -984,19 +1048,10 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) __u32 crc32 = 0; int i; int cp_payload_blks = __cp_payload(sbi); - block_t discard_blk = NEXT_FREE_BLKADDR(sbi, curseg); - bool invalidate = false; struct super_block *sb = sbi->sb; struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); u64 kbytes_written; - /* - * This avoids to conduct wrong roll-forward operations and uses - * metapages, so should be called prior to sync_meta_pages below. - */ - if (!test_opt(sbi, LFS) && discard_next_dnode(sbi, discard_blk)) - invalidate = true; - /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { sync_meta_pages(sbi, META, LONG_MAX); @@ -1036,10 +1091,12 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* 2 cp + n data seg summary + orphan inode blocks */ data_sum_blocks = npages_for_summary_flush(sbi, false); + spin_lock(&sbi->cp_lock); if (data_sum_blocks < NR_CURSEG_DATA_TYPE) - set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + __set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); else - clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + __clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + spin_unlock(&sbi->cp_lock); orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num); ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + @@ -1054,23 +1111,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) cp_payload_blks + data_sum_blocks + orphan_blocks); - if (cpc->reason == CP_UMOUNT) - set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - else - clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - - if (cpc->reason == CP_FASTBOOT) - set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); - else - clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); - - if (orphan_num) - set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); - else - clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); - - if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) - set_ckpt_flags(ckpt, CP_FSCK_FLAG); + /* update ckpt flag for checkpoint */ + update_ckpt_flags(sbi, cpc); /* update SIT/NAT bitmap */ get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP)); @@ -1137,14 +1179,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* wait for previous submitted meta pages writeback */ wait_on_all_pages_writeback(sbi); - /* - * invalidate meta page which is used temporarily for zeroing out - * block at the end of warm node chain. - */ - if (invalidate) - invalidate_mapping_pages(META_MAPPING(sbi), discard_blk, - discard_blk); - release_ino_entry(sbi, false); if (unlikely(f2fs_cp_error(sbi))) @@ -1152,6 +1186,17 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); + clear_sbi_flag(sbi, SBI_NEED_CP); + + /* + * redirty superblock if metadata like node page or inode cache is + * updated during writing checkpoint. + */ + if (get_pages(sbi, F2FS_DIRTY_NODES) || + get_pages(sbi, F2FS_DIRTY_IMETA)) + set_sbi_flag(sbi, SBI_IS_DIRTY); + + f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_DENTS)); return 0; } @@ -1190,6 +1235,18 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_flush_merged_bios(sbi); + /* this is the case of multiple fstrims without any changes */ + if (cpc->reason == CP_DISCARD && !is_sbi_flag_set(sbi, SBI_IS_DIRTY)) { + f2fs_bug_on(sbi, NM_I(sbi)->dirty_nat_cnt); + f2fs_bug_on(sbi, SIT_I(sbi)->dirty_sentries); + f2fs_bug_on(sbi, prefree_segments(sbi)); + flush_sit_entries(sbi, cpc); + clear_prefree_segments(sbi, cpc); + f2fs_wait_all_discard_bio(sbi); + unblock_operations(sbi); + goto out; + } + /* * update checkpoint pack index * Increase the version number so that @@ -1205,6 +1262,8 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); + f2fs_wait_all_discard_bio(sbi); + unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ccb401e..9ae194f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -34,6 +34,11 @@ static void f2fs_read_end_io(struct bio *bio) struct bio_vec *bvec; int i; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) + bio->bi_error = -EIO; +#endif + if (f2fs_bio_encrypted(bio)) { if (bio->bi_error) { fscrypt_release_ctx(bio->bi_private); @@ -70,7 +75,7 @@ static void f2fs_write_end_io(struct bio *bio) fscrypt_pullback_bio_page(&page, true); if (unlikely(bio->bi_error)) { - set_bit(AS_EIO, &page->mapping->flags); + mapping_set_error(page->mapping, -EIO); f2fs_stop_checkpoint(sbi, true); } end_page_writeback(page); @@ -626,11 +631,13 @@ ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) ssize_t ret = 0; map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); - map.m_len = F2FS_BYTES_TO_BLK(iov_iter_count(from)); - map.m_next_pgofs = NULL; + map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); + if (map.m_len > map.m_lblk) + map.m_len -= map.m_lblk; + else + map.m_len = 0; - if (f2fs_encrypted_inode(inode)) - return 0; + map.m_next_pgofs = NULL; if (iocb->ki_flags & IOCB_DIRECT) { ret = f2fs_convert_inline_inode(inode); @@ -672,6 +679,9 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, bool allocated = false; block_t blkaddr; + if (!maxblocks) + return 0; + map->m_len = 0; map->m_flags = 0; @@ -783,6 +793,7 @@ skip: err = reserve_new_blocks(&dn, prealloc); if (err) goto sync_out; + allocated = dn.node_changed; map->m_len += dn.ofs_in_node - ofs_in_node; if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { @@ -966,8 +977,8 @@ out: return ret; } -struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, - unsigned nr_pages) +static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, + unsigned nr_pages) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct fscrypt_ctx *ctx = NULL; @@ -1284,7 +1295,7 @@ write: if (!wbc->for_reclaim) need_balance_fs = true; - else if (has_not_enough_free_secs(sbi, 0)) + else if (has_not_enough_free_secs(sbi, 0, 0)) goto redirty_out; err = -EAGAIN; @@ -1344,6 +1355,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int cycled; int range_whole = 0; int tag; + int nwritten = 0; pagevec_init(&pvec, 0); @@ -1418,6 +1430,8 @@ continue_unlock: done_index = page->index + 1; done = 1; break; + } else { + nwritten++; } if (--wbc->nr_to_write <= 0 && @@ -1439,6 +1453,10 @@ continue_unlock: if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; + if (nwritten) + f2fs_submit_merged_bio_cond(F2FS_M_SB(mapping), mapping->host, + NULL, 0, DATA, WRITE); + return ret; } @@ -1480,7 +1498,6 @@ static int f2fs_write_data_pages(struct address_space *mapping, * if some pages were truncated, we cannot guarantee its mapping->host * to detect pending bios. */ - f2fs_submit_merged_bio(sbi, DATA, WRITE); remove_dirty_inode(inode); return ret; @@ -1518,8 +1535,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, * we already allocated all the blocks, so we don't need to get * the block addresses when there is no need to fill the page. */ - if (!f2fs_has_inline_data(inode) && !f2fs_encrypted_inode(inode) && - len == PAGE_SIZE) + if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE) return 0; if (f2fs_has_inline_data(inode) || @@ -1616,7 +1632,7 @@ repeat: if (err) goto fail; - if (need_balance && has_not_enough_free_secs(sbi, 0)) { + if (need_balance && has_not_enough_free_secs(sbi, 0, 0)) { unlock_page(page); f2fs_balance_fs(sbi, true); lock_page(page); @@ -1633,22 +1649,12 @@ repeat: if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); - if (len == PAGE_SIZE) - goto out_update; - if (PageUptodate(page)) - goto out_clear; - - if ((pos & PAGE_MASK) >= i_size_read(inode)) { - unsigned start = pos & (PAGE_SIZE - 1); - unsigned end = start + len; - - /* Reading beyond i_size is simple: memset to zero */ - zero_user_segments(page, 0, start, end, PAGE_SIZE); - goto out_update; - } + if (len == PAGE_SIZE || PageUptodate(page)) + return 0; if (blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); } else { struct bio *bio; @@ -1676,11 +1682,6 @@ repeat: goto fail; } } -out_update: - if (!PageUptodate(page)) - SetPageUptodate(page); -out_clear: - clear_cold_data(page); return 0; fail: @@ -1698,11 +1699,26 @@ static int f2fs_write_end(struct file *file, trace_f2fs_write_end(inode, pos, len, copied); + /* + * This should be come from len == PAGE_SIZE, and we expect copied + * should be PAGE_SIZE. Otherwise, we treat it with zero copied and + * let generic_perform_write() try to copy data again through copied=0. + */ + if (!PageUptodate(page)) { + if (unlikely(copied != PAGE_SIZE)) + copied = 0; + else + SetPageUptodate(page); + } + if (!copied) + goto unlock_out; + set_page_dirty(page); + clear_cold_data(page); if (pos + copied > i_size_read(inode)) f2fs_i_size_write(inode, pos + copied); - +unlock_out: f2fs_put_page(page, 1); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return copied; @@ -1873,6 +1889,58 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping, block, get_data_block_bmap); } +#ifdef CONFIG_MIGRATION +#include <linux/migrate.h> + +int f2fs_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page, enum migrate_mode mode) +{ + int rc, extra_count; + struct f2fs_inode_info *fi = F2FS_I(mapping->host); + bool atomic_written = IS_ATOMIC_WRITTEN_PAGE(page); + + BUG_ON(PageWriteback(page)); + + /* migrating an atomic written page is safe with the inmem_lock hold */ + if (atomic_written && !mutex_trylock(&fi->inmem_lock)) + return -EAGAIN; + + /* + * A reference is expected if PagePrivate set when move mapping, + * however F2FS breaks this for maintaining dirty page counts when + * truncating pages. So here adjusting the 'extra_count' make it work. + */ + extra_count = (atomic_written ? 1 : 0) - page_has_private(page); + rc = migrate_page_move_mapping(mapping, newpage, + page, NULL, mode, extra_count); + if (rc != MIGRATEPAGE_SUCCESS) { + if (atomic_written) + mutex_unlock(&fi->inmem_lock); + return rc; + } + + if (atomic_written) { + struct inmem_pages *cur; + list_for_each_entry(cur, &fi->inmem_pages, list) + if (cur->page == page) { + cur->page = newpage; + break; + } + mutex_unlock(&fi->inmem_lock); + put_page(page); + get_page(newpage); + } + + if (PagePrivate(page)) + SetPagePrivate(newpage); + set_page_private(newpage, page_private(page)); + + migrate_page_copy(newpage, page); + + return MIGRATEPAGE_SUCCESS; +} +#endif + const struct address_space_operations f2fs_dblock_aops = { .readpage = f2fs_read_data_page, .readpages = f2fs_read_data_pages, @@ -1885,4 +1953,7 @@ const struct address_space_operations f2fs_dblock_aops = { .releasepage = f2fs_release_page, .direct_IO = f2fs_direct_IO, .bmap = f2fs_bmap, +#ifdef CONFIG_MIGRATION + .migratepage = f2fs_migrate_page, +#endif }; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index badd407..fb245bd 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -45,6 +45,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA); + si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA); si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; @@ -54,6 +55,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); si->valid_count = valid_user_blocks(sbi); + si->discard_blks = discard_blocks(sbi); si->valid_node_count = valid_node_count(sbi); si->valid_inode_count = valid_inode_count(sbi); si->inline_xattr = atomic_read(&sbi->inline_xattr); @@ -154,7 +156,9 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += sizeof(struct sit_info); si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry); si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi)); - si->base_mem += 3 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + if (f2fs_discard_en(sbi)) + si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); si->base_mem += SIT_VBLOCK_MAP_SIZE; if (sbi->segs_per_sec > 1) si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry); @@ -228,8 +232,13 @@ static int stat_show(struct seq_file *s, void *v) si->ssa_area_segs, si->main_area_segs); seq_printf(s, "(OverProv:%d Resv:%d)]\n\n", si->overp_segs, si->rsvd_segs); - seq_printf(s, "Utilization: %d%% (%d valid blocks)\n", - si->utilization, si->valid_count); + if (test_opt(si->sbi, DISCARD)) + seq_printf(s, "Utilization: %u%% (%u valid blocks, %u discard blocks)\n", + si->utilization, si->valid_count, si->discard_blks); + else + seq_printf(s, "Utilization: %u%% (%u valid blocks)\n", + si->utilization, si->valid_count); + seq_printf(s, " - Node: %u (Inode: %u, ", si->valid_node_count, si->valid_inode_count); seq_printf(s, "Other: %u)\n - Data: %u\n", @@ -311,6 +320,8 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_data, si->ndirty_files); seq_printf(s, " - meta: %4lld in %4d\n", si->ndirty_meta, si->meta_pages); + seq_printf(s, " - imeta: %4lld\n", + si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); seq_printf(s, " - free_nids: %9d\n", diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 9054aea..369f451 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -37,7 +37,7 @@ static unsigned int bucket_blocks(unsigned int level) return 4; } -unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { +static unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { [F2FS_FT_UNKNOWN] = DT_UNKNOWN, [F2FS_FT_REG_FILE] = DT_REG, [F2FS_FT_DIR] = DT_DIR, @@ -172,7 +172,10 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, int max_slots; f2fs_hash_t namehash; - namehash = f2fs_dentry_hash(&name); + if(fname->hash) + namehash = cpu_to_le32(fname->hash); + else + namehash = f2fs_dentry_hash(&name); nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); @@ -212,31 +215,17 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, return de; } -/* - * Find an entry in the specified directory with the wanted name. - * It returns the page where the entry was found (as a parameter - res_page), - * and the entry itself. Page is returned mapped and unlocked. - * Entry is guaranteed to be valid. - */ -struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, - const struct qstr *child, struct page **res_page) +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, + struct fscrypt_name *fname, struct page **res_page) { unsigned long npages = dir_blocks(dir); struct f2fs_dir_entry *de = NULL; unsigned int max_depth; unsigned int level; - struct fscrypt_name fname; - int err; - - err = fscrypt_setup_filename(dir, child, 1, &fname); - if (err) { - *res_page = ERR_PTR(err); - return NULL; - } if (f2fs_has_inline_dentry(dir)) { *res_page = NULL; - de = find_in_inline_dir(dir, &fname, res_page); + de = find_in_inline_dir(dir, fname, res_page); goto out; } @@ -256,11 +245,35 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, for (level = 0; level < max_depth; level++) { *res_page = NULL; - de = find_in_level(dir, level, &fname, res_page); + de = find_in_level(dir, level, fname, res_page); if (de || IS_ERR(*res_page)) break; } out: + return de; +} + +/* + * Find an entry in the specified directory with the wanted name. + * It returns the page where the entry was found (as a parameter - res_page), + * and the entry itself. Page is returned mapped and unlocked. + * Entry is guaranteed to be valid. + */ +struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, + const struct qstr *child, struct page **res_page) +{ + struct f2fs_dir_entry *de = NULL; + struct fscrypt_name fname; + int err; + + err = fscrypt_setup_filename(dir, child, 1, &fname); + if (err) { + *res_page = ERR_PTR(err); + return NULL; + } + + de = __f2fs_find_entry(dir, &fname, res_page); + fscrypt_free_filename(&fname); return de; } @@ -299,7 +312,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, f2fs_dentry_kunmap(dir, page); set_page_dirty(page); - dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->i_mtime = dir->i_ctime = current_time(dir); f2fs_mark_inode_dirty_sync(dir); f2fs_put_page(page, 1); } @@ -375,7 +388,8 @@ static int make_empty_dir(struct inode *inode, } struct page *init_inode_metadata(struct inode *inode, struct inode *dir, - const struct qstr *name, struct page *dpage) + const struct qstr *new_name, const struct qstr *orig_name, + struct page *dpage) { struct page *page; int err; @@ -400,7 +414,7 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, if (err) goto put_error; - err = f2fs_init_security(inode, dir, name, page); + err = f2fs_init_security(inode, dir, orig_name, page); if (err) goto put_error; @@ -417,8 +431,8 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, set_cold_node(inode, page); } - if (name) - init_dent_inode(name, page); + if (new_name) + init_dent_inode(new_name, page); /* * This file should be checkpointed during fsync. @@ -451,7 +465,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode, f2fs_i_links_write(dir, true); clear_inode_flag(inode, FI_NEW_INODE); } - dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->i_mtime = dir->i_ctime = current_time(dir); f2fs_mark_inode_dirty_sync(dir); if (F2FS_I(dir)->i_current_depth != current_depth) @@ -496,7 +510,7 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, de->ino = cpu_to_le32(ino); set_de_type(de, mode); for (i = 0; i < slots; i++) { - test_and_set_bit_le(bit_pos + i, (void *)d->bitmap); + __set_bit_le(bit_pos + i, (void *)d->bitmap); /* avoid wrong garbage data for readdir */ if (i) (de + i)->name_len = 0; @@ -504,6 +518,7 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, } int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, struct inode *inode, nid_t ino, umode_t mode) { unsigned int bit_pos; @@ -530,7 +545,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, start: #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(FAULT_DIR_DEPTH)) + if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) return -ENOSPC; #endif if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) @@ -569,7 +584,8 @@ add_dentry: if (inode) { down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, new_name, NULL); + page = init_inode_metadata(inode, dir, new_name, + orig_name, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; @@ -599,6 +615,26 @@ fail: return err; } +int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname, + struct inode *inode, nid_t ino, umode_t mode) +{ + struct qstr new_name; + int err = -EAGAIN; + + new_name.name = fname_name(fname); + new_name.len = fname_len(fname); + + if (f2fs_has_inline_dentry(dir)) + err = f2fs_add_inline_entry(dir, &new_name, fname->usr_fname, + inode, ino, mode); + if (err == -EAGAIN) + err = f2fs_add_regular_entry(dir, &new_name, fname->usr_fname, + inode, ino, mode); + + f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); + return err; +} + /* * Caller should grab and release a rwsem by calling f2fs_lock_op() and * f2fs_unlock_op(). @@ -607,24 +643,15 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode) { struct fscrypt_name fname; - struct qstr new_name; int err; err = fscrypt_setup_filename(dir, name, 0, &fname); if (err) return err; - new_name.name = fname_name(&fname); - new_name.len = fname_len(&fname); - - err = -EAGAIN; - if (f2fs_has_inline_dentry(dir)) - err = f2fs_add_inline_entry(dir, &new_name, inode, ino, mode); - if (err == -EAGAIN) - err = f2fs_add_regular_entry(dir, &new_name, inode, ino, mode); + err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); fscrypt_free_filename(&fname); - f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); return err; } @@ -634,7 +661,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) int err = 0; down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, NULL, NULL); + page = init_inode_metadata(inode, dir, NULL, NULL, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; @@ -656,7 +683,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode) if (S_ISDIR(inode->i_mode)) f2fs_i_links_write(dir, false); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); f2fs_i_links_write(inode, false); if (S_ISDIR(inode->i_mode)) { @@ -703,7 +730,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, kunmap(page); /* kunmap - pair of f2fs_find_entry */ set_page_dirty(page); - dir->i_ctime = dir->i_mtime = CURRENT_TIME; + dir->i_ctime = dir->i_mtime = current_time(dir); f2fs_mark_inode_dirty_sync(dir); if (inode) @@ -786,19 +813,12 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (f2fs_encrypted_inode(d->inode)) { int save_len = fstr->len; - int ret; + int err; - de_name.name = f2fs_kmalloc(de_name.len, GFP_NOFS); - if (!de_name.name) - return false; - - memcpy(de_name.name, d->filename[bit_pos], de_name.len); - - ret = fscrypt_fname_disk_to_usr(d->inode, + err = fscrypt_fname_disk_to_usr(d->inode, (u32)de->hash_code, 0, &de_name, fstr); - kfree(de_name.name); - if (ret < 0) + if (err) return true; de_name = *fstr; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 14f5fe2..9e8de18 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -46,6 +46,8 @@ enum { FAULT_BLOCK, FAULT_DIR_DEPTH, FAULT_EVICT_INODE, + FAULT_IO, + FAULT_CHECKPOINT, FAULT_MAX, }; @@ -55,40 +57,8 @@ struct f2fs_fault_info { unsigned int inject_type; }; -extern struct f2fs_fault_info f2fs_fault; extern char *fault_name[FAULT_MAX]; -#define IS_FAULT_SET(type) (f2fs_fault.inject_type & (1 << (type))) - -static inline bool time_to_inject(int type) -{ - if (!f2fs_fault.inject_rate) - return false; - if (type == FAULT_KMALLOC && !IS_FAULT_SET(type)) - return false; - else if (type == FAULT_PAGE_ALLOC && !IS_FAULT_SET(type)) - return false; - else if (type == FAULT_ALLOC_NID && !IS_FAULT_SET(type)) - return false; - else if (type == FAULT_ORPHAN && !IS_FAULT_SET(type)) - return false; - else if (type == FAULT_BLOCK && !IS_FAULT_SET(type)) - return false; - else if (type == FAULT_DIR_DEPTH && !IS_FAULT_SET(type)) - return false; - else if (type == FAULT_EVICT_INODE && !IS_FAULT_SET(type)) - return false; - - atomic_inc(&f2fs_fault.inject_ops); - if (atomic_read(&f2fs_fault.inject_ops) >= f2fs_fault.inject_rate) { - atomic_set(&f2fs_fault.inject_ops, 0); - printk("%sF2FS-fs : inject %s in %pF\n", - KERN_INFO, - fault_name[type], - __builtin_return_address(0)); - return true; - } - return false; -} +#define IS_FAULT_SET(fi, type) (fi->inject_type & (1 << (type))) #endif /* @@ -158,7 +128,7 @@ enum { CP_DISCARD, }; -#define DEF_BATCHED_TRIM_SECTIONS 32 +#define DEF_BATCHED_TRIM_SECTIONS 2 #define BATCHED_TRIM_SEGMENTS(sbi) \ (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) #define BATCHED_TRIM_BLOCKS(sbi) \ @@ -211,6 +181,13 @@ struct discard_entry { int len; /* # of consecutive blocks of the discard */ }; +struct bio_entry { + struct list_head list; + struct bio *bio; + struct completion event; + int error; +}; + /* for the list of fsync inodes, used only during recovery */ struct fsync_inode_entry { struct list_head list; /* list head */ @@ -645,6 +622,7 @@ struct f2fs_sm_info { /* for small discard management */ struct list_head discard_list; /* 4KB discard list */ + struct list_head wait_list; /* linked with issued discard bio */ int nr_discards; /* # of discards in the list */ int max_discards; /* max. discards to be issued */ @@ -748,6 +726,7 @@ enum { SBI_NEED_FSCK, /* need fsck.f2fs to fix */ SBI_POR_DOING, /* recovery is doing or not */ SBI_NEED_SB_WRITE, /* need to recover superblock */ + SBI_NEED_CP, /* need to checkpoint */ }; enum { @@ -765,7 +744,7 @@ struct f2fs_sb_info { struct proc_dir_entry *s_proc; /* proc entry */ struct f2fs_super_block *raw_super; /* raw super block pointer */ int valid_super_block; /* valid super block no */ - int s_flag; /* flags for sbi */ + unsigned long s_flag; /* flags for sbi */ #ifdef CONFIG_F2FS_FS_ENCRYPTION u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE]; @@ -785,6 +764,7 @@ struct f2fs_sb_info { /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ + spinlock_t cp_lock; /* for flag in ckpt */ struct inode *meta_inode; /* cache meta blocks */ struct mutex cp_mutex; /* checkpoint procedure lock */ struct rw_semaphore cp_rwsem; /* blocking FS operations */ @@ -892,8 +872,37 @@ struct f2fs_sb_info { /* Reference to checksum algorithm driver via cryptoapi */ struct crypto_shash *s_chksum_driver; + + /* For fault injection */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct f2fs_fault_info fault_info; +#endif }; +#ifdef CONFIG_F2FS_FAULT_INJECTION +static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) +{ + struct f2fs_fault_info *ffi = &sbi->fault_info; + + if (!ffi->inject_rate) + return false; + + if (!IS_FAULT_SET(ffi, type)) + return false; + + atomic_inc(&ffi->inject_ops); + if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { + atomic_set(&ffi->inject_ops, 0); + printk("%sF2FS-fs : inject %s in %pF\n", + KERN_INFO, + fault_name[type], + __builtin_return_address(0)); + return true; + } + return false; +} +#endif + /* For write statistics. Suppose sector size is 512 bytes, * and the return value is in kbytes. s is of struct f2fs_sb_info. */ @@ -1034,17 +1043,17 @@ static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi) static inline bool is_sbi_flag_set(struct f2fs_sb_info *sbi, unsigned int type) { - return sbi->s_flag & (0x01 << type); + return test_bit(type, &sbi->s_flag); } static inline void set_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type) { - sbi->s_flag |= (0x01 << type); + set_bit(type, &sbi->s_flag); } static inline void clear_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type) { - sbi->s_flag &= ~(0x01 << type); + clear_bit(type, &sbi->s_flag); } static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) @@ -1052,26 +1061,57 @@ static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) return le64_to_cpu(cp->checkpoint_ver); } -static inline bool is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +static inline bool __is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) { unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + return ckpt_flags & f; } -static inline void set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +static inline bool is_set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + return __is_set_ckpt_flags(F2FS_CKPT(sbi), f); +} + +static inline void __set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int ckpt_flags; + + ckpt_flags = le32_to_cpu(cp->ckpt_flags); ckpt_flags |= f; cp->ckpt_flags = cpu_to_le32(ckpt_flags); } -static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +static inline void set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + spin_lock(&sbi->cp_lock); + __set_ckpt_flags(F2FS_CKPT(sbi), f); + spin_unlock(&sbi->cp_lock); +} + +static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int ckpt_flags; + + ckpt_flags = le32_to_cpu(cp->ckpt_flags); ckpt_flags &= (~f); cp->ckpt_flags = cpu_to_le32(ckpt_flags); } +static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) +{ + spin_lock(&sbi->cp_lock); + __clear_ckpt_flags(F2FS_CKPT(sbi), f); + spin_unlock(&sbi->cp_lock); +} + +static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) +{ + struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); + + return blk_queue_discard(q); +} + static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { down_read(&sbi->cp_rwsem); @@ -1110,8 +1150,8 @@ static inline bool __remain_node_summaries(int reason) static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) { - return (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) || - is_set_ckpt_flags(F2FS_CKPT(sbi), CP_FASTBOOT_FLAG)); + return (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG) || + is_set_ckpt_flags(sbi, CP_FASTBOOT_FLAG)); } /* @@ -1151,7 +1191,7 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, blkcnt_t diff; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(FAULT_BLOCK)) + if (time_to_inject(sbi, FAULT_BLOCK)) return false; #endif /* @@ -1193,6 +1233,10 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { percpu_counter_inc(&sbi->nr_pages[count_type]); + + if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES) + return; + set_sbi_flag(sbi, SBI_IS_DIRTY); } @@ -1243,6 +1287,11 @@ static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) return sbi->total_valid_block_count; } +static inline block_t discard_blocks(struct f2fs_sb_info *sbi) +{ + return sbi->discard_blks; +} + static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1376,7 +1425,7 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, if (page) return page; - if (time_to_inject(FAULT_PAGE_ALLOC)) + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) return NULL; #endif if (!for_write) @@ -1804,7 +1853,7 @@ static inline int f2fs_readonly(struct super_block *sb) static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) { - return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); + return is_set_ckpt_flags(sbi, CP_ERROR_FLAG); } static inline bool is_dot_dotdot(const struct qstr *str) @@ -1827,10 +1876,11 @@ static inline bool f2fs_may_extent_tree(struct inode *inode) return S_ISREG(inode->i_mode); } -static inline void *f2fs_kmalloc(size_t size, gfp_t flags) +static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) { #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(FAULT_KMALLOC)) + if (time_to_inject(sbi, FAULT_KMALLOC)) return NULL; #endif return kmalloc(size, flags); @@ -1885,6 +1935,7 @@ long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); */ void f2fs_set_inode_flags(struct inode *); struct inode *f2fs_iget(struct super_block *, unsigned long); +struct inode *f2fs_iget_retry(struct super_block *, unsigned long); int try_to_free_nats(struct f2fs_sb_info *, int); int update_inode(struct inode *, struct page *); int update_inode_page(struct inode *); @@ -1900,7 +1951,6 @@ struct dentry *f2fs_get_parent(struct dentry *child); /* * dir.c */ -extern unsigned char f2fs_filetype_table[F2FS_FT_MAX]; void set_de_type(struct f2fs_dir_entry *, umode_t); unsigned char get_de_type(struct f2fs_dir_entry *); struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, @@ -1910,10 +1960,12 @@ bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, void do_make_empty_dir(struct inode *, struct inode *, struct f2fs_dentry_ptr *); struct page *init_inode_metadata(struct inode *, struct inode *, - const struct qstr *, struct page *); + const struct qstr *, const struct qstr *, struct page *); void update_parent_metadata(struct inode *, struct inode *, unsigned int); int room_for_filename(const void *, int, int); void f2fs_drop_nlink(struct inode *, struct inode *); +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *, struct fscrypt_name *, + struct page **); struct f2fs_dir_entry *f2fs_find_entry(struct inode *, const struct qstr *, struct page **); struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); @@ -1924,7 +1976,9 @@ int update_dent_inode(struct inode *, struct inode *, const struct qstr *); void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *, const struct qstr *, f2fs_hash_t , unsigned int); int f2fs_add_regular_entry(struct inode *, const struct qstr *, - struct inode *, nid_t, umode_t); + const struct qstr *, struct inode *, nid_t, umode_t); +int __f2fs_do_add_link(struct inode *, struct fscrypt_name*, struct inode *, + nid_t, umode_t); int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t, umode_t); void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *, @@ -2010,9 +2064,9 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *); void invalidate_blocks(struct f2fs_sb_info *, block_t); bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); +void f2fs_wait_all_discard_bio(struct f2fs_sb_info *); void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); void release_discard_addrs(struct f2fs_sb_info *); -bool discard_next_dnode(struct f2fs_sb_info *, block_t); int npages_for_summary_flush(struct f2fs_sb_info *, bool); void allocate_new_segments(struct f2fs_sb_info *); int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); @@ -2095,6 +2149,10 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); void f2fs_set_page_dirty_nobuffers(struct page *); void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); int f2fs_release_page(struct page *, gfp_t); +#ifdef CONFIG_MIGRATION +int f2fs_migrate_page(struct address_space *, struct page *, struct page *, + enum migrate_mode); +#endif /* * gc.c @@ -2123,13 +2181,14 @@ struct f2fs_stat_info { unsigned long long hit_largest, hit_cached, hit_rbtree; unsigned long long hit_total, total_ext; int ext_tree, zombie_tree, ext_node; - s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, inmem_pages; + s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; + s64 inmem_pages; unsigned int ndirty_dirs, ndirty_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits, fnids; int total_count, utilization; int bg_gc, wb_bios; int inline_xattr, inline_inode, inline_dir, orphans; - unsigned int valid_count, valid_node_count, valid_inode_count; + unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; int rsvd_segs, overp_segs; @@ -2294,8 +2353,8 @@ bool recover_inline_data(struct inode *, struct page *); struct f2fs_dir_entry *find_in_inline_dir(struct inode *, struct fscrypt_name *, struct page **); int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *); -int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *, - nid_t, umode_t); +int f2fs_add_inline_entry(struct inode *, const struct qstr *, + const struct qstr *, struct inode *, nid_t, umode_t); void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *, struct inode *, struct inode *); bool f2fs_empty_inline_dir(struct inode *); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 28f4f4c..c786507 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -135,7 +135,7 @@ static inline bool need_do_checkpoint(struct inode *inode) if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) need_cp = true; - else if (file_enc_name(inode) && need_dentry_mark(sbi, inode->i_ino)) + else if (is_sbi_flag_set(sbi, SBI_NEED_CP)) need_cp = true; else if (file_wrong_pino(inode)) need_cp = true; @@ -523,7 +523,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, return 0; if (cache_only) { - page = f2fs_grab_cache_page(mapping, index, false); + page = find_lock_page(mapping, index); if (page && PageUptodate(page)) goto truncate_out; f2fs_put_page(page, 1); @@ -631,7 +631,7 @@ int f2fs_truncate(struct inode *inode) if (err) return err; - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode); return 0; } @@ -680,7 +680,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) struct inode *inode = d_inode(dentry); int err; - err = inode_change_ok(inode, attr); + err = setattr_prepare(dentry, attr); if (err) return err; @@ -708,7 +708,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; } - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); } } @@ -732,10 +732,7 @@ const struct inode_operations f2fs_file_inode_operations = { .get_acl = f2fs_get_acl, .set_acl = f2fs_set_acl, #ifdef CONFIG_F2FS_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = f2fs_listxattr, - .removexattr = generic_removexattr, #endif .fiemap = f2fs_fiemap, }; @@ -1395,7 +1392,7 @@ static long f2fs_fallocate(struct file *file, int mode, } if (!ret) { - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } @@ -1454,7 +1451,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_inode_info *fi = F2FS_I(inode); - unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE; + unsigned int flags; unsigned int oldflags; int ret; @@ -1487,7 +1484,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) fi->i_flags = flags; inode_unlock(inode); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); f2fs_set_inode_flags(inode); out: mnt_drop_write_file(filp); @@ -1954,7 +1951,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, * avoid defragment running in SSR mode when free section are allocated * intensively */ - if (has_not_enough_free_secs(sbi, sec_num)) { + if (has_not_enough_free_secs(sbi, 0, sec_num)) { err = -EAGAIN; goto out; } @@ -2085,6 +2082,13 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst)) return -EOPNOTSUPP; + if (src == dst) { + if (pos_in == pos_out) + return 0; + if (pos_out > pos_in && pos_out < pos_in + len) + return -EINVAL; + } + inode_lock(src); if (src != dst) { if (!inode_trylock(dst)) { @@ -2136,8 +2140,9 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); - ret = __exchange_data_block(src, dst, pos_in, - pos_out, len >> F2FS_BLKSIZE_BITS, false); + ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS, + pos_out >> F2FS_BLKSIZE_BITS, + len >> F2FS_BLKSIZE_BITS, false); if (!ret) { if (dst_max_i_size) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 8f7fa32..6f14ee9 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -47,6 +47,11 @@ static int gc_thread_func(void *data) continue; } +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_CHECKPOINT)) + f2fs_stop_checkpoint(sbi, false); +#endif + /* * [GC triggering condition] * 0. GC is not conducted currently. @@ -96,7 +101,7 @@ int start_gc_thread(struct f2fs_sb_info *sbi) dev_t dev = sbi->sb->s_bdev->bd_dev; int err = 0; - gc_th = f2fs_kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); + gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL); if (!gc_th) { err = -ENOMEM; goto out; @@ -270,7 +275,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct victim_sel_policy p; - unsigned int secno, max_cost, last_victim; + unsigned int secno, last_victim; unsigned int last_segment = MAIN_SEGS(sbi); unsigned int nsearched = 0; @@ -280,7 +285,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, select_policy(sbi, gc_type, type, &p); p.min_segno = NULL_SEGNO; - p.min_cost = max_cost = get_max_cost(sbi, &p); + p.min_cost = get_max_cost(sbi, &p); if (p.max_search == 0) goto out; @@ -423,10 +428,10 @@ static int check_valid_map(struct f2fs_sb_info *sbi, static void gc_node_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, unsigned int segno, int gc_type) { - bool initial = true; struct f2fs_summary *entry; block_t start_addr; int off; + int phase = 0; start_addr = START_BLOCK(sbi, segno); @@ -439,16 +444,24 @@ next_step: struct node_info ni; /* stop BG_GC if there is not enough free sections. */ - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) return; if (check_valid_map(sbi, segno, off) == 0) continue; - if (initial) { + if (phase == 0) { + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, + META_NAT, true); + continue; + } + + if (phase == 1) { ra_node_page(sbi, nid); continue; } + + /* phase == 2 */ node_page = get_node_page(sbi, nid); if (IS_ERR(node_page)) continue; @@ -469,10 +482,8 @@ next_step: stat_inc_node_blk_count(sbi, 1, gc_type); } - if (initial) { - initial = false; + if (++phase < 3) goto next_step; - } } /* @@ -706,16 +717,23 @@ next_step: struct node_info dni; /* dnode info for the data */ unsigned int ofs_in_node, nofs; block_t start_bidx; + nid_t nid = le32_to_cpu(entry->nid); /* stop BG_GC if there is not enough free sections. */ - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) return; if (check_valid_map(sbi, segno, off) == 0) continue; if (phase == 0) { - ra_node_page(sbi, le32_to_cpu(entry->nid)); + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, + META_NAT, true); + continue; + } + + if (phase == 1) { + ra_node_page(sbi, nid); continue; } @@ -723,14 +741,14 @@ next_step: if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs)) continue; - if (phase == 1) { + if (phase == 2) { ra_node_page(sbi, dni.ino); continue; } ofs_in_node = le16_to_cpu(entry->ofs_in_node); - if (phase == 2) { + if (phase == 3) { inode = f2fs_iget(sb, dni.ino); if (IS_ERR(inode) || is_bad_inode(inode)) continue; @@ -756,7 +774,7 @@ next_step: continue; } - /* phase 3 */ + /* phase 4 */ inode = find_gc_inode(gc_list, dni.ino); if (inode) { struct f2fs_inode_info *fi = F2FS_I(inode); @@ -789,7 +807,7 @@ next_step: } } - if (++phase < 4) + if (++phase < 5) goto next_step; } @@ -815,7 +833,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, struct blk_plug plug; unsigned int segno = start_segno; unsigned int end_segno = start_segno + sbi->segs_per_sec; - int seg_freed = 0; + int sec_freed = 0; unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? SUM_TYPE_DATA : SUM_TYPE_NODE; @@ -834,15 +852,16 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, for (segno = start_segno; segno < end_segno; segno++) { - if (get_valid_blocks(sbi, segno, 1) == 0) - continue; - /* find segment summary of victim */ sum_page = find_get_page(META_MAPPING(sbi), GET_SUM_BLOCK(sbi, segno)); - f2fs_bug_on(sbi, !PageUptodate(sum_page)); f2fs_put_page(sum_page, 0); + if (get_valid_blocks(sbi, segno, 1) == 0 || + !PageUptodate(sum_page) || + unlikely(f2fs_cp_error(sbi))) + goto next; + sum = page_address(sum_page); f2fs_bug_on(sbi, type != GET_SUM_TYPE((&sum->footer))); @@ -861,7 +880,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, gc_type); stat_inc_seg_count(sbi, type, gc_type); - +next: f2fs_put_page(sum_page, 0); } @@ -871,22 +890,20 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, blk_finish_plug(&plug); - if (gc_type == FG_GC) { - while (start_segno < end_segno) - if (get_valid_blocks(sbi, start_segno++, 1) == 0) - seg_freed++; - } + if (gc_type == FG_GC && + get_valid_blocks(sbi, start_segno, sbi->segs_per_sec) == 0) + sec_freed = 1; stat_inc_call_count(sbi->stat_info); - return seg_freed; + return sec_freed; } int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) { unsigned int segno; int gc_type = sync ? FG_GC : BG_GC; - int sec_freed = 0, seg_freed; + int sec_freed = 0; int ret = -EINVAL; struct cp_control cpc; struct gc_inode_list gc_list = { @@ -905,7 +922,7 @@ gc_more: goto stop; } - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) { + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed, 0)) { gc_type = FG_GC; /* * If there is no victim and no prefree segment but still not @@ -914,10 +931,14 @@ gc_more: */ if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi)) { - write_checkpoint(sbi, &cpc); + ret = write_checkpoint(sbi, &cpc); + if (ret) + goto stop; segno = NULL_SEGNO; - } else if (has_not_enough_free_secs(sbi, 0)) { - write_checkpoint(sbi, &cpc); + } else if (has_not_enough_free_secs(sbi, 0, 0)) { + ret = write_checkpoint(sbi, &cpc); + if (ret) + goto stop; } } @@ -925,20 +946,19 @@ gc_more: goto stop; ret = 0; - seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type); - - if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec) + if (do_garbage_collect(sbi, segno, &gc_list, gc_type) && + gc_type == FG_GC) sec_freed++; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; if (!sync) { - if (has_not_enough_free_secs(sbi, sec_freed)) + if (has_not_enough_free_secs(sbi, sec_freed, 0)) goto gc_more; if (gc_type == FG_GC) - write_checkpoint(sbi, &cpc); + ret = write_checkpoint(sbi, &cpc); } stop: mutex_unlock(&sbi->gc_mutex); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index ccea873..5f1a67f 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -424,7 +424,7 @@ static int f2fs_add_inline_entries(struct inode *dir, ino = le32_to_cpu(de->ino); fake_mode = get_de_type(de) << S_SHIFT; - err = f2fs_add_regular_entry(dir, &new_name, NULL, + err = f2fs_add_regular_entry(dir, &new_name, NULL, NULL, ino, fake_mode); if (err) goto punch_dentry_pages; @@ -445,8 +445,8 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, struct f2fs_inline_dentry *backup_dentry; int err; - backup_dentry = f2fs_kmalloc(sizeof(struct f2fs_inline_dentry), - GFP_F2FS_ZERO); + backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir), + sizeof(struct f2fs_inline_dentry), GFP_F2FS_ZERO); if (!backup_dentry) { f2fs_put_page(ipage, 1); return -ENOMEM; @@ -488,17 +488,17 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, return f2fs_move_rehashed_dirents(dir, ipage, inline_dentry); } -int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, - struct inode *inode, nid_t ino, umode_t mode) +int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, + struct inode *inode, nid_t ino, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct page *ipage; unsigned int bit_pos; f2fs_hash_t name_hash; - size_t namelen = name->len; struct f2fs_inline_dentry *dentry_blk = NULL; struct f2fs_dentry_ptr d; - int slots = GET_DENTRY_SLOTS(namelen); + int slots = GET_DENTRY_SLOTS(new_name->len); struct page *page = NULL; int err = 0; @@ -519,18 +519,21 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, if (inode) { down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, name, ipage); + page = init_inode_metadata(inode, dir, new_name, + orig_name, ipage); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; } + if (f2fs_encrypted_inode(dir)) + file_set_enc_name(inode); } f2fs_wait_on_page_writeback(ipage, NODE, true); - name_hash = f2fs_dentry_hash(name); + name_hash = f2fs_dentry_hash(new_name); make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); - f2fs_update_dentry(ino, mode, &d, name, name_hash, bit_pos); + f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); set_page_dirty(ipage); @@ -563,13 +566,13 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, inline_dentry = inline_data_addr(page); bit_pos = dentry - inline_dentry->dentry; for (i = 0; i < slots; i++) - test_and_clear_bit_le(bit_pos + i, + __clear_bit_le(bit_pos + i, &inline_dentry->dentry_bitmap); set_page_dirty(page); f2fs_put_page(page, 1); - dir->i_ctime = dir->i_mtime = CURRENT_TIME; + dir->i_ctime = dir->i_mtime = current_time(dir); f2fs_mark_inode_dirty_sync(dir); if (inode) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 9ac5efc..d736989 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -11,6 +11,7 @@ #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/buffer_head.h> +#include <linux/backing-dev.h> #include <linux/writeback.h> #include "f2fs.h" @@ -234,6 +235,20 @@ bad_inode: return ERR_PTR(ret); } +struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino) +{ + struct inode *inode; +retry: + inode = f2fs_iget(sb, ino); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + } + return inode; +} + int update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; @@ -354,7 +369,7 @@ void f2fs_evict_inode(struct inode *inode) goto no_delete; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(FAULT_EVICT_INODE)) + if (time_to_inject(sbi, FAULT_EVICT_INODE)) goto no_delete; #endif diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 73fa356..489fa0d 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -46,7 +46,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) inode->i_ino = ino; inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_generation = sbi->s_next_generation++; err = insert_inode_locked(inode); @@ -91,18 +91,23 @@ static int is_multimedia_file(const unsigned char *s, const char *sub) { size_t slen = strlen(s); size_t sublen = strlen(sub); + int i; /* * filename format of multimedia file should be defined as: - * "filename + '.' + extension". + * "filename + '.' + extension + (optional: '.' + temp extension)". */ if (slen < sublen + 2) return 0; - if (s[slen - sublen - 1] != '.') - return 0; + for (i = 1; i < slen - sublen; i++) { + if (s[i] != '.') + continue; + if (!strncasecmp(s + i + 1, sub, sublen)) + return 1; + } - return !strncasecmp(s + slen - sublen, sub, sublen); + return 0; } /* @@ -177,7 +182,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, f2fs_balance_fs(sbi, true); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); ihold(inode); set_inode_flag(inode, FI_INC_LINK); @@ -449,7 +454,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, ostr.name = sd->encrypted_path; ostr.len = disk_link.len; err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); - if (err < 0) + if (err) goto err_out; sd->len = cpu_to_le16(ostr.len); @@ -718,7 +723,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_set_link(new_dir, new_entry, new_page, old_inode); - new_inode->i_ctime = CURRENT_TIME; + new_inode->i_ctime = current_time(new_inode); down_write(&F2FS_I(new_inode)->i_sem); if (old_dir_entry) f2fs_i_links_write(new_inode, false); @@ -772,7 +777,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, file_set_enc_name(old_inode); up_write(&F2FS_I(old_inode)->i_sem); - old_inode->i_ctime = CURRENT_TIME; + old_inode->i_ctime = current_time(old_inode); f2fs_mark_inode_dirty_sync(old_inode); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); @@ -927,7 +932,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, file_lost_pino(old_inode); up_write(&F2FS_I(old_inode)->i_sem); - old_dir->i_ctime = CURRENT_TIME; + old_dir->i_ctime = current_time(old_dir); if (old_nlink) { down_write(&F2FS_I(old_dir)->i_sem); f2fs_i_links_write(old_dir, old_nlink > 0); @@ -942,7 +947,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, file_lost_pino(new_inode); up_write(&F2FS_I(new_inode)->i_sem); - new_dir->i_ctime = CURRENT_TIME; + new_dir->i_ctime = current_time(new_dir); if (new_nlink) { down_write(&F2FS_I(new_dir)->i_sem); f2fs_i_links_write(new_dir, new_nlink > 0); @@ -1010,7 +1015,6 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, struct fscrypt_str cstr = FSTR_INIT(NULL, 0); struct fscrypt_str pstr = FSTR_INIT(NULL, 0); struct fscrypt_symlink_data *sd; - loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); u32 max_size = inode->i_sb->s_blocksize; int res; @@ -1025,7 +1029,6 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, if (IS_ERR(cpage)) return ERR_CAST(cpage); caddr = page_address(cpage); - caddr[size] = 0; /* Symlink is encrypted */ sd = (struct fscrypt_symlink_data *)caddr; @@ -1048,7 +1051,7 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, goto errout; res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); - if (res < 0) + if (res) goto errout; /* this is broken symlink case */ @@ -1060,7 +1063,7 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, paddr = pstr.name; /* Null-terminate the name */ - paddr[res] = '\0'; + paddr[pstr.len] = '\0'; put_page(cpage); set_delayed_call(done, kfree_link, paddr); @@ -1077,10 +1080,7 @@ const struct inode_operations f2fs_encrypted_symlink_inode_operations = { .getattr = f2fs_getattr, .setattr = f2fs_setattr, #ifdef CONFIG_F2FS_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = f2fs_listxattr, - .removexattr = generic_removexattr, #endif }; @@ -1093,17 +1093,14 @@ const struct inode_operations f2fs_dir_inode_operations = { .mkdir = f2fs_mkdir, .rmdir = f2fs_rmdir, .mknod = f2fs_mknod, - .rename2 = f2fs_rename2, + .rename = f2fs_rename2, .tmpfile = f2fs_tmpfile, .getattr = f2fs_getattr, .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, .set_acl = f2fs_set_acl, #ifdef CONFIG_F2FS_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = f2fs_listxattr, - .removexattr = generic_removexattr, #endif }; @@ -1113,10 +1110,7 @@ const struct inode_operations f2fs_symlink_inode_operations = { .getattr = f2fs_getattr, .setattr = f2fs_setattr, #ifdef CONFIG_F2FS_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = f2fs_listxattr, - .removexattr = generic_removexattr, #endif }; @@ -1126,9 +1120,6 @@ const struct inode_operations f2fs_special_inode_operations = { .get_acl = f2fs_get_acl, .set_acl = f2fs_set_acl, #ifdef CONFIG_F2FS_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = f2fs_listxattr, - .removexattr = generic_removexattr, #endif }; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index f75d197..01177ec 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -54,8 +54,6 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); if (excess_cached_nats(sbi)) res = false; - if (nm_i->nat_cnt > DEF_NAT_CACHE_THRESHOLD) - res = false; } else if (type == DIRTY_DENTS) { if (sbi->sb->s_bdi->wb.dirty_exceeded) return false; @@ -1314,6 +1312,7 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct page *last_page = NULL; bool marked = false; nid_t ino = inode->i_ino; + int nwritten = 0; if (atomic) { last_page = last_fsync_dnode(sbi, ino); @@ -1387,7 +1386,10 @@ continue_unlock: unlock_page(page); f2fs_put_page(last_page, 0); break; + } else { + nwritten++; } + if (page == last_page) { f2fs_put_page(page, 0); marked = true; @@ -1409,6 +1411,9 @@ continue_unlock: unlock_page(last_page); goto retry; } + + if (nwritten) + f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE); return ret ? -EIO: 0; } @@ -1418,6 +1423,7 @@ int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) struct pagevec pvec; int step = 0; int nwritten = 0; + int ret = 0; pagevec_init(&pvec, 0); @@ -1438,7 +1444,8 @@ next_step: if (unlikely(f2fs_cp_error(sbi))) { pagevec_release(&pvec); - return -EIO; + ret = -EIO; + goto out; } /* @@ -1489,6 +1496,8 @@ continue_unlock: if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc)) unlock_page(page); + else + nwritten++; if (--wbc->nr_to_write == 0) break; @@ -1506,14 +1515,17 @@ continue_unlock: step++; goto next_step; } - return nwritten; +out: + if (nwritten) + f2fs_submit_merged_bio(sbi, NODE, WRITE); + return ret; } int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) { pgoff_t index = 0, end = ULONG_MAX; struct pagevec pvec; - int ret2 = 0, ret = 0; + int ret2, ret = 0; pagevec_init(&pvec, 0); @@ -1542,10 +1554,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) cond_resched(); } - if (unlikely(test_and_clear_bit(AS_ENOSPC, &NODE_MAPPING(sbi)->flags))) - ret2 = -ENOSPC; - if (unlikely(test_and_clear_bit(AS_EIO, &NODE_MAPPING(sbi)->flags))) - ret2 = -EIO; + ret2 = filemap_check_errors(NODE_MAPPING(sbi)); if (!ret) ret = ret2; return ret; @@ -1672,6 +1681,9 @@ const struct address_space_operations f2fs_node_aops = { .set_page_dirty = f2fs_set_node_page_dirty, .invalidatepage = f2fs_invalidate_page, .releasepage = f2fs_release_page, +#ifdef CONFIG_MIGRATION + .migratepage = f2fs_migrate_page, +#endif }; static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, @@ -1838,7 +1850,7 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct free_nid *i = NULL; retry: #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(FAULT_ALLOC_NID)) + if (time_to_inject(sbi, FAULT_ALLOC_NID)) return false; #endif if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) @@ -2015,10 +2027,12 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) if (unlikely(old_ni.blk_addr != NULL_ADDR)) return -EINVAL; - +retry: ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false); - if (!ipage) - return -ENOMEM; + if (!ipage) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } /* Should not use this inode from free nid list */ remove_free_nid(NM_I(sbi), ino); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index fc76845..868bec6 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -229,6 +229,37 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) f2fs_change_bit(block_off, nm_i->nat_bitmap); } +static inline nid_t ino_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.ino); +} + +static inline nid_t nid_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.nid); +} + +static inline unsigned int ofs_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + unsigned flag = le32_to_cpu(rn->footer.flag); + return flag >> OFFSET_BIT_SHIFT; +} + +static inline __u64 cpver_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le64_to_cpu(rn->footer.cp_ver); +} + +static inline block_t next_blkaddr_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.next_blkaddr); +} + static inline void fill_node_footer(struct page *page, nid_t nid, nid_t ino, unsigned int ofs, bool reset) { @@ -259,40 +290,30 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); struct f2fs_node *rn = F2FS_NODE(page); + size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); + __u64 cp_ver = le64_to_cpu(ckpt->checkpoint_ver); - rn->footer.cp_ver = ckpt->checkpoint_ver; + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { + __u64 crc = le32_to_cpu(*((__le32 *) + ((unsigned char *)ckpt + crc_offset))); + cp_ver |= (crc << 32); + } + rn->footer.cp_ver = cpu_to_le64(cp_ver); rn->footer.next_blkaddr = cpu_to_le32(blkaddr); } -static inline nid_t ino_of_node(struct page *node_page) -{ - struct f2fs_node *rn = F2FS_NODE(node_page); - return le32_to_cpu(rn->footer.ino); -} - -static inline nid_t nid_of_node(struct page *node_page) +static inline bool is_recoverable_dnode(struct page *page) { - struct f2fs_node *rn = F2FS_NODE(node_page); - return le32_to_cpu(rn->footer.nid); -} - -static inline unsigned int ofs_of_node(struct page *node_page) -{ - struct f2fs_node *rn = F2FS_NODE(node_page); - unsigned flag = le32_to_cpu(rn->footer.flag); - return flag >> OFFSET_BIT_SHIFT; -} - -static inline unsigned long long cpver_of_node(struct page *node_page) -{ - struct f2fs_node *rn = F2FS_NODE(node_page); - return le64_to_cpu(rn->footer.cp_ver); -} + struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); + size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); + __u64 cp_ver = cur_cp_version(ckpt); -static inline block_t next_blkaddr_of_node(struct page *node_page) -{ - struct f2fs_node *rn = F2FS_NODE(node_page); - return le32_to_cpu(rn->footer.next_blkaddr); + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { + __u64 crc = le32_to_cpu(*((__le32 *) + ((unsigned char *)ckpt + crc_offset))); + cp_ver |= (crc << 32); + } + return cpu_to_le64(cp_ver) == cpver_of_node(page); } /* diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 9e652d5..2fc84a9 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -68,15 +68,17 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, return NULL; } -static struct fsync_inode_entry *add_fsync_inode(struct list_head *head, - struct inode *inode) +static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, + struct list_head *head, nid_t ino) { + struct inode *inode; struct fsync_inode_entry *entry; - entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); - if (!entry) - return NULL; + inode = f2fs_iget_retry(sbi->sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); entry->inode = inode; list_add_tail(&entry->list, head); @@ -96,48 +98,41 @@ static int recover_dentry(struct inode *inode, struct page *ipage, struct f2fs_inode *raw_inode = F2FS_INODE(ipage); nid_t pino = le32_to_cpu(raw_inode->i_pino); struct f2fs_dir_entry *de; - struct qstr name; + struct fscrypt_name fname; struct page *page; struct inode *dir, *einode; struct fsync_inode_entry *entry; int err = 0; + char *name; entry = get_fsync_inode(dir_list, pino); if (!entry) { - dir = f2fs_iget(inode->i_sb, pino); - if (IS_ERR(dir)) { - err = PTR_ERR(dir); - goto out; - } - - entry = add_fsync_inode(dir_list, dir); - if (!entry) { - err = -ENOMEM; - iput(dir); + entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, pino); + if (IS_ERR(entry)) { + dir = ERR_CAST(entry); + err = PTR_ERR(entry); goto out; } } dir = entry->inode; - if (file_enc_name(inode)) - return 0; + memset(&fname, 0, sizeof(struct fscrypt_name)); + fname.disk_name.len = le32_to_cpu(raw_inode->i_namelen); + fname.disk_name.name = raw_inode->i_name; - name.len = le32_to_cpu(raw_inode->i_namelen); - name.name = raw_inode->i_name; - - if (unlikely(name.len > F2FS_NAME_LEN)) { + if (unlikely(fname.disk_name.len > F2FS_NAME_LEN)) { WARN_ON(1); err = -ENAMETOOLONG; goto out; } retry: - de = f2fs_find_entry(dir, &name, &page); + de = __f2fs_find_entry(dir, &fname, &page); if (de && inode->i_ino == le32_to_cpu(de->ino)) goto out_unmap_put; if (de) { - einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino)); + einode = f2fs_iget_retry(inode->i_sb, le32_to_cpu(de->ino)); if (IS_ERR(einode)) { WARN_ON(1); err = PTR_ERR(einode); @@ -156,18 +151,24 @@ retry: } else if (IS_ERR(page)) { err = PTR_ERR(page); } else { - err = __f2fs_add_link(dir, &name, inode, + err = __f2fs_do_add_link(dir, &fname, inode, inode->i_ino, inode->i_mode); } + if (err == -ENOMEM) + goto retry; goto out; out_unmap_put: f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); out: + if (file_enc_name(inode)) + name = "<encrypted>"; + else + name = raw_inode->i_name; f2fs_msg(inode->i_sb, KERN_NOTICE, "%s: ino = %x, name = %s, dir = %lx, err = %d", - __func__, ino_of_node(ipage), raw_inode->i_name, + __func__, ino_of_node(ipage), name, IS_ERR(dir) ? 0 : dir->i_ino, err); return err; } @@ -223,9 +224,7 @@ static bool is_same_inode(struct inode *inode, struct page *ipage) static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { - unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; - struct inode *inode; struct page *page = NULL; block_t blkaddr; int err = 0; @@ -242,7 +241,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) page = get_tmp_page(sbi, blkaddr); - if (cp_ver != cpver_of_node(page)) + if (!is_recoverable_dnode(page)) break; if (!is_fsync_dnode(page)) @@ -263,23 +262,15 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) * CP | dnode(F) | inode(DF) * For this case, we should not give up now. */ - inode = f2fs_iget(sbi->sb, ino_of_node(page)); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); + entry = add_fsync_inode(sbi, head, ino_of_node(page)); + if (IS_ERR(entry)) { + err = PTR_ERR(entry); if (err == -ENOENT) { err = 0; goto next; } break; } - - /* add this fsync inode to the list */ - entry = add_fsync_inode(head, inode); - if (!entry) { - err = -ENOMEM; - iput(inode); - break; - } } entry->blkaddr = blkaddr; @@ -363,7 +354,7 @@ got_it: if (ino != dn->inode->i_ino) { /* Deallocate previous index in the node page */ - inode = f2fs_iget(sbi->sb, ino); + inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return PTR_ERR(inode); } else { @@ -431,10 +422,15 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, end = start + ADDRS_PER_PAGE(page, inode); set_new_dnode(&dn, inode, NULL, NULL, 0); - +retry_dn: err = get_dnode_of_data(&dn, start, ALLOC_NODE); - if (err) + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_dn; + } goto out; + } f2fs_wait_on_page_writeback(dn.node_page, NODE, true); @@ -485,11 +481,16 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (err) goto err; } - +retry_prev: /* Check the previous node page having this index */ err = check_index_in_prev_nodes(sbi, dest, &dn); - if (err) + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_prev; + } goto err; + } /* write dummy data page */ f2fs_replace_block(sbi, &dn, src, dest, @@ -514,7 +515,6 @@ out: static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, struct list_head *dir_list) { - unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; struct page *page = NULL; int err = 0; @@ -534,7 +534,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, page = get_tmp_page(sbi, blkaddr); - if (cp_ver != cpver_of_node(page)) { + if (!is_recoverable_dnode(page)) { f2fs_put_page(page, 1); break; } @@ -626,38 +626,20 @@ out: } clear_sbi_flag(sbi, SBI_POR_DOING); - if (err) { - bool invalidate = false; - - if (test_opt(sbi, LFS)) { - update_meta_page(sbi, NULL, blkaddr); - invalidate = true; - } else if (discard_next_dnode(sbi, blkaddr)) { - invalidate = true; - } - - /* Flush all the NAT/SIT pages */ - while (get_pages(sbi, F2FS_DIRTY_META)) - sync_meta_pages(sbi, META, LONG_MAX); + if (err) + set_ckpt_flags(sbi, CP_ERROR_FLAG); + mutex_unlock(&sbi->cp_mutex); - /* invalidate temporary meta page */ - if (invalidate) - invalidate_mapping_pages(META_MAPPING(sbi), - blkaddr, blkaddr); + /* let's drop all the directory inodes for clean checkpoint */ + destroy_fsync_dnodes(&dir_list); - set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); - mutex_unlock(&sbi->cp_mutex); - } else if (need_writecp) { + if (!err && need_writecp) { struct cp_control cpc = { .reason = CP_RECOVERY, }; - mutex_unlock(&sbi->cp_mutex); err = write_checkpoint(sbi, &cpc); - } else { - mutex_unlock(&sbi->cp_mutex); } - destroy_fsync_dnodes(&dir_list); kmem_cache_destroy(fsync_entry_slab); return ret ? ret: err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a46296f..fc886f0 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -26,6 +26,7 @@ #define __reverse_ffz(x) __reverse_ffs(~(x)) static struct kmem_cache *discard_entry_slab; +static struct kmem_cache *bio_entry_slab; static struct kmem_cache *sit_entry_set_slab; static struct kmem_cache *inmem_entry_slab; @@ -344,6 +345,11 @@ int commit_inmem_pages(struct inode *inode) */ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_CHECKPOINT)) + f2fs_stop_checkpoint(sbi, false); +#endif + if (!need) return; @@ -355,7 +361,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) * We should do GC or end up with checkpoint, if there are so many dirty * dir/node pages without enough free segments. */ - if (has_not_enough_free_secs(sbi, 0)) { + if (has_not_enough_free_secs(sbi, 0, 0)) { mutex_lock(&sbi->gc_mutex); f2fs_gc(sbi, false); } @@ -580,6 +586,74 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } +static struct bio_entry *__add_bio_entry(struct f2fs_sb_info *sbi, + struct bio *bio) +{ + struct list_head *wait_list = &(SM_I(sbi)->wait_list); + struct bio_entry *be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS); + + INIT_LIST_HEAD(&be->list); + be->bio = bio; + init_completion(&be->event); + list_add_tail(&be->list, wait_list); + + return be; +} + +void f2fs_wait_all_discard_bio(struct f2fs_sb_info *sbi) +{ + struct list_head *wait_list = &(SM_I(sbi)->wait_list); + struct bio_entry *be, *tmp; + + list_for_each_entry_safe(be, tmp, wait_list, list) { + struct bio *bio = be->bio; + int err; + + wait_for_completion_io(&be->event); + err = be->error; + if (err == -EOPNOTSUPP) + err = 0; + + if (err) + f2fs_msg(sbi->sb, KERN_INFO, + "Issue discard failed, ret: %d", err); + + bio_put(bio); + list_del(&be->list); + kmem_cache_free(bio_entry_slab, be); + } +} + +static void f2fs_submit_bio_wait_endio(struct bio *bio) +{ + struct bio_entry *be = (struct bio_entry *)bio->bi_private; + + be->error = bio->bi_error; + complete(&be->event); +} + +/* this function is copied from blkdev_issue_discard from block/blk-lib.c */ +int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) +{ + struct block_device *bdev = sbi->sb->s_bdev; + struct bio *bio = NULL; + int err; + + err = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, flags, + &bio); + if (!err && bio) { + struct bio_entry *be = __add_bio_entry(sbi, bio); + + bio->bi_private = be; + bio->bi_end_io = f2fs_submit_bio_wait_endio; + bio->bi_opf |= REQ_SYNC; + submit_bio(bio); + } + + return err; +} + static int f2fs_issue_discard(struct f2fs_sb_info *sbi, block_t blkstart, block_t blklen) { @@ -597,29 +671,7 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, sbi->discard_blks--; } trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); - return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); -} - -bool discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) -{ - int err = -EOPNOTSUPP; - - if (test_opt(sbi, DISCARD)) { - struct seg_entry *se = get_seg_entry(sbi, - GET_SEGNO(sbi, blkaddr)); - unsigned int offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); - - if (f2fs_test_bit(offset, se->discard_map)) - return false; - - err = f2fs_issue_discard(sbi, blkaddr, 1); - } - - if (err) { - update_meta_page(sbi, NULL, blkaddr); - return true; - } - return false; + return __f2fs_issue_discard_async(sbi, start, len, GFP_NOFS, 0); } static void __add_discard_entry(struct f2fs_sb_info *sbi, @@ -660,7 +712,7 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) bool force = (cpc->reason == CP_DISCARD); int i; - if (se->valid_blocks == max_blocks) + if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) return; if (!force) { @@ -719,11 +771,14 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct list_head *head = &(SM_I(sbi)->discard_list); struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct blk_plug plug; unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; unsigned int secno, start_segno; bool force = (cpc->reason == CP_DISCARD); + blk_start_plug(&plug); + mutex_lock(&dirty_i->seglist_lock); while (1) { @@ -772,6 +827,8 @@ skip: SM_I(sbi)->nr_discards -= entry->len; kmem_cache_free(discard_entry_slab, entry); } + + blk_finish_plug(&plug); } static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) @@ -818,12 +875,14 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) if (del > 0) { if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) f2fs_bug_on(sbi, 1); - if (!f2fs_test_and_set_bit(offset, se->discard_map)) + if (f2fs_discard_en(sbi) && + !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } else { if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) f2fs_bug_on(sbi, 1); - if (f2fs_test_and_clear_bit(offset, se->discard_map)) + if (f2fs_discard_en(sbi) && + f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; } if (!f2fs_test_bit(offset, se->ckpt_valid_map)) @@ -1202,7 +1261,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; - if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0)) + if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0, 0)) return v_ops->get_victim(sbi, &(curseg)->next_segno, BG_GC, type, SSR); @@ -1277,6 +1336,12 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) if (end <= MAIN_BLKADDR(sbi)) goto out; + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { + f2fs_msg(sbi->sb, KERN_WARNING, + "Found FS corruption, run fsck to fix."); + goto out; + } + /* start/end segment number in main_area */ start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 : @@ -1301,6 +1366,10 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) mutex_lock(&sbi->gc_mutex); err = write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); + if (err) + break; + + schedule(); } out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); @@ -1391,7 +1460,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, /* direct_io'ed data is aligned to the segment for better performance */ if (direct_io && curseg->next_blkoff && - !has_not_enough_free_secs(sbi, 0)) + !has_not_enough_free_secs(sbi, 0, 0)) __allocate_new_segments(sbi, type); *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); @@ -1589,11 +1658,9 @@ void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, { struct page *cpage; - if (blkaddr == NEW_ADDR) + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) return; - f2fs_bug_on(sbi, blkaddr == NULL_ADDR); - cpage = find_lock_page(META_MAPPING(sbi), blkaddr); if (cpage) { f2fs_wait_on_page_writeback(cpage, DATA, true); @@ -1739,7 +1806,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) int type = CURSEG_HOT_DATA; int err; - if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) { + if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) { int npages = npages_for_summary_flush(sbi, true); if (npages >= 2) @@ -1836,7 +1903,7 @@ static void write_normal_summaries(struct f2fs_sb_info *sbi, void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { - if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) + if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) write_compacted_summaries(sbi, start_blk); else write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA); @@ -2127,12 +2194,16 @@ static int build_sit_info(struct f2fs_sb_info *sbi) = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); sit_i->sentries[start].ckpt_valid_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); - sit_i->sentries[start].discard_map - = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); if (!sit_i->sentries[start].cur_valid_map || - !sit_i->sentries[start].ckpt_valid_map || - !sit_i->sentries[start].discard_map) + !sit_i->sentries[start].ckpt_valid_map) return -ENOMEM; + + if (f2fs_discard_en(sbi)) { + sit_i->sentries[start].discard_map + = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + if (!sit_i->sentries[start].discard_map) + return -ENOMEM; + } } sit_i->tmp_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); @@ -2239,6 +2310,8 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); struct f2fs_journal *journal = curseg->journal; + struct seg_entry *se; + struct f2fs_sit_entry sit; int sit_blk_cnt = SIT_BLK_CNT(sbi); unsigned int i, start, end; unsigned int readed, start_blk = 0; @@ -2251,41 +2324,58 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) end = (start_blk + readed) * sit_i->sents_per_block; for (; start < end && start < MAIN_SEGS(sbi); start++) { - struct seg_entry *se = &sit_i->sentries[start]; struct f2fs_sit_block *sit_blk; - struct f2fs_sit_entry sit; struct page *page; - down_read(&curseg->journal_rwsem); - for (i = 0; i < sits_in_cursum(journal); i++) { - if (le32_to_cpu(segno_in_journal(journal, i)) - == start) { - sit = sit_in_journal(journal, i); - up_read(&curseg->journal_rwsem); - goto got_it; - } - } - up_read(&curseg->journal_rwsem); - + se = &sit_i->sentries[start]; page = get_current_sit_page(sbi, start); sit_blk = (struct f2fs_sit_block *)page_address(page); sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; f2fs_put_page(page, 1); -got_it: + check_block_count(sbi, start, &sit); seg_info_from_raw_sit(se, &sit); /* build discard map only one time */ - memcpy(se->discard_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += sbi->blocks_per_seg - se->valid_blocks; - - if (sbi->segs_per_sec > 1) { - struct sec_entry *e = get_sec_entry(sbi, start); - e->valid_blocks += se->valid_blocks; + if (f2fs_discard_en(sbi)) { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += sbi->blocks_per_seg - + se->valid_blocks; } + + if (sbi->segs_per_sec > 1) + get_sec_entry(sbi, start)->valid_blocks += + se->valid_blocks; } start_blk += readed; } while (start_blk < sit_blk_cnt); + + down_read(&curseg->journal_rwsem); + for (i = 0; i < sits_in_cursum(journal); i++) { + unsigned int old_valid_blocks; + + start = le32_to_cpu(segno_in_journal(journal, i)); + se = &sit_i->sentries[start]; + sit = sit_in_journal(journal, i); + + old_valid_blocks = se->valid_blocks; + + check_block_count(sbi, start, &sit); + seg_info_from_raw_sit(se, &sit); + + if (f2fs_discard_en(sbi)) { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks - + se->valid_blocks; + } + + if (sbi->segs_per_sec > 1) + get_sec_entry(sbi, start)->valid_blocks += + se->valid_blocks - old_valid_blocks; + } + up_read(&curseg->journal_rwsem); } static void init_free_segmap(struct f2fs_sb_info *sbi) @@ -2427,6 +2517,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; INIT_LIST_HEAD(&sm_info->discard_list); + INIT_LIST_HEAD(&sm_info->wait_list); sm_info->nr_discards = 0; sm_info->max_discards = 0; @@ -2570,10 +2661,15 @@ int __init create_segment_manager_caches(void) if (!discard_entry_slab) goto fail; + bio_entry_slab = f2fs_kmem_cache_create("bio_entry", + sizeof(struct bio_entry)); + if (!bio_entry_slab) + goto destroy_discard_entry; + sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set", sizeof(struct sit_entry_set)); if (!sit_entry_set_slab) - goto destory_discard_entry; + goto destroy_bio_entry; inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry", sizeof(struct inmem_pages)); @@ -2583,7 +2679,9 @@ int __init create_segment_manager_caches(void) destroy_sit_entry_set: kmem_cache_destroy(sit_entry_set_slab); -destory_discard_entry: +destroy_bio_entry: + kmem_cache_destroy(bio_entry_slab); +destroy_discard_entry: kmem_cache_destroy(discard_entry_slab); fail: return -ENOMEM; @@ -2592,6 +2690,7 @@ fail: void destroy_segment_manager_caches(void) { kmem_cache_destroy(sit_entry_set_slab); + kmem_cache_destroy(bio_entry_slab); kmem_cache_destroy(discard_entry_slab); kmem_cache_destroy(inmem_entry_slab); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index b33f73e..fecb856 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -479,7 +479,8 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi) reserved_sections(sbi) + 1); } -static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) +static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, + int freed, int needed) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); @@ -489,8 +490,8 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; - return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + - reserved_sections(sbi)); + return (free_sections(sbi) + freed) <= + (node_secs + 2 * dent_secs + reserved_sections(sbi) + needed); } static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) @@ -587,8 +588,8 @@ static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) { - f2fs_bug_on(sbi, blk_addr < SEG0_BLKADDR(sbi) - || blk_addr >= MAX_BLKADDR(sbi)); + BUG_ON(blk_addr < SEG0_BLKADDR(sbi) + || blk_addr >= MAX_BLKADDR(sbi)); } /* diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7f863a6..6132b4c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -40,7 +40,6 @@ static struct kmem_cache *f2fs_inode_cachep; static struct kset *f2fs_kset; #ifdef CONFIG_F2FS_FAULT_INJECTION -struct f2fs_fault_info f2fs_fault; char *fault_name[FAULT_MAX] = { [FAULT_KMALLOC] = "kmalloc", @@ -50,16 +49,21 @@ char *fault_name[FAULT_MAX] = { [FAULT_BLOCK] = "no more block", [FAULT_DIR_DEPTH] = "too big dir depth", [FAULT_EVICT_INODE] = "evict_inode fail", + [FAULT_IO] = "IO error", + [FAULT_CHECKPOINT] = "checkpoint error", }; -static void f2fs_build_fault_attr(unsigned int rate) +static void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, + unsigned int rate) { + struct f2fs_fault_info *ffi = &sbi->fault_info; + if (rate) { - atomic_set(&f2fs_fault.inject_ops, 0); - f2fs_fault.inject_rate = rate; - f2fs_fault.inject_type = (1 << FAULT_MAX) - 1; + atomic_set(&ffi->inject_ops, 0); + ffi->inject_rate = rate; + ffi->inject_type = (1 << FAULT_MAX) - 1; } else { - memset(&f2fs_fault, 0, sizeof(struct f2fs_fault_info)); + memset(ffi, 0, sizeof(struct f2fs_fault_info)); } } #endif @@ -87,6 +91,7 @@ enum { Opt_inline_xattr, Opt_inline_data, Opt_inline_dentry, + Opt_noinline_dentry, Opt_flush_merge, Opt_noflush_merge, Opt_nobarrier, @@ -118,6 +123,7 @@ static match_table_t f2fs_tokens = { {Opt_inline_xattr, "inline_xattr"}, {Opt_inline_data, "inline_data"}, {Opt_inline_dentry, "inline_dentry"}, + {Opt_noinline_dentry, "noinline_dentry"}, {Opt_flush_merge, "flush_merge"}, {Opt_noflush_merge, "noflush_merge"}, {Opt_nobarrier, "nobarrier"}, @@ -167,7 +173,7 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) #ifdef CONFIG_F2FS_FAULT_INJECTION else if (struct_type == FAULT_INFO_RATE || struct_type == FAULT_INFO_TYPE) - return (unsigned char *)&f2fs_fault; + return (unsigned char *)&sbi->fault_info; #endif return NULL; } @@ -312,6 +318,10 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(dirty_nats_ratio), ATTR_LIST(cp_interval), ATTR_LIST(idle_interval), +#ifdef CONFIG_F2FS_FAULT_INJECTION + ATTR_LIST(inject_rate), + ATTR_LIST(inject_type), +#endif ATTR_LIST(lifetime_write_kbytes), NULL, }; @@ -327,22 +337,6 @@ static struct kobj_type f2fs_ktype = { .release = f2fs_sb_release, }; -#ifdef CONFIG_F2FS_FAULT_INJECTION -/* sysfs for f2fs fault injection */ -static struct kobject f2fs_fault_inject; - -static struct attribute *f2fs_fault_attrs[] = { - ATTR_LIST(inject_rate), - ATTR_LIST(inject_type), - NULL -}; - -static struct kobj_type f2fs_fault_ktype = { - .default_attrs = f2fs_fault_attrs, - .sysfs_ops = &f2fs_attr_ops, -}; -#endif - void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) { struct va_format vaf; @@ -370,10 +364,6 @@ static int parse_options(struct super_block *sb, char *options) char *p, *name; int arg = 0; -#ifdef CONFIG_F2FS_FAULT_INJECTION - f2fs_build_fault_attr(0); -#endif - if (!options) return 0; @@ -488,6 +478,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_inline_dentry: set_opt(sbi, INLINE_DENTRY); break; + case Opt_noinline_dentry: + clear_opt(sbi, INLINE_DENTRY); + break; case Opt_flush_merge: set_opt(sbi, FLUSH_MERGE); break; @@ -533,7 +526,7 @@ static int parse_options(struct super_block *sb, char *options) if (args->from && match_int(args, &arg)) return -EINVAL; #ifdef CONFIG_F2FS_FAULT_INJECTION - f2fs_build_fault_attr(arg); + f2fs_build_fault_attr(sbi, arg); #else f2fs_msg(sb, KERN_INFO, "FAULT_INJECTION was not selected"); @@ -730,7 +723,7 @@ static void f2fs_put_super(struct super_block *sb) * clean checkpoint again. */ if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) || - !is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) { + !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { struct cp_control cpc = { .reason = CP_UMOUNT, }; @@ -878,6 +871,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",noinline_data"); if (test_opt(sbi, INLINE_DENTRY)) seq_puts(seq, ",inline_dentry"); + else + seq_puts(seq, ",noinline_dentry"); if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) seq_puts(seq, ",flush_merge"); if (test_opt(sbi, NOBARRIER)) @@ -946,7 +941,7 @@ static int segment_bits_seq_show(struct seq_file *seq, void *offset) seq_printf(seq, "%d|%-3u|", se->type, get_valid_blocks(sbi, i, 1)); for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) - seq_printf(seq, "%x ", se->cur_valid_map[j]); + seq_printf(seq, " %.2x", se->cur_valid_map[j]); seq_putc(seq, '\n'); } return 0; @@ -975,6 +970,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, BG_GC); set_opt(sbi, INLINE_DATA); + set_opt(sbi, INLINE_DENTRY); set_opt(sbi, EXTENT_CACHE); sbi->sb->s_flags |= MS_LAZYTIME; set_opt(sbi, FLUSH_MERGE); @@ -991,6 +987,10 @@ static void default_options(struct f2fs_sb_info *sbi) #ifdef CONFIG_F2FS_FS_POSIX_ACL set_opt(sbi, POSIX_ACL); #endif + +#ifdef CONFIG_F2FS_FAULT_INJECTION + f2fs_build_fault_attr(sbi, 0); +#endif } static int f2fs_remount(struct super_block *sb, int *flags, char *data) @@ -1001,6 +1001,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_gc = false; bool need_stop_gc = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct f2fs_fault_info ffi = sbi->fault_info; +#endif /* * Save the old mount options in case we @@ -1096,6 +1099,9 @@ restore_gc: restore_opts: sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; +#ifdef CONFIG_F2FS_FAULT_INJECTION + sbi->fault_info = ffi; +#endif return err; } @@ -1469,6 +1475,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) mutex_init(&sbi->umount_mutex); mutex_init(&sbi->wio_mutex[NODE]); mutex_init(&sbi->wio_mutex[DATA]); + spin_lock_init(&sbi->cp_lock); #ifdef CONFIG_F2FS_FS_ENCRYPTION memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX, @@ -1810,7 +1817,7 @@ try_onemore: * previous checkpoint was not done by clean system shutdown. */ if (bdev_read_only(sb->s_bdev) && - !is_set_ckpt_flags(sbi->ckpt, CP_UMOUNT_FLAG)) { + !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { err = -EROFS; goto free_kobj; } @@ -1818,6 +1825,9 @@ try_onemore: if (need_fsck) set_sbi_flag(sbi, SBI_NEED_FSCK); + if (!retry) + goto skip_recovery; + err = recover_fsync_data(sbi, false); if (err < 0) { need_fsck = true; @@ -1835,7 +1845,7 @@ try_onemore: goto free_kobj; } } - +skip_recovery: /* recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); @@ -1879,7 +1889,9 @@ free_root_inode: dput(sb->s_root); sb->s_root = NULL; free_node_inode: + truncate_inode_pages_final(NODE_MAPPING(sbi)); mutex_lock(&sbi->umount_mutex); + release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); @@ -1978,16 +1990,6 @@ static int __init init_f2fs_fs(void) err = -ENOMEM; goto free_extent_cache; } -#ifdef CONFIG_F2FS_FAULT_INJECTION - f2fs_fault_inject.kset = f2fs_kset; - f2fs_build_fault_attr(0); - err = kobject_init_and_add(&f2fs_fault_inject, &f2fs_fault_ktype, - NULL, "fault_injection"); - if (err) { - f2fs_fault_inject.kset = NULL; - goto free_kset; - } -#endif err = register_shrinker(&f2fs_shrinker_info); if (err) goto free_kset; @@ -2006,10 +2008,6 @@ free_filesystem: free_shrinker: unregister_shrinker(&f2fs_shrinker_info); free_kset: -#ifdef CONFIG_F2FS_FAULT_INJECTION - if (f2fs_fault_inject.kset) - kobject_put(&f2fs_fault_inject); -#endif kset_unregister(f2fs_kset); free_extent_cache: destroy_extent_cache(); @@ -2031,9 +2029,6 @@ static void __exit exit_f2fs_fs(void) f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); unregister_shrinker(&f2fs_shrinker_info); -#ifdef CONFIG_F2FS_FAULT_INJECTION - kobject_put(&f2fs_fault_inject); -#endif kset_unregister(f2fs_kset); destroy_extent_cache(); destroy_checkpoint_caches(); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index c8898b5..3e1c028 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -217,18 +217,20 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index, return entry; } -static void *read_all_xattrs(struct inode *inode, struct page *ipage) +static int read_all_xattrs(struct inode *inode, struct page *ipage, + void **base_addr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_xattr_header *header; size_t size = PAGE_SIZE, inline_size = 0; void *txattr_addr; + int err; inline_size = inline_xattr_size(inode); txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO); if (!txattr_addr) - return NULL; + return -ENOMEM; /* read from inline xattr */ if (inline_size) { @@ -239,8 +241,10 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage) inline_addr = inline_xattr_addr(ipage); } else { page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) + if (IS_ERR(page)) { + err = PTR_ERR(page); goto fail; + } inline_addr = inline_xattr_addr(page); } memcpy(txattr_addr, inline_addr, inline_size); @@ -254,8 +258,10 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage) /* The inode already has an extended attribute block. */ xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); - if (IS_ERR(xpage)) + if (IS_ERR(xpage)) { + err = PTR_ERR(xpage); goto fail; + } xattr_addr = page_address(xpage); memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE); @@ -269,10 +275,11 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage) header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC); header->h_refcount = cpu_to_le32(1); } - return txattr_addr; + *base_addr = txattr_addr; + return 0; fail: kzfree(txattr_addr); - return NULL; + return err; } static inline int write_all_xattrs(struct inode *inode, __u32 hsize, @@ -366,9 +373,9 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return -ERANGE; - base_addr = read_all_xattrs(inode, ipage); - if (!base_addr) - return -ENOMEM; + error = read_all_xattrs(inode, ipage, &base_addr); + if (error) + return error; entry = __find_xattr(base_addr, index, len, name); if (IS_XATTR_LAST_ENTRY(entry)) { @@ -402,9 +409,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) int error = 0; size_t rest = buffer_size; - base_addr = read_all_xattrs(inode, NULL); - if (!base_addr) - return -ENOMEM; + error = read_all_xattrs(inode, NULL, &base_addr); + if (error) + return error; list_for_each_xattr(entry, base_addr) { const struct xattr_handler *handler = @@ -463,9 +470,9 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (size > MAX_VALUE_LEN(inode)) return -E2BIG; - base_addr = read_all_xattrs(inode, ipage); - if (!base_addr) - return -ENOMEM; + error = read_all_xattrs(inode, ipage, &base_addr); + if (error) + return error; /* find entry with wanted name. */ here = __find_xattr(base_addr, index, len, name); @@ -541,13 +548,15 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (is_inode_flag_set(inode, FI_ACL_MODE)) { inode->i_mode = F2FS_I(inode)->i_acl_mode; - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); clear_inode_flag(inode, FI_ACL_MODE); } if (index == F2FS_XATTR_INDEX_ENCRYPTION && !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); f2fs_mark_inode_dirty_sync(inode); + if (!error && S_ISDIR(inode->i_mode)) + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); exit: kzfree(base_addr); return error; diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 663e428..81cecbe 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -1071,7 +1071,7 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo) } } - dir->i_mtime = dir->i_atime = CURRENT_TIME_SEC; + dir->i_mtime = dir->i_atime = current_time(dir); if (IS_DIRSYNC(dir)) (void)fat_sync_inode(dir); else diff --git a/fs/fat/file.c b/fs/fat/file.c index f701856..3d04b12 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -63,7 +63,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) /* Equivalent to a chmod() */ ia.ia_valid = ATTR_MODE | ATTR_CTIME; - ia.ia_ctime = current_fs_time(inode->i_sb); + ia.ia_ctime = current_time(inode); if (is_dir) ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO); else { @@ -194,7 +194,7 @@ static int fat_cont_expand(struct inode *inode, loff_t size) if (err) goto out; - inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + inode->i_ctime = inode->i_mtime = current_time(inode); mark_inode_dirty(inode); if (IS_SYNC(inode)) { int err2; @@ -297,7 +297,7 @@ static int fat_free(struct inode *inode, int skip) MSDOS_I(inode)->i_logstart = 0; } MSDOS_I(inode)->i_attrs |= ATTR_ARCH; - inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + inode->i_ctime = inode->i_mtime = current_time(inode); if (wait) { err = fat_sync_inode(inode); if (err) { @@ -450,7 +450,7 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_valid &= ~TIMES_SET_FLAGS; } - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); attr->ia_valid = ia_valid; if (error) { if (sbi->options.quiet) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index da04c02..338d2f7 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -237,7 +237,7 @@ static int fat_write_end(struct file *file, struct address_space *mapping, if (err < len) fat_write_failed(mapping, pos + len); if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { - inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_mtime = inode->i_ctime = current_time(inode); MSDOS_I(inode)->i_attrs |= ATTR_ARCH; mark_inode_dirty(inode); } diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 664655b..7d6a105 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -283,7 +283,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode, goto out; } - ts = CURRENT_TIME_SEC; + ts = current_time(dir); err = msdos_add_entry(dir, msdos_name, 0, is_hid, 0, &ts, &sinfo); if (err) goto out; @@ -330,7 +330,7 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry) drop_nlink(dir); clear_nlink(inode); - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); fat_detach(inode); out: mutex_unlock(&MSDOS_SB(sb)->s_lock); @@ -364,7 +364,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto out; } - ts = CURRENT_TIME_SEC; + ts = current_time(dir); cluster = fat_alloc_new_dir(dir, &ts); if (cluster < 0) { err = cluster; @@ -416,7 +416,7 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry) if (err) goto out; clear_nlink(inode); - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); fat_detach(inode); out: mutex_unlock(&MSDOS_SB(sb)->s_lock); @@ -481,7 +481,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, mark_inode_dirty(old_inode); old_dir->i_version++; - old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC; + old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); if (IS_DIRSYNC(old_dir)) (void)fat_sync_inode(old_dir); else @@ -490,7 +490,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, } } - ts = CURRENT_TIME_SEC; + ts = current_time(old_inode); if (new_inode) { if (err) goto out; @@ -596,12 +596,16 @@ error_inode: /***** Rename, a wrapper for rename_same_dir & rename_diff_dir */ static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct super_block *sb = old_dir->i_sb; unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME]; int err, is_hid; + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + mutex_lock(&MSDOS_SB(sb)->s_lock); err = msdos_format_name(old_dentry->d_name.name, diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 92b7363..6a7152d 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -21,6 +21,17 @@ #include <linux/namei.h> #include "fat.h" +static inline unsigned long vfat_d_version(struct dentry *dentry) +{ + return (unsigned long) dentry->d_fsdata; +} + +static inline void vfat_d_version_set(struct dentry *dentry, + unsigned long version) +{ + dentry->d_fsdata = (void *) version; +} + /* * If new entry was created in the parent, it could create the 8.3 * alias (the shortname of logname). So, the parent may have the @@ -33,7 +44,7 @@ static int vfat_revalidate_shortname(struct dentry *dentry) { int ret = 1; spin_lock(&dentry->d_lock); - if (dentry->d_time != d_inode(dentry->d_parent)->i_version) + if (vfat_d_version(dentry) != d_inode(dentry->d_parent)->i_version) ret = 0; spin_unlock(&dentry->d_lock); return ret; @@ -759,7 +770,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, out: mutex_unlock(&MSDOS_SB(sb)->s_lock); if (!inode) - dentry->d_time = dir->i_version; + vfat_d_version_set(dentry, dir->i_version); return d_splice_alias(inode, dentry); error: mutex_unlock(&MSDOS_SB(sb)->s_lock); @@ -777,7 +788,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode, mutex_lock(&MSDOS_SB(sb)->s_lock); - ts = CURRENT_TIME_SEC; + ts = current_time(dir); err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo); if (err) goto out; @@ -821,9 +832,9 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry) drop_nlink(dir); clear_nlink(inode); - inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; + inode->i_mtime = inode->i_atime = current_time(inode); fat_detach(inode); - dentry->d_time = dir->i_version; + vfat_d_version_set(dentry, dir->i_version); out: mutex_unlock(&MSDOS_SB(sb)->s_lock); @@ -847,9 +858,9 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry) if (err) goto out; clear_nlink(inode); - inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; + inode->i_mtime = inode->i_atime = current_time(inode); fat_detach(inode); - dentry->d_time = dir->i_version; + vfat_d_version_set(dentry, dir->i_version); out: mutex_unlock(&MSDOS_SB(sb)->s_lock); @@ -866,7 +877,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) mutex_lock(&MSDOS_SB(sb)->s_lock); - ts = CURRENT_TIME_SEC; + ts = current_time(dir); cluster = fat_alloc_new_dir(dir, &ts); if (cluster < 0) { err = cluster; @@ -903,7 +914,8 @@ out: } static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct buffer_head *dotdot_bh; struct msdos_dir_entry *dotdot_de; @@ -914,6 +926,9 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, int err, is_dir, update_dotdot, corrupt = 0; struct super_block *sb = old_dir->i_sb; + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; old_inode = d_inode(old_dentry); new_inode = d_inode(new_dentry); @@ -931,7 +946,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, } } - ts = CURRENT_TIME_SEC; + ts = current_time(old_dir); if (new_inode) { if (is_dir) { err = fat_dir_empty(new_inode); @@ -23,12 +23,12 @@ #include <linux/rcupdate.h> #include <linux/workqueue.h> -int sysctl_nr_open __read_mostly = 1024*1024; -int sysctl_nr_open_min = BITS_PER_LONG; +unsigned int sysctl_nr_open __read_mostly = 1024*1024; +unsigned int sysctl_nr_open_min = BITS_PER_LONG; /* our min() is unusable in constant expressions ;-/ */ #define __const_min(x, y) ((x) < (y) ? (x) : (y)) -int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & - -BITS_PER_LONG; +unsigned int sysctl_nr_open_max = + __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; static void *alloc_fdmem(size_t size) { @@ -163,7 +163,7 @@ out: * Return <0 error code on error; 1 on successful completion. * The files->file_lock should be held on entry, and will be held on exit. */ -static int expand_fdtable(struct files_struct *files, int nr) +static int expand_fdtable(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { @@ -208,7 +208,7 @@ static int expand_fdtable(struct files_struct *files, int nr) * expanded and execution may have blocked. * The files->file_lock should be held on entry, and will be held on exit. */ -static int expand_files(struct files_struct *files, int nr) +static int expand_files(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { @@ -243,12 +243,12 @@ repeat: return expanded; } -static inline void __set_close_on_exec(int fd, struct fdtable *fdt) +static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt) { __set_bit(fd, fdt->close_on_exec); } -static inline void __clear_close_on_exec(int fd, struct fdtable *fdt) +static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt) { if (test_bit(fd, fdt->close_on_exec)) __clear_bit(fd, fdt->close_on_exec); @@ -268,10 +268,10 @@ static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits); } -static int count_open_files(struct fdtable *fdt) +static unsigned int count_open_files(struct fdtable *fdt) { - int size = fdt->max_fds; - int i; + unsigned int size = fdt->max_fds; + unsigned int i; /* Find the last open fd */ for (i = size / BITS_PER_LONG; i > 0; ) { @@ -291,7 +291,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) { struct files_struct *newf; struct file **old_fds, **new_fds; - int open_files, i; + unsigned int open_files, i; struct fdtable *old_fdt, *new_fdt; *errorp = -ENOMEM; @@ -391,7 +391,7 @@ static struct fdtable *close_files(struct files_struct * files) * files structure. */ struct fdtable *fdt = rcu_dereference_raw(files->fdt); - int i, j = 0; + unsigned int i, j = 0; for (;;) { unsigned long set; @@ -477,11 +477,11 @@ struct files_struct init_files = { .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), }; -static unsigned long find_next_fd(struct fdtable *fdt, unsigned long start) +static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) { - unsigned long maxfd = fdt->max_fds; - unsigned long maxbit = maxfd / BITS_PER_LONG; - unsigned long bitbit = start / BITS_PER_LONG; + unsigned int maxfd = fdt->max_fds; + unsigned int maxbit = maxfd / BITS_PER_LONG; + unsigned int bitbit = start / BITS_PER_LONG; bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; if (bitbit > maxfd) diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 1b2f6c2..76f09ce 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -1,5 +1,6 @@ config FUSE_FS tristate "FUSE (Filesystem in Userspace) support" + select FS_POSIX_ACL help With FUSE it is possible to implement a fully functional filesystem in a userspace program. diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index e95eeb4..60da84a 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -5,4 +5,4 @@ obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o -fuse-objs := dev.o dir.o file.o inode.o control.o +fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c new file mode 100644 index 0000000..ec85765 --- /dev/null +++ b/fs/fuse/acl.c @@ -0,0 +1,99 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2016 Canonical Ltd. <seth.forshee@canonical.com> + * + * This program can be distributed under the terms of the GNU GPL. + * See the file COPYING. + */ + +#include "fuse_i.h" + +#include <linux/posix_acl.h> +#include <linux/posix_acl_xattr.h> + +struct posix_acl *fuse_get_acl(struct inode *inode, int type) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + int size; + const char *name; + void *value = NULL; + struct posix_acl *acl; + + if (!fc->posix_acl || fc->no_getxattr) + return NULL; + + if (type == ACL_TYPE_ACCESS) + name = XATTR_NAME_POSIX_ACL_ACCESS; + else if (type == ACL_TYPE_DEFAULT) + name = XATTR_NAME_POSIX_ACL_DEFAULT; + else + return ERR_PTR(-EOPNOTSUPP); + + value = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + size = fuse_getxattr(inode, name, value, PAGE_SIZE); + if (size > 0) + acl = posix_acl_from_xattr(&init_user_ns, value, size); + else if ((size == 0) || (size == -ENODATA) || + (size == -EOPNOTSUPP && fc->no_getxattr)) + acl = NULL; + else if (size == -ERANGE) + acl = ERR_PTR(-E2BIG); + else + acl = ERR_PTR(size); + + kfree(value); + return acl; +} + +int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + const char *name; + int ret; + + if (!fc->posix_acl || fc->no_setxattr) + return -EOPNOTSUPP; + + if (type == ACL_TYPE_ACCESS) + name = XATTR_NAME_POSIX_ACL_ACCESS; + else if (type == ACL_TYPE_DEFAULT) + name = XATTR_NAME_POSIX_ACL_DEFAULT; + else + return -EINVAL; + + if (acl) { + /* + * Fuse userspace is responsible for updating access + * permissions in the inode, if needed. fuse_setxattr + * invalidates the inode attributes, which will force + * them to be refreshed the next time they are used, + * and it also updates i_ctime. + */ + size_t size = posix_acl_xattr_size(acl->a_count); + void *value; + + if (size > PAGE_SIZE) + return -E2BIG; + + value = kmalloc(size, GFP_KERNEL); + if (!value) + return -ENOMEM; + + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); + if (ret < 0) { + kfree(value); + return ret; + } + + ret = fuse_setxattr(inode, name, value, size, 0); + kfree(value); + } else { + ret = fuse_removexattr(inode, name); + } + forget_all_cached_acls(inode); + fuse_invalidate_attr(inode); + + return ret; +} diff --git a/fs/fuse/control.c b/fs/fuse/control.c index f863ac6..6e22748 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -220,7 +220,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, inode->i_mode = mode; inode->i_uid = fc->user_id; inode->i_gid = fc->group_id; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); /* setting ->i_op to NULL is not allowed */ if (iop) inode->i_op = iop; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a94d2ed..70ea57c 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -728,7 +728,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) struct pipe_buffer *buf = cs->pipebufs; if (!cs->write) { - err = buf->ops->confirm(cs->pipe, buf); + err = pipe_buf_confirm(cs->pipe, buf); if (err) return err; @@ -767,7 +767,6 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) cs->len = err; cs->offset = off; cs->pg = page; - cs->offset = off; iov_iter_advance(cs->iter, err); } @@ -828,7 +827,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) fuse_copy_finish(cs); - err = buf->ops->confirm(cs->pipe, buf); + err = pipe_buf_confirm(cs->pipe, buf); if (err) return err; @@ -841,7 +840,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (cs->len != PAGE_SIZE) goto out_fallback; - if (buf->ops->steal(cs->pipe, buf) != 0) + if (pipe_buf_steal(cs->pipe, buf) != 0) goto out_fallback; newpage = buf->page; @@ -1342,9 +1341,8 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - int ret; + int total, ret; int page_nr = 0; - int do_wakeup = 0; struct pipe_buffer *bufs; struct fuse_copy_state cs; struct fuse_dev *fud = fuse_get_dev(in); @@ -1363,52 +1361,23 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, if (ret < 0) goto out; - ret = 0; - pipe_lock(pipe); - - if (!pipe->readers) { - send_sig(SIGPIPE, current, 0); - if (!ret) - ret = -EPIPE; - goto out_unlock; - } - if (pipe->nrbufs + cs.nr_segs > pipe->buffers) { ret = -EIO; - goto out_unlock; + goto out; } - while (page_nr < cs.nr_segs) { - int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); - struct pipe_buffer *buf = pipe->bufs + newbuf; - - buf->page = bufs[page_nr].page; - buf->offset = bufs[page_nr].offset; - buf->len = bufs[page_nr].len; + for (ret = total = 0; page_nr < cs.nr_segs; total += ret) { /* * Need to be careful about this. Having buf->ops in module * code can Oops if the buffer persists after module unload. */ - buf->ops = &nosteal_pipe_buf_ops; - - pipe->nrbufs++; - page_nr++; - ret += buf->len; - - if (pipe->files) - do_wakeup = 1; - } - -out_unlock: - pipe_unlock(pipe); - - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&pipe->wait)) - wake_up_interruptible(&pipe->wait); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + bufs[page_nr].ops = &nosteal_pipe_buf_ops; + ret = add_to_pipe(pipe, &bufs[page_nr++]); + if (unlikely(ret < 0)) + break; } - + if (total) + ret = total; out: for (; page_nr < cs.nr_segs; page_nr++) put_page(bufs[page_nr].page); @@ -1993,7 +1962,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); pipe->nrbufs--; } else { - ibuf->ops->get(pipe, ibuf); + pipe_buf_get(pipe, ibuf); *obuf = *ibuf; obuf->flags &= ~PIPE_BUF_FLAG_GIFT; obuf->len = rem; @@ -2015,10 +1984,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, ret = fuse_dev_do_write(fud, &cs, len); - for (idx = 0; idx < nbuf; idx++) { - struct pipe_buffer *buf = &bufs[idx]; - buf->ops->release(pipe, buf); - } + for (idx = 0; idx < nbuf; idx++) + pipe_buf_release(pipe, &bufs[idx]); + out: kfree(bufs); return ret; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index c47b778..6a4d0e5 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -13,6 +13,8 @@ #include <linux/sched.h> #include <linux/namei.h> #include <linux/slab.h> +#include <linux/xattr.h> +#include <linux/posix_acl.h> static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx) { @@ -37,47 +39,39 @@ static void fuse_advise_use_readdirplus(struct inode *dir) set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state); } -#if BITS_PER_LONG >= 64 +union fuse_dentry { + u64 time; + struct rcu_head rcu; +}; + static inline void fuse_dentry_settime(struct dentry *entry, u64 time) { - entry->d_time = time; + ((union fuse_dentry *) entry->d_fsdata)->time = time; } static inline u64 fuse_dentry_time(struct dentry *entry) { - return entry->d_time; -} -#else -/* - * On 32 bit archs store the high 32 bits of time in d_fsdata - */ -static void fuse_dentry_settime(struct dentry *entry, u64 time) -{ - entry->d_time = time; - entry->d_fsdata = (void *) (unsigned long) (time >> 32); -} - -static u64 fuse_dentry_time(struct dentry *entry) -{ - return (u64) entry->d_time + - ((u64) (unsigned long) entry->d_fsdata << 32); + return ((union fuse_dentry *) entry->d_fsdata)->time; } -#endif /* * FUSE caches dentries and attributes with separate timeout. The * time in jiffies until the dentry/attributes are valid is stored in - * dentry->d_time and fuse_inode->i_time respectively. + * dentry->d_fsdata and fuse_inode->i_time respectively. */ /* * Calculate the time in jiffies until a dentry/attributes are valid */ -static u64 time_to_jiffies(unsigned long sec, unsigned long nsec) +static u64 time_to_jiffies(u64 sec, u32 nsec) { if (sec || nsec) { - struct timespec ts = {sec, nsec}; - return get_jiffies_64() + timespec_to_jiffies(&ts); + struct timespec64 ts = { + sec, + max_t(u32, nsec, NSEC_PER_SEC - 1) + }; + + return get_jiffies_64() + timespec64_to_jiffies(&ts); } else return 0; } @@ -243,6 +237,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) if (ret || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) goto invalid; + forget_all_cached_acls(inode); fuse_change_attributes(inode, &outarg.attr, entry_attr_timeout(&outarg), attr_version); @@ -272,8 +267,23 @@ static int invalid_nodeid(u64 nodeid) return !nodeid || nodeid == FUSE_ROOT_ID; } +static int fuse_dentry_init(struct dentry *dentry) +{ + dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL); + + return dentry->d_fsdata ? 0 : -ENOMEM; +} +static void fuse_dentry_release(struct dentry *dentry) +{ + union fuse_dentry *fd = dentry->d_fsdata; + + kfree_rcu(fd, rcu); +} + const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, + .d_init = fuse_dentry_init, + .d_release = fuse_dentry_release, }; int fuse_valid_type(int m) @@ -634,10 +644,10 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, return create_new_entry(fc, &args, dir, entry, S_IFLNK); } -static inline void fuse_update_ctime(struct inode *inode) +void fuse_update_ctime(struct inode *inode) { if (!IS_NOCMTIME(inode)) { - inode->i_ctime = current_fs_time(inode->i_sb); + inode->i_ctime = current_time(inode); mark_inode_dirty_sync(inode); } } @@ -917,6 +927,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat, if (time_before64(fi->i_time, get_jiffies_64())) { r = true; + forget_all_cached_acls(inode); err = fuse_do_getattr(inode, stat, file); } else { r = false; @@ -1017,7 +1028,7 @@ int fuse_allow_current_process(struct fuse_conn *fc) { const struct cred *cred; - if (fc->flags & FUSE_ALLOW_OTHER) + if (fc->allow_other) return 1; cred = current_cred(); @@ -1064,6 +1075,7 @@ static int fuse_perm_getattr(struct inode *inode, int mask) if (mask & MAY_NOT_BLOCK) return -ECHILD; + forget_all_cached_acls(inode); return fuse_do_getattr(inode, NULL, NULL); } @@ -1092,7 +1104,7 @@ static int fuse_permission(struct inode *inode, int mask) /* * If attributes are needed, refresh them before proceeding */ - if ((fc->flags & FUSE_DEFAULT_PERMISSIONS) || + if (fc->default_permissions || ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) { struct fuse_inode *fi = get_fuse_inode(inode); @@ -1105,7 +1117,7 @@ static int fuse_permission(struct inode *inode, int mask) } } - if (fc->flags & FUSE_DEFAULT_PERMISSIONS) { + if (fc->default_permissions) { err = generic_permission(inode, mask); /* If permission is denied, try to refresh file @@ -1233,6 +1245,7 @@ retry: fi->nlookup++; spin_unlock(&fc->lock); + forget_all_cached_acls(inode); fuse_change_attributes(inode, &o->attr, entry_attr_timeout(o), attr_version); @@ -1591,9 +1604,10 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) * vmtruncate() doesn't allow for this case, so do the rlimit checking * and the actual truncation by hand. */ -int fuse_do_setattr(struct inode *inode, struct iattr *attr, +int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, struct file *file) { + struct inode *inode = d_inode(dentry); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); FUSE_ARGS(args); @@ -1605,10 +1619,10 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, int err; bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode); - if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) + if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; - err = inode_change_ok(inode, attr); + err = setattr_prepare(dentry, attr); if (err) return err; @@ -1702,172 +1716,75 @@ error: static int fuse_setattr(struct dentry *entry, struct iattr *attr) { struct inode *inode = d_inode(entry); - - if (!fuse_allow_current_process(get_fuse_conn(inode))) - return -EACCES; - - if (attr->ia_valid & ATTR_FILE) - return fuse_do_setattr(inode, attr, attr->ia_file); - else - return fuse_do_setattr(inode, attr, NULL); -} - -static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, - struct kstat *stat) -{ - struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); + struct file *file = (attr->ia_valid & ATTR_FILE) ? attr->ia_file : NULL; + int ret; - if (!fuse_allow_current_process(fc)) + if (!fuse_allow_current_process(get_fuse_conn(inode))) return -EACCES; - return fuse_update_attributes(inode, stat, NULL, NULL); -} - -static int fuse_setxattr(struct dentry *unused, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) -{ - struct fuse_conn *fc = get_fuse_conn(inode); - FUSE_ARGS(args); - struct fuse_setxattr_in inarg; - int err; - - if (fc->no_setxattr) - return -EOPNOTSUPP; + if (attr->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) { + attr->ia_valid &= ~(ATTR_KILL_SUID | ATTR_KILL_SGID | + ATTR_MODE); - memset(&inarg, 0, sizeof(inarg)); - inarg.size = size; - inarg.flags = flags; - args.in.h.opcode = FUSE_SETXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 3; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = strlen(name) + 1; - args.in.args[1].value = name; - args.in.args[2].size = size; - args.in.args[2].value = value; - err = fuse_simple_request(fc, &args); - if (err == -ENOSYS) { - fc->no_setxattr = 1; - err = -EOPNOTSUPP; - } - if (!err) { - fuse_invalidate_attr(inode); - fuse_update_ctime(inode); + /* + * The only sane way to reliably kill suid/sgid is to do it in + * the userspace filesystem + * + * This should be done on write(), truncate() and chown(). + */ + if (!fc->handle_killpriv) { + int kill; + + /* + * ia_mode calculation may have used stale i_mode. + * Refresh and recalculate. + */ + ret = fuse_do_getattr(inode, NULL, file); + if (ret) + return ret; + + attr->ia_mode = inode->i_mode; + kill = should_remove_suid(entry); + if (kill & ATTR_KILL_SUID) { + attr->ia_valid |= ATTR_MODE; + attr->ia_mode &= ~S_ISUID; + } + if (kill & ATTR_KILL_SGID) { + attr->ia_valid |= ATTR_MODE; + attr->ia_mode &= ~S_ISGID; + } + } } - return err; -} - -static ssize_t fuse_getxattr(struct dentry *entry, struct inode *inode, - const char *name, void *value, size_t size) -{ - struct fuse_conn *fc = get_fuse_conn(inode); - FUSE_ARGS(args); - struct fuse_getxattr_in inarg; - struct fuse_getxattr_out outarg; - ssize_t ret; + if (!attr->ia_valid) + return 0; - if (fc->no_getxattr) - return -EOPNOTSUPP; + ret = fuse_do_setattr(entry, attr, file); + if (!ret) { + /* + * If filesystem supports acls it may have updated acl xattrs in + * the filesystem, so forget cached acls for the inode. + */ + if (fc->posix_acl) + forget_all_cached_acls(inode); - memset(&inarg, 0, sizeof(inarg)); - inarg.size = size; - args.in.h.opcode = FUSE_GETXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 2; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = strlen(name) + 1; - args.in.args[1].value = name; - /* This is really two different operations rolled into one */ - args.out.numargs = 1; - if (size) { - args.out.argvar = 1; - args.out.args[0].size = size; - args.out.args[0].value = value; - } else { - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; - } - ret = fuse_simple_request(fc, &args); - if (!ret && !size) - ret = outarg.size; - if (ret == -ENOSYS) { - fc->no_getxattr = 1; - ret = -EOPNOTSUPP; + /* Directory mode changed, may need to revalidate access */ + if (d_is_dir(entry) && (attr->ia_valid & ATTR_MODE)) + fuse_invalidate_entry_cache(entry); } return ret; } -static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) +static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, + struct kstat *stat) { struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); - FUSE_ARGS(args); - struct fuse_getxattr_in inarg; - struct fuse_getxattr_out outarg; - ssize_t ret; if (!fuse_allow_current_process(fc)) return -EACCES; - if (fc->no_listxattr) - return -EOPNOTSUPP; - - memset(&inarg, 0, sizeof(inarg)); - inarg.size = size; - args.in.h.opcode = FUSE_LISTXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - /* This is really two different operations rolled into one */ - args.out.numargs = 1; - if (size) { - args.out.argvar = 1; - args.out.args[0].size = size; - args.out.args[0].value = list; - } else { - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; - } - ret = fuse_simple_request(fc, &args); - if (!ret && !size) - ret = outarg.size; - if (ret == -ENOSYS) { - fc->no_listxattr = 1; - ret = -EOPNOTSUPP; - } - return ret; -} - -static int fuse_removexattr(struct dentry *entry, const char *name) -{ - struct inode *inode = d_inode(entry); - struct fuse_conn *fc = get_fuse_conn(inode); - FUSE_ARGS(args); - int err; - - if (fc->no_removexattr) - return -EOPNOTSUPP; - - args.in.h.opcode = FUSE_REMOVEXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = strlen(name) + 1; - args.in.args[0].value = name; - err = fuse_simple_request(fc, &args); - if (err == -ENOSYS) { - fc->no_removexattr = 1; - err = -EOPNOTSUPP; - } - if (!err) { - fuse_invalidate_attr(inode); - fuse_update_ctime(inode); - } - return err; + return fuse_update_attributes(inode, stat, NULL, NULL); } static const struct inode_operations fuse_dir_inode_operations = { @@ -1876,7 +1793,7 @@ static const struct inode_operations fuse_dir_inode_operations = { .symlink = fuse_symlink, .unlink = fuse_unlink, .rmdir = fuse_rmdir, - .rename2 = fuse_rename2, + .rename = fuse_rename2, .link = fuse_link, .setattr = fuse_setattr, .create = fuse_create, @@ -1884,10 +1801,9 @@ static const struct inode_operations fuse_dir_inode_operations = { .mknod = fuse_mknod, .permission = fuse_permission, .getattr = fuse_getattr, - .setxattr = fuse_setxattr, - .getxattr = fuse_getxattr, .listxattr = fuse_listxattr, - .removexattr = fuse_removexattr, + .get_acl = fuse_get_acl, + .set_acl = fuse_set_acl, }; static const struct file_operations fuse_dir_operations = { @@ -1905,10 +1821,9 @@ static const struct inode_operations fuse_common_inode_operations = { .setattr = fuse_setattr, .permission = fuse_permission, .getattr = fuse_getattr, - .setxattr = fuse_setxattr, - .getxattr = fuse_getxattr, .listxattr = fuse_listxattr, - .removexattr = fuse_removexattr, + .get_acl = fuse_get_acl, + .set_acl = fuse_set_acl, }; static const struct inode_operations fuse_symlink_inode_operations = { @@ -1916,10 +1831,7 @@ static const struct inode_operations fuse_symlink_inode_operations = { .get_link = fuse_get_link, .readlink = generic_readlink, .getattr = fuse_getattr, - .setxattr = fuse_setxattr, - .getxattr = fuse_getxattr, .listxattr = fuse_listxattr, - .removexattr = fuse_removexattr, }; void fuse_init_common(struct inode *inode) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 3988b43..abc66a6 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2326,49 +2326,6 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) return retval; } -static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, - unsigned int nr_segs, size_t bytes, bool to_user) -{ - struct iov_iter ii; - int page_idx = 0; - - if (!bytes) - return 0; - - iov_iter_init(&ii, to_user ? READ : WRITE, iov, nr_segs, bytes); - - while (iov_iter_count(&ii)) { - struct page *page = pages[page_idx++]; - size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii)); - void *kaddr; - - kaddr = kmap(page); - - while (todo) { - char __user *uaddr = ii.iov->iov_base + ii.iov_offset; - size_t iov_len = ii.iov->iov_len - ii.iov_offset; - size_t copy = min(todo, iov_len); - size_t left; - - if (!to_user) - left = copy_from_user(kaddr, uaddr, copy); - else - left = copy_to_user(uaddr, kaddr, copy); - - if (unlikely(left)) - return -EFAULT; - - iov_iter_advance(&ii, copy); - todo -= copy; - kaddr += copy; - } - - kunmap(page); - } - - return 0; -} - /* * CUSE servers compiled on 32bit broke on 64bit kernels because the * ABI was defined to be 'struct iovec' which is different on 32bit @@ -2520,8 +2477,9 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, struct iovec *iov_page = NULL; struct iovec *in_iov = NULL, *out_iov = NULL; unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; - size_t in_size, out_size, transferred; - int err; + size_t in_size, out_size, transferred, c; + int err, i; + struct iov_iter ii; #if BITS_PER_LONG == 32 inarg.flags |= FUSE_IOCTL_32BIT; @@ -2603,10 +2561,13 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, req->in.args[1].size = in_size; req->in.argpages = 1; - err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size, - false); - if (err) - goto out; + err = -EFAULT; + iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) { + c = copy_page_from_iter(pages[i], 0, PAGE_SIZE, &ii); + if (c != PAGE_SIZE && iov_iter_count(&ii)) + goto out; + } } req->out.numargs = 2; @@ -2672,7 +2633,14 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, if (transferred > inarg.out_size) goto out; - err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true); + err = -EFAULT; + iov_iter_init(&ii, READ, out_iov, out_iovs, transferred); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) { + c = copy_page_to_iter(pages[i], 0, PAGE_SIZE, &ii); + if (c != PAGE_SIZE && iov_iter_count(&ii)) + goto out; + } + err = 0; out: if (req) fuse_put_request(fc, req); @@ -2842,7 +2810,7 @@ static void fuse_do_truncate(struct file *file) attr.ia_file = file; attr.ia_valid |= ATTR_FILE; - fuse_do_setattr(inode, &attr, file); + fuse_do_setattr(file_dentry(file), &attr, file); } static inline loff_t fuse_round_up(loff_t off) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index d98d8cc..0dfbb13 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -23,6 +23,7 @@ #include <linux/poll.h> #include <linux/workqueue.h> #include <linux/kref.h> +#include <linux/xattr.h> /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 @@ -36,15 +37,6 @@ /** Number of dentries for each connection in the control filesystem */ #define FUSE_CTL_NUM_DENTRIES 5 -/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem - module will check permissions based on the file mode. Otherwise no - permission checking is done in the kernel */ -#define FUSE_DEFAULT_PERMISSIONS (1 << 0) - -/** If the FUSE_ALLOW_OTHER flag is given, then not only the user - doing the mount will be allowed to access the filesystem */ -#define FUSE_ALLOW_OTHER (1 << 1) - /** Number of page pointers embedded in fuse_req */ #define FUSE_REQ_INLINE_PAGES 1 @@ -469,9 +461,6 @@ struct fuse_conn { /** The group id for this mount */ kgid_t group_id; - /** The fuse mount flags for this mount */ - unsigned flags; - /** Maximum read size */ unsigned max_read; @@ -547,6 +536,9 @@ struct fuse_conn { /** allow parallel lookups and readdir (default is serialized) */ unsigned parallel_dirops:1; + /** handle fs handles killing suid/sgid/cap on write/chown/trunc */ + unsigned handle_killpriv:1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction @@ -624,6 +616,15 @@ struct fuse_conn { /** Is lseek not implemented by fs? */ unsigned no_lseek:1; + /** Does the filesystem support posix acls? */ + unsigned posix_acl:1; + + /** Check permissions based on the file mode or not? */ + unsigned default_permissions:1; + + /** Allow other than the mounter user to access the filesystem ? */ + unsigned allow_other:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -902,6 +903,8 @@ int fuse_allow_current_process(struct fuse_conn *fc); u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); +void fuse_update_ctime(struct inode *inode); + int fuse_update_attributes(struct inode *inode, struct kstat *stat, struct file *file, bool *refreshed); @@ -958,7 +961,7 @@ bool fuse_write_update_size(struct inode *inode, loff_t pos); int fuse_flush_times(struct inode *inode, struct fuse_file *ff); int fuse_write_inode(struct inode *inode, struct writeback_control *wbc); -int fuse_do_setattr(struct inode *inode, struct iattr *attr, +int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, struct file *file); void fuse_set_initialized(struct fuse_conn *fc); @@ -966,4 +969,17 @@ void fuse_set_initialized(struct fuse_conn *fc); void fuse_unlock_inode(struct inode *inode); void fuse_lock_inode(struct inode *inode); +int fuse_setxattr(struct inode *inode, const char *name, const void *value, + size_t size, int flags); +ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, + size_t size); +ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size); +int fuse_removexattr(struct inode *inode, const char *name); +extern const struct xattr_handler *fuse_xattr_handlers[]; +extern const struct xattr_handler *fuse_acl_xattr_handlers[]; + +struct posix_acl; +struct posix_acl *fuse_get_acl(struct inode *inode, int type); +int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 4e05b51..1714109 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -20,6 +20,7 @@ #include <linux/random.h> #include <linux/sched.h> #include <linux/exportfs.h> +#include <linux/posix_acl.h> MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Filesystem in Userspace"); @@ -66,7 +67,8 @@ struct fuse_mount_data { unsigned rootmode_present:1; unsigned user_id_present:1; unsigned group_id_present:1; - unsigned flags; + unsigned default_permissions:1; + unsigned allow_other:1; unsigned max_read; unsigned blksize; }; @@ -192,7 +194,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, * check in may_delete(). */ fi->orig_i_mode = inode->i_mode; - if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) + if (!fc->default_permissions) inode->i_mode &= ~S_ISVTX; fi->orig_ino = attr->ino; @@ -340,6 +342,7 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, return -ENOENT; fuse_invalidate_attr(inode); + forget_all_cached_acls(inode); if (offset >= 0) { pg_start = offset >> PAGE_SHIFT; if (len <= 0) @@ -532,11 +535,11 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) break; case OPT_DEFAULT_PERMISSIONS: - d->flags |= FUSE_DEFAULT_PERMISSIONS; + d->default_permissions = 1; break; case OPT_ALLOW_OTHER: - d->flags |= FUSE_ALLOW_OTHER; + d->allow_other = 1; break; case OPT_MAX_READ: @@ -570,9 +573,9 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id)); seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id)); - if (fc->flags & FUSE_DEFAULT_PERMISSIONS) + if (fc->default_permissions) seq_puts(m, ",default_permissions"); - if (fc->flags & FUSE_ALLOW_OTHER) + if (fc->allow_other) seq_puts(m, ",allow_other"); if (fc->max_read != ~0) seq_printf(m, ",max_read=%u", fc->max_read); @@ -910,8 +913,15 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->writeback_cache = 1; if (arg->flags & FUSE_PARALLEL_DIROPS) fc->parallel_dirops = 1; + if (arg->flags & FUSE_HANDLE_KILLPRIV) + fc->handle_killpriv = 1; if (arg->time_gran && arg->time_gran <= 1000000000) fc->sb->s_time_gran = arg->time_gran; + if ((arg->flags & FUSE_POSIX_ACL)) { + fc->default_permissions = 1; + fc->posix_acl = 1; + fc->sb->s_xattr = fuse_acl_xattr_handlers; + } } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -941,7 +951,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | - FUSE_PARALLEL_DIROPS; + FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -1071,6 +1081,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) } sb->s_magic = FUSE_SUPER_MAGIC; sb->s_op = &fuse_super_operations; + sb->s_xattr = fuse_xattr_handlers; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_time_gran = 1; sb->s_export_op = &fuse_export_operations; @@ -1109,7 +1120,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fc->dont_mask = 1; sb->s_flags |= MS_POSIXACL; - fc->flags = d.flags; + fc->default_permissions = d.default_permissions; + fc->allow_other = d.allow_other; fc->user_id = d.user_id; fc->group_id = d.group_id; fc->max_read = max_t(unsigned, 4096, d.max_read); diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c new file mode 100644 index 0000000..3caac46 --- /dev/null +++ b/fs/fuse/xattr.c @@ -0,0 +1,211 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2016 Miklos Szeredi <miklos@szeredi.hu> + * + * This program can be distributed under the terms of the GNU GPL. + * See the file COPYING. + */ + +#include "fuse_i.h" + +#include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> + +int fuse_setxattr(struct inode *inode, const char *name, const void *value, + size_t size, int flags) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + FUSE_ARGS(args); + struct fuse_setxattr_in inarg; + int err; + + if (fc->no_setxattr) + return -EOPNOTSUPP; + + memset(&inarg, 0, sizeof(inarg)); + inarg.size = size; + inarg.flags = flags; + args.in.h.opcode = FUSE_SETXATTR; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 3; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.in.args[1].size = strlen(name) + 1; + args.in.args[1].value = name; + args.in.args[2].size = size; + args.in.args[2].value = value; + err = fuse_simple_request(fc, &args); + if (err == -ENOSYS) { + fc->no_setxattr = 1; + err = -EOPNOTSUPP; + } + if (!err) { + fuse_invalidate_attr(inode); + fuse_update_ctime(inode); + } + return err; +} + +ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, + size_t size) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + FUSE_ARGS(args); + struct fuse_getxattr_in inarg; + struct fuse_getxattr_out outarg; + ssize_t ret; + + if (fc->no_getxattr) + return -EOPNOTSUPP; + + memset(&inarg, 0, sizeof(inarg)); + inarg.size = size; + args.in.h.opcode = FUSE_GETXATTR; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 2; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.in.args[1].size = strlen(name) + 1; + args.in.args[1].value = name; + /* This is really two different operations rolled into one */ + args.out.numargs = 1; + if (size) { + args.out.argvar = 1; + args.out.args[0].size = size; + args.out.args[0].value = value; + } else { + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + } + ret = fuse_simple_request(fc, &args); + if (!ret && !size) + ret = min_t(ssize_t, outarg.size, XATTR_SIZE_MAX); + if (ret == -ENOSYS) { + fc->no_getxattr = 1; + ret = -EOPNOTSUPP; + } + return ret; +} + +static int fuse_verify_xattr_list(char *list, size_t size) +{ + size_t origsize = size; + + while (size) { + size_t thislen = strnlen(list, size); + + if (!thislen || thislen == size) + return -EIO; + + size -= thislen + 1; + list += thislen + 1; + } + + return origsize; +} + +ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) +{ + struct inode *inode = d_inode(entry); + struct fuse_conn *fc = get_fuse_conn(inode); + FUSE_ARGS(args); + struct fuse_getxattr_in inarg; + struct fuse_getxattr_out outarg; + ssize_t ret; + + if (!fuse_allow_current_process(fc)) + return -EACCES; + + if (fc->no_listxattr) + return -EOPNOTSUPP; + + memset(&inarg, 0, sizeof(inarg)); + inarg.size = size; + args.in.h.opcode = FUSE_LISTXATTR; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + /* This is really two different operations rolled into one */ + args.out.numargs = 1; + if (size) { + args.out.argvar = 1; + args.out.args[0].size = size; + args.out.args[0].value = list; + } else { + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + } + ret = fuse_simple_request(fc, &args); + if (!ret && !size) + ret = min_t(ssize_t, outarg.size, XATTR_LIST_MAX); + if (ret > 0 && size) + ret = fuse_verify_xattr_list(list, ret); + if (ret == -ENOSYS) { + fc->no_listxattr = 1; + ret = -EOPNOTSUPP; + } + return ret; +} + +int fuse_removexattr(struct inode *inode, const char *name) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + FUSE_ARGS(args); + int err; + + if (fc->no_removexattr) + return -EOPNOTSUPP; + + args.in.h.opcode = FUSE_REMOVEXATTR; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 1; + args.in.args[0].size = strlen(name) + 1; + args.in.args[0].value = name; + err = fuse_simple_request(fc, &args); + if (err == -ENOSYS) { + fc->no_removexattr = 1; + err = -EOPNOTSUPP; + } + if (!err) { + fuse_invalidate_attr(inode); + fuse_update_ctime(inode); + } + return err; +} + +static int fuse_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *value, size_t size) +{ + return fuse_getxattr(inode, name, value, size); +} + +static int fuse_xattr_set(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, + int flags) +{ + if (!value) + return fuse_removexattr(inode, name); + + return fuse_setxattr(inode, name, value, size, flags); +} + +static const struct xattr_handler fuse_xattr_handler = { + .prefix = "", + .get = fuse_xattr_get, + .set = fuse_xattr_set, +}; + +const struct xattr_handler *fuse_xattr_handlers[] = { + &fuse_xattr_handler, + NULL +}; + +const struct xattr_handler *fuse_acl_xattr_handlers[] = { + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, + &fuse_xattr_handler, + NULL +}; diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 363ba9e..2524807 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -92,17 +92,11 @@ int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (type == ACL_TYPE_ACCESS) { umode_t mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); - if (error < 0) + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; - - if (error == 0) - acl = NULL; - - if (mode != inode->i_mode) { - inode->i_mode = mode; + if (mode != inode->i_mode) mark_inode_dirty(inode); - } } if (acl) { diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 82df368..5a6f52e 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -187,7 +187,7 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w ClearPageChecked(page); if (!page_has_buffers(page)) { create_empty_buffers(page, inode->i_sb->s_blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); + BIT(BH_Dirty)|BIT(BH_Uptodate)); } gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1); } @@ -1147,6 +1147,16 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) if (!page_has_buffers(page)) return 0; + /* + * From xfs_vm_releasepage: mm accommodates an old ext3 case where + * clean pages might not have had the dirty bit cleared. Thus, it can + * send actual dirty pages to ->releasepage() via shrink_active_list(). + * + * As a workaround, we skip pages that contain dirty buffers below. + * Once ->releasepage isn't called on dirty pages anymore, we can warn + * on dirty buffers like we used to here again. + */ + gfs2_log_lock(sdp); spin_lock(&sdp->sd_ail_lock); head = bh = page_buffers(page); @@ -1156,8 +1166,8 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) bd = bh->b_private; if (bd && bd->bd_tr) goto cannot_release; - if (buffer_pinned(bh) || buffer_dirty(bh)) - goto not_possible; + if (buffer_dirty(bh) || WARN_ON(buffer_pinned(bh))) + goto cannot_release; bh = bh->b_this_page; } while(bh != head); spin_unlock(&sdp->sd_ail_lock); @@ -1180,9 +1190,6 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) return try_to_free_buffers(page); -not_possible: /* Should never happen */ - WARN_ON(buffer_dirty(bh)); - WARN_ON(buffer_pinned(bh)); cannot_release: spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 6e2bec1..fc5da4c 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -82,8 +82,8 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, } if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << inode->i_blkbits, - (1 << BH_Uptodate)); + create_empty_buffers(page, BIT(inode->i_blkbits), + BIT(BH_Uptodate)); bh = page_buffers(page); @@ -690,7 +690,7 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi BUG_ON(!dblock); BUG_ON(!new); - bh.b_size = 1 << (inode->i_blkbits + (create ? 0 : 5)); + bh.b_size = BIT(inode->i_blkbits + (create ? 0 : 5)); ret = gfs2_block_map(inode, lblock, &bh, create); *extlen = bh.b_size >> inode->i_blkbits; *dblock = bh.b_blocknr; @@ -836,7 +836,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid, ip->i_inode.i_gid); - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); gfs2_dinode_out(ip, dibh->b_data); @@ -1063,7 +1063,7 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) } i_size_write(inode, newsize); - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); gfs2_dinode_out(ip, dibh->b_data); if (journaled) @@ -1142,7 +1142,7 @@ static int trunc_end(struct gfs2_inode *ip) gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); gfs2_ordered_del_inode(ip); } - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG; gfs2_trans_add_meta(ip->i_gl, dibh); @@ -1252,7 +1252,7 @@ static int do_grow(struct inode *inode, u64 size) goto do_end_trans; i_size_write(inode, size); - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index fcb59b2..3cdde5f 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -135,7 +135,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf, memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); if (ip->i_inode.i_size < offset + size) i_size_write(&ip->i_inode, offset + size); - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -233,7 +233,7 @@ out: if (ip->i_inode.i_size < offset + copied) i_size_write(&ip->i_inode, offset + copied); - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); @@ -351,7 +351,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) if (hc) return hc; - hsize = 1 << ip->i_depth; + hsize = BIT(ip->i_depth); hsize *= sizeof(__be64); if (hsize != i_size_read(&ip->i_inode)) { gfs2_consist_inode(ip); @@ -819,8 +819,8 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode, if (ip->i_diskflags & GFS2_DIF_EXHASH) { struct gfs2_leaf *leaf; - unsigned hsize = 1 << ip->i_depth; - unsigned index; + unsigned int hsize = BIT(ip->i_depth); + unsigned int index; u64 ln; if (hsize * sizeof(u64) != i_size_read(inode)) { gfs2_consist_inode(ip); @@ -872,7 +872,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, struct gfs2_leaf *leaf; struct gfs2_dirent *dent; struct qstr name = { .name = "" }; - struct timespec tv = CURRENT_TIME; + struct timespec tv = current_time(inode); error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); if (error) @@ -932,7 +932,7 @@ static int dir_make_exhash(struct inode *inode) return -ENOSPC; bn = bh->b_blocknr; - gfs2_assert(sdp, dip->i_entries < (1 << 16)); + gfs2_assert(sdp, dip->i_entries < BIT(16)); leaf->lf_entries = cpu_to_be16(dip->i_entries); /* Copy dirents */ @@ -1041,7 +1041,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) bn = nbh->b_blocknr; /* Compute the start and len of leaf pointers in the hash table. */ - len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth)); + len = BIT(dip->i_depth - be16_to_cpu(oleaf->lf_depth)); half_len = len >> 1; if (!half_len) { pr_warn("i_depth %u lf_depth %u index %u\n", @@ -1163,7 +1163,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) int x; int error = 0; - hsize = 1 << dip->i_depth; + hsize = BIT(dip->i_depth); hsize_bytes = hsize * sizeof(__be64); hc = gfs2_dir_get_hash_table(dip); @@ -1539,7 +1539,7 @@ static int dir_e_read(struct inode *inode, struct dir_context *ctx, int error = 0; unsigned depth = 0; - hsize = 1 << dip->i_depth; + hsize = BIT(dip->i_depth); hash = gfs2_dir_offset2hash(ctx->pos); index = hash >> (32 - dip->i_depth); @@ -1558,7 +1558,7 @@ static int dir_e_read(struct inode *inode, struct dir_context *ctx, if (error) break; - len = 1 << (dip->i_depth - depth); + len = BIT(dip->i_depth - depth); index = (index & ~(len - 1)) + len; } @@ -1816,7 +1816,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, gfs2_inum_out(nip, dent); dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode)); dent->de_rahead = cpu_to_be16(gfs2_inode_ra_len(nip)); - tv = CURRENT_TIME; + tv = current_time(&ip->i_inode); if (ip->i_diskflags & GFS2_DIF_EXHASH) { leaf = (struct gfs2_leaf *)bh->b_data; be16_add_cpu(&leaf->lf_entries, 1); @@ -1878,7 +1878,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) const struct qstr *name = &dentry->d_name; struct gfs2_dirent *dent, *prev = NULL; struct buffer_head *bh; - struct timespec tv = CURRENT_TIME; + struct timespec tv = current_time(&dip->i_inode); /* Returns _either_ the entry (if its first in block) or the previous entry otherwise */ @@ -1960,7 +1960,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, gfs2_trans_add_meta(dip->i_gl, bh); } - dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; + dip->i_inode.i_mtime = dip->i_inode.i_ctime = current_time(&dip->i_inode); gfs2_dinode_out(dip, bh->b_data); brelse(bh); return 0; @@ -2113,7 +2113,7 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip) u64 leaf_no; int error = 0, last; - hsize = 1 << dip->i_depth; + hsize = BIT(dip->i_depth); lp = gfs2_dir_get_hash_table(dip); if (IS_ERR(lp)) @@ -2126,7 +2126,7 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip) if (error) goto out; leaf = (struct gfs2_leaf *)bh->b_data; - len = 1 << (dip->i_depth - be16_to_cpu(leaf->lf_depth)); + len = BIT(dip->i_depth - be16_to_cpu(leaf->lf_depth)); next_index = (index & ~(len - 1)) + len; last = ((next_index >= hsize) ? 1 : 0); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 320e65e..e23ff70 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -395,9 +395,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); - /* Update file times before taking page lock */ - file_update_time(vma->vm_file); - ret = gfs2_rsqa_alloc(ip); if (ret) goto out; @@ -409,6 +406,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret) goto out_uninit; + /* Update file times before taking page lock */ + file_update_time(vma->vm_file); + set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); @@ -954,30 +954,6 @@ out_uninit: return ret; } -static ssize_t gfs2_file_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) -{ - struct inode *inode = in->f_mapping->host; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder gh; - int ret; - - inode_lock(inode); - - ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); - if (ret) { - inode_unlock(inode); - return ret; - } - - gfs2_glock_dq_uninit(&gh); - inode_unlock(inode); - - return generic_file_splice_read(in, ppos, pipe, len, flags); -} - - static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) @@ -1140,7 +1116,7 @@ const struct file_operations gfs2_file_fops = { .fsync = gfs2_fsync, .lock = gfs2_lock, .flock = gfs2_flock, - .splice_read = gfs2_file_splice_read, + .splice_read = generic_file_splice_read, .splice_write = gfs2_file_splice_write, .setlease = simple_nosetlease, .fallocate = gfs2_fallocate, @@ -1168,7 +1144,7 @@ const struct file_operations gfs2_file_fops_nolock = { .open = gfs2_open, .release = gfs2_release, .fsync = gfs2_fsync, - .splice_read = gfs2_file_splice_read, + .splice_read = generic_file_splice_read, .splice_write = gfs2_file_splice_write, .setlease = generic_setlease, .fallocate = gfs2_fallocate, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 3a90b2b..14cbf60 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -69,7 +69,7 @@ static atomic_t lru_count = ATOMIC_INIT(0); static DEFINE_SPINLOCK(lru_lock); #define GFS2_GL_HASH_SHIFT 15 -#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) +#define GFS2_GL_HASH_SIZE BIT(GFS2_GL_HASH_SHIFT) static struct rhashtable_params ht_parms = { .nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4, @@ -1781,7 +1781,13 @@ int __init gfs2_glock_init(void) return -ENOMEM; } - register_shrinker(&glock_shrinker); + ret = register_shrinker(&glock_shrinker); + if (ret) { + destroy_workqueue(gfs2_delete_workqueue); + destroy_workqueue(glock_workqueue); + rhashtable_destroy(&gl_hash_table); + return ret; + } return 0; } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index e4da0ec..fe3f849 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -187,6 +187,10 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, } gfs2_set_iop(inode); + + inode->i_atime.tv_sec = 0; + inode->i_atime.tv_nsec = 0; + unlock_new_inode(inode); } @@ -652,7 +656,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, set_nlink(inode, S_ISDIR(mode) ? 2 : 1); inode->i_rdev = dev; inode->i_size = size; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); gfs2_set_inode_blocks(inode, 1); munge_mode_uid_gid(dip, inode); check_and_update_goal(dip); @@ -979,7 +983,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, gfs2_trans_add_meta(ip->i_gl, dibh); inc_nlink(&ip->i_inode); - ip->i_inode.i_ctime = CURRENT_TIME; + ip->i_inode.i_ctime = current_time(&ip->i_inode); ihold(inode); d_instantiate(dentry, inode); mark_inode_dirty(inode); @@ -1063,7 +1067,7 @@ static int gfs2_unlink_inode(struct gfs2_inode *dip, return error; ip->i_entries = 0; - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); if (S_ISDIR(inode->i_mode)) clear_nlink(inode); else @@ -1326,7 +1330,7 @@ static int update_moved_ino(struct gfs2_inode *ip, struct gfs2_inode *ndip, error = gfs2_meta_inode_buffer(ip, &dibh); if (error) return error; - ip->i_inode.i_ctime = CURRENT_TIME; + ip->i_inode.i_ctime = current_time(&ip->i_inode); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -1932,7 +1936,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) goto out; - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); if (error) goto out; @@ -2036,10 +2040,7 @@ const struct inode_operations gfs2_file_iops = { .permission = gfs2_permission, .setattr = gfs2_setattr, .getattr = gfs2_getattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = gfs2_listxattr, - .removexattr = generic_removexattr, .fiemap = gfs2_fiemap, .get_acl = gfs2_get_acl, .set_acl = gfs2_set_acl, @@ -2054,14 +2055,11 @@ const struct inode_operations gfs2_dir_iops = { .mkdir = gfs2_mkdir, .rmdir = gfs2_unlink, .mknod = gfs2_mknod, - .rename2 = gfs2_rename2, + .rename = gfs2_rename2, .permission = gfs2_permission, .setattr = gfs2_setattr, .getattr = gfs2_getattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = gfs2_listxattr, - .removexattr = generic_removexattr, .fiemap = gfs2_fiemap, .get_acl = gfs2_get_acl, .set_acl = gfs2_set_acl, @@ -2074,10 +2072,7 @@ const struct inode_operations gfs2_symlink_iops = { .permission = gfs2_permission, .setattr = gfs2_setattr, .getattr = gfs2_getattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = gfs2_listxattr, - .removexattr = generic_removexattr, .fiemap = gfs2_fiemap, }; diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 7710dfd..aace8ce 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -85,7 +85,7 @@ static inline int gfs2_check_internal_file_size(struct inode *inode, u64 size = i_size_read(inode); if (size < minsize || size > maxsize) goto err; - if (size & ((1 << inode->i_blkbits) - 1)) + if (size & (BIT(inode->i_blkbits) - 1)) goto err; return 0; err: diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 74fd013..67d1fc4 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -145,7 +145,9 @@ static int __init init_gfs2_fs(void) if (!gfs2_qadata_cachep) goto fail; - register_shrinker(&gfs2_qd_shrinker); + error = register_shrinker(&gfs2_qd_shrinker); + if (error) + goto fail; error = register_filesystem(&gfs2_fs_type); if (error) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 950b8be..373639a5 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -216,23 +216,26 @@ static void gfs2_meta_read_endio(struct bio *bio) static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[], int num) { - struct buffer_head *bh = bhs[0]; - struct bio *bio; - int i; - - if (!num) - return; - - bio = bio_alloc(GFP_NOIO, num); - bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio->bi_bdev = bh->b_bdev; - for (i = 0; i < num; i++) { - bh = bhs[i]; - bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); + while (num > 0) { + struct buffer_head *bh = *bhs; + struct bio *bio; + + bio = bio_alloc(GFP_NOIO, num); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + while (num > 0) { + bh = *bhs; + if (!bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))) { + BUG_ON(bio->bi_iter.bi_size == 0); + break; + } + bhs++; + num--; + } + bio->bi_end_io = gfs2_meta_read_endio; + bio_set_op_attrs(bio, op, op_flags); + submit_bio(bio); } - bio->bi_end_io = gfs2_meta_read_endio; - bio_set_op_attrs(bio, op, op_flags); - submit_bio(bio); } /** diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index ef1e182..ff72ac6 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -58,7 +58,7 @@ static void gfs2_tune_init(struct gfs2_tune *gt) gt->gt_quota_scale_num = 1; gt->gt_quota_scale_den = 1; gt->gt_new_files_jdata = 0; - gt->gt_max_readahead = 1 << 18; + gt->gt_max_readahead = BIT(18); gt->gt_complain_secs = 10; } @@ -284,7 +284,7 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT; - sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; + sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift); sdp->sd_diptrs = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) / sizeof(u64); sdp->sd_inptrs = (sdp->sd_sb.sb_bsize - @@ -302,7 +302,7 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) /* Compute maximum reservation required to add a entry to a directory */ - hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH), + hash_blocks = DIV_ROUND_UP(sizeof(u64) * BIT(GFS2_DIR_MAX_DEPTH), sdp->sd_jbsize); ind_blocks = 0; @@ -1089,7 +1089,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits; sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT; - sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; + sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift); sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit; sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 77930ca..c2ca956 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -75,7 +75,7 @@ #include "util.h" #define GFS2_QD_HASH_SHIFT 12 -#define GFS2_QD_HASH_SIZE (1 << GFS2_QD_HASH_SHIFT) +#define GFS2_QD_HASH_SIZE BIT(GFS2_QD_HASH_SHIFT) #define GFS2_QD_HASH_MASK (GFS2_QD_HASH_SIZE - 1) /* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */ @@ -384,7 +384,7 @@ static int bh_get(struct gfs2_quota_data *qd) block = qd->qd_slot / sdp->sd_qc_per_block; offset = qd->qd_slot % sdp->sd_qc_per_block; - bh_map.b_size = 1 << ip->i_inode.i_blkbits; + bh_map.b_size = BIT(ip->i_inode.i_blkbits); error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0); if (error) goto fail; @@ -854,7 +854,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, size = loc + sizeof(struct gfs2_quota); if (size > inode->i_size) i_size_write(inode, size); - inode->i_mtime = inode->i_atime = CURRENT_TIME; + inode->i_mtime = inode->i_atime = current_time(inode); mark_inode_dirty(inode); set_bit(QDF_REFRESH, &qd->qd_flags); } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 3a7e60b..e3ee387 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -359,7 +359,7 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd) struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); u64 size = i_size_read(jd->jd_inode); - if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30)) + if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, BIT(30))) return -EIO; jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift; diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 3a28535..a4a5770 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -309,7 +309,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { - ip->i_inode.i_ctime = CURRENT_TIME; + ip->i_inode.i_ctime = current_time(&ip->i_inode); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -775,7 +775,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { - ip->i_inode.i_ctime = CURRENT_TIME; + ip->i_inode.i_ctime = current_time(&ip->i_inode); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -910,7 +910,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, error = gfs2_meta_inode_buffer(ip, &dibh); if (error) goto out; - ip->i_inode.i_ctime = CURRENT_TIME; + ip->i_inode.i_ctime = current_time(&ip->i_inode); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -1133,7 +1133,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { - ip->i_inode.i_ctime = CURRENT_TIME; + ip->i_inode.i_ctime = current_time(&ip->i_inode); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c index d9a8691..0933600 100644 --- a/fs/hfs/attr.c +++ b/fs/hfs/attr.c @@ -13,9 +13,13 @@ #include "hfs_fs.h" #include "btree.h" -int hfs_setxattr(struct dentry *unused, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) +enum hfs_xattr_type { + HFS_TYPE, + HFS_CREATOR, +}; + +static int __hfs_setxattr(struct inode *inode, enum hfs_xattr_type type, + const void *value, size_t size, int flags) { struct hfs_find_data fd; hfs_cat_rec rec; @@ -36,18 +40,22 @@ int hfs_setxattr(struct dentry *unused, struct inode *inode, sizeof(struct hfs_cat_file)); file = &rec.file; - if (!strcmp(name, "hfs.type")) { + switch (type) { + case HFS_TYPE: if (size == 4) memcpy(&file->UsrWds.fdType, value, 4); else res = -ERANGE; - } else if (!strcmp(name, "hfs.creator")) { + break; + + case HFS_CREATOR: if (size == 4) memcpy(&file->UsrWds.fdCreator, value, 4); else res = -ERANGE; - } else - res = -EOPNOTSUPP; + break; + } + if (!res) hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_file)); @@ -56,8 +64,8 @@ out: return res; } -ssize_t hfs_getxattr(struct dentry *unused, struct inode *inode, - const char *name, void *value, size_t size) +static ssize_t __hfs_getxattr(struct inode *inode, enum hfs_xattr_type type, + void *value, size_t size) { struct hfs_find_data fd; hfs_cat_rec rec; @@ -80,41 +88,64 @@ ssize_t hfs_getxattr(struct dentry *unused, struct inode *inode, } file = &rec.file; - if (!strcmp(name, "hfs.type")) { + switch (type) { + case HFS_TYPE: if (size >= 4) { memcpy(value, &file->UsrWds.fdType, 4); res = 4; } else res = size ? -ERANGE : 4; - } else if (!strcmp(name, "hfs.creator")) { + break; + + case HFS_CREATOR: if (size >= 4) { memcpy(value, &file->UsrWds.fdCreator, 4); res = 4; } else res = size ? -ERANGE : 4; - } else - res = -ENODATA; + break; + } + out: if (size) hfs_find_exit(&fd); return res; } -#define HFS_ATTRLIST_SIZE (sizeof("hfs.creator")+sizeof("hfs.type")) - -ssize_t hfs_listxattr(struct dentry *dentry, char *buffer, size_t size) +static int hfs_xattr_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *value, size_t size) { - struct inode *inode = d_inode(dentry); + return __hfs_getxattr(inode, handler->flags, value, size); +} - if (!S_ISREG(inode->i_mode) || HFS_IS_RSRC(inode)) +static int hfs_xattr_set(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, size_t size, + int flags) +{ + if (!value) return -EOPNOTSUPP; - if (!buffer || !size) - return HFS_ATTRLIST_SIZE; - if (size < HFS_ATTRLIST_SIZE) - return -ERANGE; - strcpy(buffer, "hfs.type"); - strcpy(buffer + sizeof("hfs.type"), "hfs.creator"); - - return HFS_ATTRLIST_SIZE; + return __hfs_setxattr(inode, handler->flags, value, size, flags); } + +static const struct xattr_handler hfs_creator_handler = { + .name = "hfs.creator", + .flags = HFS_CREATOR, + .get = hfs_xattr_get, + .set = hfs_xattr_set, +}; + +static const struct xattr_handler hfs_type_handler = { + .name = "hfs.type", + .flags = HFS_TYPE, + .get = hfs_xattr_get, + .set = hfs_xattr_set, +}; + +const struct xattr_handler *hfs_xattr_handlers[] = { + &hfs_creator_handler, + &hfs_type_handler, + NULL +}; diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index 8f4afd3..8a66405 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -125,7 +125,7 @@ int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct i goto err1; dir->i_size++; - dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + dir->i_mtime = dir->i_ctime = current_time(dir); mark_inode_dirty(dir); hfs_find_exit(&fd); return 0; @@ -261,7 +261,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str) } dir->i_size--; - dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + dir->i_mtime = dir->i_ctime = current_time(dir); mark_inode_dirty(dir); res = 0; out: @@ -321,7 +321,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name, if (err) goto out; dst_dir->i_size++; - dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC; + dst_dir->i_mtime = dst_dir->i_ctime = current_time(dst_dir); mark_inode_dirty(dst_dir); /* finally remove the old entry */ @@ -333,7 +333,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name, if (err) goto out; src_dir->i_size--; - src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC; + src_dir->i_mtime = src_dir->i_ctime = current_time(src_dir); mark_inode_dirty(src_dir); type = entry.type; diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 163190e..5de5c48 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -268,7 +268,7 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry) if (res) return res; clear_nlink(inode); - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); hfs_delete_inode(inode); mark_inode_dirty(inode); return 0; @@ -286,10 +286,14 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry) * XXX: how do you handle must_be dir? */ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { int res; + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + /* Unlink destination if it already exists */ if (d_really_is_positive(new_dentry)) { res = hfs_remove(new_dir, new_dentry); diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 16f5172..4cdec5a 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -212,11 +212,7 @@ extern void hfs_evict_inode(struct inode *); extern void hfs_delete_inode(struct inode *); /* attr.c */ -extern int hfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, - const void *value, size_t size, int flags); -extern ssize_t hfs_getxattr(struct dentry *dentry, struct inode *inode, - const char *name, void *value, size_t size); -extern ssize_t hfs_listxattr(struct dentry *dentry, char *buffer, size_t size); +extern const struct xattr_handler *hfs_xattr_handlers[]; /* mdb.c */ extern int hfs_mdb_get(struct super_block *); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index c6a3241..f776acf 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -15,6 +15,7 @@ #include <linux/mpage.h> #include <linux/sched.h> #include <linux/uio.h> +#include <linux/xattr.h> #include "hfs_fs.h" #include "btree.h" @@ -193,7 +194,7 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); set_nlink(inode, 1); - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); HFS_I(inode)->flags = 0; HFS_I(inode)->rsrc_inode = NULL; HFS_I(inode)->fs_blocks = 0; @@ -605,7 +606,7 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr) struct hfs_sb_info *hsb = HFS_SB(inode->i_sb); int error; - error = inode_change_ok(inode, attr); /* basic permission checks */ + error = setattr_prepare(dentry, attr); /* basic permission checks */ if (error) return error; @@ -687,7 +688,5 @@ static const struct file_operations hfs_file_operations = { static const struct inode_operations hfs_file_inode_operations = { .lookup = hfs_file_lookup, .setattr = hfs_inode_setattr, - .setxattr = hfs_setxattr, - .getxattr = hfs_getxattr, - .listxattr = hfs_listxattr, + .listxattr = generic_listxattr, }; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 1ca95c2..bf6304a 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -406,6 +406,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) } sb->s_op = &hfs_super_operations; + sb->s_xattr = hfs_xattr_handlers; sb->s_flags |= MS_NODIRATIME; mutex_init(&sbi->bitmap_lock); diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index 142534d..a5e00f7 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -303,7 +303,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, dir->i_size++; if (S_ISDIR(inode->i_mode)) hfsplus_subfolders_inc(dir); - dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + dir->i_mtime = dir->i_ctime = current_time(dir); hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY); hfs_find_exit(&fd); @@ -400,7 +400,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str) dir->i_size--; if (type == HFSPLUS_FOLDER) hfsplus_subfolders_dec(dir); - dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + dir->i_mtime = dir->i_ctime = current_time(dir); hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY); if (type == HFSPLUS_FILE || type == HFSPLUS_FOLDER) { @@ -469,7 +469,7 @@ int hfsplus_rename_cat(u32 cnid, dst_dir->i_size++; if (type == HFSPLUS_FOLDER) hfsplus_subfolders_inc(dst_dir); - dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC; + dst_dir->i_mtime = dst_dir->i_ctime = current_time(dst_dir); /* finally remove the old entry */ err = hfsplus_cat_build_key(sb, src_fd.search_key, @@ -486,7 +486,7 @@ int hfsplus_rename_cat(u32 cnid, src_dir->i_size--; if (type == HFSPLUS_FOLDER) hfsplus_subfolders_dec(src_dir); - src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC; + src_dir->i_mtime = src_dir->i_ctime = current_time(src_dir); /* remove old thread entry */ hfsplus_cat_build_key_with_cnid(sb, src_fd.search_key, cnid); diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 42e1286..31d5e3f 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -347,7 +347,7 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, inc_nlink(inode); hfsplus_instantiate(dst_dentry, inode, cnid); ihold(inode); - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); mark_inode_dirty(inode); sbi->file_count++; hfsplus_mark_mdb_dirty(dst_dir->i_sb); @@ -406,7 +406,7 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) hfsplus_delete_inode(inode); } else sbi->file_count--; - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); mark_inode_dirty(inode); out: mutex_unlock(&sbi->vh_mutex); @@ -427,7 +427,7 @@ static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry) if (res) goto out; clear_nlink(inode); - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); hfsplus_delete_inode(inode); mark_inode_dirty(inode); out: @@ -530,10 +530,14 @@ static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) } static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { int res; + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + /* Unlink destination if it already exists */ if (d_really_is_positive(new_dentry)) { if (d_is_dir(new_dentry)) @@ -562,10 +566,7 @@ const struct inode_operations hfsplus_dir_inode_operations = { .symlink = hfsplus_symlink, .mknod = hfsplus_mknod, .rename = hfsplus_rename, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = hfsplus_listxattr, - .removexattr = generic_removexattr, #ifdef CONFIG_HFSPLUS_FS_POSIX_ACL .get_acl = hfsplus_get_posix_acl, .set_acl = hfsplus_set_posix_acl, diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 19462d7..2e796f8 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -245,7 +245,7 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) struct inode *inode = d_inode(dentry); int error; - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); if (error) return error; @@ -333,10 +333,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, static const struct inode_operations hfsplus_file_inode_operations = { .setattr = hfsplus_setattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = hfsplus_listxattr, - .removexattr = generic_removexattr, #ifdef CONFIG_HFSPLUS_FS_POSIX_ACL .get_acl = hfsplus_get_posix_acl, .set_acl = hfsplus_set_posix_acl, @@ -369,7 +366,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode) inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); set_nlink(inode, 1); - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); hip = HFSPLUS_I(inode); INIT_LIST_HEAD(&hip->open_dir_list); diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index 32a49e2..99627f8 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -122,7 +122,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) else hip->userflags &= ~HFSPLUS_FLG_NODUMP; - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); mark_inode_dirty(inode); out_unlock_inode: diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c index ab7ea25..9b92058 100644 --- a/fs/hfsplus/posix_acl.c +++ b/fs/hfsplus/posix_acl.c @@ -65,8 +65,8 @@ int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, case ACL_TYPE_ACCESS: xattr_name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { - err = posix_acl_equiv_mode(acl, &inode->i_mode); - if (err < 0) + err = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (err) return err; } err = 0; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 90e46cd..23e15ea 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -812,7 +812,7 @@ static int hostfs_setattr(struct dentry *dentry, struct iattr *attr) int fd = HOSTFS_I(inode)->fd; - err = inode_change_ok(inode, attr); + err = setattr_prepare(dentry, attr); if (err) return err; @@ -885,7 +885,7 @@ static const struct inode_operations hostfs_dir_iops = { .mkdir = hostfs_mkdir, .rmdir = hostfs_rmdir, .mknod = hostfs_mknod, - .rename2 = hostfs_rename2, + .rename = hostfs_rename2, .permission = hostfs_permission, .setattr = hostfs_setattr, }; diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index d3bcdd9..b3be1b5 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -189,6 +189,11 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping, block, hpfs_get_block); } +static int hpfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) +{ + return generic_block_fiemap(inode, fieinfo, start, len, hpfs_get_block); +} + const struct address_space_operations hpfs_aops = { .readpage = hpfs_readpage, .writepage = hpfs_writepage, @@ -214,4 +219,5 @@ const struct file_operations hpfs_file_ops = const struct inode_operations hpfs_file_iops = { .setattr = hpfs_setattr, + .fiemap = hpfs_fiemap, }; diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 1f3c6d7..b9c724e 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -273,7 +273,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) goto out_unlock; - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); if (error) goto out_unlock; diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index bb8d67e..f30c144 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -507,7 +507,8 @@ const struct address_space_operations hpfs_symlink_aops = { }; static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { const unsigned char *old_name = old_dentry->d_name.name; unsigned old_len = old_dentry->d_name.len; @@ -524,6 +525,9 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct fnode *fnode; int err; + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + if ((err = hpfs_chk_name(new_name, &new_len))) return err; err = 0; hpfs_adjust_length(old_name, &old_len); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 4ea71eb..4fb7b10 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -416,7 +416,6 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, for (i = 0; i < pagevec_count(&pvec); ++i) { struct page *page = pvec.pages[i]; - bool rsv_on_error; u32 hash; /* @@ -458,18 +457,17 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, * cache (remove_huge_page) BEFORE removing the * region/reserve map (hugetlb_unreserve_pages). In * rare out of memory conditions, removal of the - * region/reserve map could fail. Before free'ing - * the page, note PagePrivate which is used in case - * of error. + * region/reserve map could fail. Correspondingly, + * the subpool and global reserve usage count can need + * to be adjusted. */ - rsv_on_error = !PagePrivate(page); + VM_BUG_ON(PagePrivate(page)); remove_huge_page(page); freed++; if (!truncate_op) { if (unlikely(hugetlb_unreserve_pages(inode, next, next + 1, 1))) - hugetlb_fix_reserve_counts(inode, - rsv_on_error); + hugetlb_fix_reserve_counts(inode); } unlock_page(page); @@ -657,7 +655,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) i_size_write(inode, offset + len); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); out: inode_unlock(inode); return error; @@ -672,7 +670,7 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) BUG_ON(!inode); - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); if (error) return error; @@ -702,7 +700,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, inode->i_mode = S_IFDIR | config->mode; inode->i_uid = config->uid; inode->i_gid = config->gid; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); info = HUGETLBFS_I(inode); mpol_shared_policy_init(&info->policy, NULL); inode->i_op = &hugetlbfs_dir_inode_operations; @@ -741,7 +739,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inode->i_mapping->private_data = resv_map; info = HUGETLBFS_I(inode); /* @@ -790,7 +788,7 @@ static int hugetlbfs_mknod(struct inode *dir, inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev); if (inode) { - dir->i_ctime = dir->i_mtime = CURRENT_TIME; + dir->i_ctime = dir->i_mtime = current_time(dir); d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ error = 0; @@ -827,7 +825,7 @@ static int hugetlbfs_symlink(struct inode *dir, } else iput(inode); } - dir->i_ctime = dir->i_mtime = CURRENT_TIME; + dir->i_ctime = dir->i_mtime = current_time(dir); return error; } @@ -140,6 +140,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_fop = &no_open_fops; inode->__i_nlink = 1; inode->i_opflags = 0; + if (sb->s_xattr) + inode->i_opflags |= IOP_XATTR; i_uid_write(inode, 0); i_gid_write(inode, 0); atomic_set(&inode->i_writecount, 0); @@ -1021,13 +1023,17 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode; - +again: spin_lock(&inode_hash_lock); inode = find_inode(sb, head, test, data); spin_unlock(&inode_hash_lock); if (inode) { wait_on_inode(inode); + if (unlikely(inode_unhashed(inode))) { + iput(inode); + goto again; + } return inode; } @@ -1064,6 +1070,10 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, destroy_inode(inode); inode = old; wait_on_inode(inode); + if (unlikely(inode_unhashed(inode))) { + iput(inode); + goto again; + } } return inode; @@ -1091,12 +1101,16 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; - +again: spin_lock(&inode_hash_lock); inode = find_inode_fast(sb, head, ino); spin_unlock(&inode_hash_lock); if (inode) { wait_on_inode(inode); + if (unlikely(inode_unhashed(inode))) { + iput(inode); + goto again; + } return inode; } @@ -1131,6 +1145,10 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) destroy_inode(inode); inode = old; wait_on_inode(inode); + if (unlikely(inode_unhashed(inode))) { + iput(inode); + goto again; + } } return inode; } @@ -1266,10 +1284,16 @@ EXPORT_SYMBOL(ilookup5_nowait); struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { - struct inode *inode = ilookup5_nowait(sb, hashval, test, data); - - if (inode) + struct inode *inode; +again: + inode = ilookup5_nowait(sb, hashval, test, data); + if (inode) { wait_on_inode(inode); + if (unlikely(inode_unhashed(inode))) { + iput(inode); + goto again; + } + } return inode; } EXPORT_SYMBOL(ilookup5); @@ -1286,13 +1310,18 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; - +again: spin_lock(&inode_hash_lock); inode = find_inode_fast(sb, head, ino); spin_unlock(&inode_hash_lock); - if (inode) + if (inode) { wait_on_inode(inode); + if (unlikely(inode_unhashed(inode))) { + iput(inode); + goto again; + } + } return inode; } EXPORT_SYMBOL(ilookup); @@ -1536,16 +1565,36 @@ sector_t bmap(struct inode *inode, sector_t block) EXPORT_SYMBOL(bmap); /* + * Update times in overlayed inode from underlying real inode + */ +static void update_ovl_inode_times(struct dentry *dentry, struct inode *inode, + bool rcu) +{ + if (!rcu) { + struct inode *realinode = d_real_inode(dentry); + + if (unlikely(inode != realinode) && + (!timespec_equal(&inode->i_mtime, &realinode->i_mtime) || + !timespec_equal(&inode->i_ctime, &realinode->i_ctime))) { + inode->i_mtime = realinode->i_mtime; + inode->i_ctime = realinode->i_ctime; + } + } +} + +/* * With relative atime, only update atime if the previous atime is * earlier than either the ctime or mtime or if at least a day has * passed since the last atime update. */ -static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, - struct timespec now) +static int relatime_need_update(const struct path *path, struct inode *inode, + struct timespec now, bool rcu) { - if (!(mnt->mnt_flags & MNT_RELATIME)) + if (!(path->mnt->mnt_flags & MNT_RELATIME)) return 1; + + update_ovl_inode_times(path->dentry, inode, rcu); /* * Is mtime younger than atime? If yes, update atime: */ @@ -1612,7 +1661,8 @@ static int update_time(struct inode *inode, struct timespec *time, int flags) * This function automatically handles read only file systems and media, * as well as the "noatime" flag and inode specific "noatime" markers. */ -bool atime_needs_update(const struct path *path, struct inode *inode) +bool __atime_needs_update(const struct path *path, struct inode *inode, + bool rcu) { struct vfsmount *mnt = path->mnt; struct timespec now; @@ -1636,9 +1686,9 @@ bool atime_needs_update(const struct path *path, struct inode *inode) if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) return false; - now = current_fs_time(inode->i_sb); + now = current_time(inode); - if (!relatime_need_update(mnt, inode, now)) + if (!relatime_need_update(path, inode, now, rcu)) return false; if (timespec_equal(&inode->i_atime, &now)) @@ -1653,7 +1703,7 @@ void touch_atime(const struct path *path) struct inode *inode = d_inode(path->dentry); struct timespec now; - if (!atime_needs_update(path, inode)) + if (!__atime_needs_update(path, inode, false)) return; if (!sb_start_write_trylock(inode->i_sb)) @@ -1670,7 +1720,7 @@ void touch_atime(const struct path *path) * We may also fail on filesystems that have the ability to make parts * of the fs read only, e.g. subvolumes in Btrfs. */ - now = current_fs_time(inode->i_sb); + now = current_time(inode); update_time(inode, &now, S_ATIME); __mnt_drop_write(mnt); skip_update: @@ -1793,7 +1843,7 @@ int file_update_time(struct file *file) if (IS_NOCMTIME(inode)) return 0; - now = current_fs_time(inode->i_sb); + now = current_time(inode); if (!timespec_equal(&inode->i_mtime, &now)) sync_it = S_MTIME; @@ -2049,3 +2099,26 @@ void inode_nohighmem(struct inode *inode) mapping_set_gfp_mask(inode->i_mapping, GFP_USER); } EXPORT_SYMBOL(inode_nohighmem); + +/** + * current_time - Return FS time + * @inode: inode. + * + * Return the current time truncated to the time granularity supported by + * the fs. + * + * Note that inode and inode->sb cannot be NULL. + * Otherwise, the function warns and returns time without truncation. + */ +struct timespec current_time(struct inode *inode) +{ + struct timespec now = current_kernel_time(); + + if (unlikely(!inode->i_sb)) { + WARN(1, "current_time() called with uninitialized super_block in the inode"); + return now; + } + + return timespec_trunc(now, inode->i_sb->s_time_gran); +} +EXPORT_SYMBOL(current_time); diff --git a/fs/internal.h b/fs/internal.h index ba07376..f4da334 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -12,6 +12,7 @@ struct super_block; struct file_system_type; struct iomap; +struct iomap_ops; struct linux_binprm; struct path; struct mount; @@ -120,6 +121,15 @@ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); extern void inode_add_lru(struct inode *inode); extern int dentry_needs_remove_privs(struct dentry *dentry); +extern bool __atime_needs_update(const struct path *, struct inode *, bool); +static inline bool atime_needs_update_rcu(const struct path *path, + struct inode *inode) +{ + return __atime_needs_update(path, inode, true); +} + +extern bool atime_needs_update_rcu(const struct path *, struct inode *); + /* * fs-writeback.c */ @@ -156,7 +166,7 @@ extern void mnt_pin_kill(struct mount *m); /* * fs/nsfs.c */ -extern struct dentry_operations ns_dentry_operations; +extern const struct dentry_operations ns_dentry_operations; /* * fs/ioctl.c @@ -164,3 +174,13 @@ extern struct dentry_operations ns_dentry_operations; extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd, unsigned long arg); extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); + +/* + * iomap support: + */ +typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, + void *data, struct iomap *iomap); + +loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, + unsigned flags, struct iomap_ops *ops, void *data, + iomap_actor_t actor); @@ -27,9 +27,6 @@ #include <linux/dax.h> #include "internal.h" -typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, - void *data, struct iomap *iomap); - /* * Execute a iomap write on a segment of the mapping that spans a * contiguous range of pages that have identical block mapping state. @@ -41,7 +38,7 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, * resources they require in the iomap_begin call, and release them in the * iomap_end call. */ -static loff_t +loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, struct iomap_ops *ops, void *data, iomap_actor_t actor) { @@ -252,6 +249,88 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter, } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); +static struct page * +__iomap_read_page(struct inode *inode, loff_t offset) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + + page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL); + if (IS_ERR(page)) + return page; + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + return page; +} + +static loff_t +iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, + struct iomap *iomap) +{ + long status = 0; + ssize_t written = 0; + + do { + struct page *page, *rpage; + unsigned long offset; /* Offset into pagecache page */ + unsigned long bytes; /* Bytes to write to page */ + + offset = (pos & (PAGE_SIZE - 1)); + bytes = min_t(unsigned long, PAGE_SIZE - offset, length); + + rpage = __iomap_read_page(inode, pos); + if (IS_ERR(rpage)) + return PTR_ERR(rpage); + + status = iomap_write_begin(inode, pos, bytes, + AOP_FLAG_NOFS | AOP_FLAG_UNINTERRUPTIBLE, + &page, iomap); + put_page(rpage); + if (unlikely(status)) + return status; + + WARN_ON_ONCE(!PageUptodate(page)); + + status = iomap_write_end(inode, pos, bytes, bytes, page); + if (unlikely(status <= 0)) { + if (WARN_ON_ONCE(status == 0)) + return -EIO; + return status; + } + + cond_resched(); + + pos += status; + written += status; + length -= status; + + balance_dirty_pages_ratelimited(inode->i_mapping); + } while (length); + + return written; +} + +int +iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len, + struct iomap_ops *ops) +{ + loff_t ret; + + while (len) { + ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL, + iomap_dirty_actor); + if (ret <= 0) + return ret; + pos += ret; + len -= ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(iomap_file_dirty); + static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, unsigned bytes, struct iomap *iomap) { @@ -354,8 +433,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, struct page *page = data; int ret; - ret = __block_write_begin_int(page, pos & ~PAGE_MASK, length, - NULL, iomap); + ret = __block_write_begin_int(page, pos, length, NULL, iomap); if (ret) return ret; @@ -430,6 +508,8 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, if (iomap->flags & IOMAP_F_MERGED) flags |= FIEMAP_EXTENT_MERGED; + if (iomap->flags & IOMAP_F_SHARED) + flags |= FIEMAP_EXTENT_SHARED; return fiemap_fill_next_extent(fi, iomap->offset, iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0, @@ -480,7 +560,7 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, } while (len > 0) { - ret = iomap_apply(inode, start, len, 0, ops, &ctx, + ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx, iomap_fiemap_actor); /* inode with no (attribute) mapping will give ENOENT */ if (ret == -ENOENT) diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index ad0c745..871c8b3 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -687,6 +687,11 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent) pri_bh = NULL; root_found: + /* We don't support read-write mounts */ + if (!(s->s_flags & MS_RDONLY)) { + error = -EACCES; + goto out_freebh; + } if (joliet_level && (pri == NULL || !opt.rock)) { /* This is the case of Joliet with the norock mount flag. @@ -1501,9 +1506,6 @@ struct inode *__isofs_iget(struct super_block *sb, static struct dentry *isofs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - /* We don't support read-write mounts */ - if (!(flags & MS_RDONLY)) - return ERR_PTR(-EACCES); return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super); } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 5bb565f..31f8ca0 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -269,8 +269,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal, * filemap_fdatawait_range(), set it again so * that user process can get -EIO from fsync(). */ - set_bit(AS_EIO, - &jinode->i_vfs_inode->i_mapping->flags); + mapping_set_error(jinode->i_vfs_inode->i_mapping, -EIO); if (!ret) ret = err; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 46261a6..927da49 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1090,11 +1090,15 @@ static void jbd2_stats_proc_exit(journal_t *journal) * very few fields yet: that has to wait until we have created the * journal structures from from scratch, or loaded them from disk. */ -static journal_t * journal_init_common (void) +static journal_t *journal_init_common(struct block_device *bdev, + struct block_device *fs_dev, + unsigned long long start, int len, int blocksize) { static struct lock_class_key jbd2_trans_commit_key; journal_t *journal; int err; + struct buffer_head *bh; + int n; journal = kzalloc(sizeof(*journal), GFP_KERNEL); if (!journal) @@ -1131,6 +1135,32 @@ static journal_t * journal_init_common (void) lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle", &jbd2_trans_commit_key, 0); + /* journal descriptor can store up to n blocks -bzzz */ + journal->j_blocksize = blocksize; + journal->j_dev = bdev; + journal->j_fs_dev = fs_dev; + journal->j_blk_offset = start; + journal->j_maxlen = len; + n = journal->j_blocksize / sizeof(journal_block_tag_t); + journal->j_wbufsize = n; + journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *), + GFP_KERNEL); + if (!journal->j_wbuf) { + kfree(journal); + return NULL; + } + + bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize); + if (!bh) { + pr_err("%s: Cannot get buffer for journal superblock\n", + __func__); + kfree(journal->j_wbuf); + kfree(journal); + return NULL; + } + journal->j_sb_buffer = bh; + journal->j_superblock = (journal_superblock_t *)bh->b_data; + return journal; } @@ -1157,51 +1187,21 @@ static journal_t * journal_init_common (void) * range of blocks on an arbitrary block device. * */ -journal_t * jbd2_journal_init_dev(struct block_device *bdev, +journal_t *jbd2_journal_init_dev(struct block_device *bdev, struct block_device *fs_dev, unsigned long long start, int len, int blocksize) { - journal_t *journal = journal_init_common(); - struct buffer_head *bh; - int n; + journal_t *journal; + journal = journal_init_common(bdev, fs_dev, start, len, blocksize); if (!journal) return NULL; - /* journal descriptor can store up to n blocks -bzzz */ - journal->j_blocksize = blocksize; - journal->j_dev = bdev; - journal->j_fs_dev = fs_dev; - journal->j_blk_offset = start; - journal->j_maxlen = len; bdevname(journal->j_dev, journal->j_devname); strreplace(journal->j_devname, '/', '!'); jbd2_stats_proc_init(journal); - n = journal->j_blocksize / sizeof(journal_block_tag_t); - journal->j_wbufsize = n; - journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); - if (!journal->j_wbuf) { - printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", - __func__); - goto out_err; - } - - bh = __getblk(journal->j_dev, start, journal->j_blocksize); - if (!bh) { - printk(KERN_ERR - "%s: Cannot get buffer for journal superblock\n", - __func__); - goto out_err; - } - journal->j_sb_buffer = bh; - journal->j_superblock = (journal_superblock_t *)bh->b_data; return journal; -out_err: - kfree(journal->j_wbuf); - jbd2_stats_proc_exit(journal); - kfree(journal); - return NULL; } /** @@ -1212,67 +1212,36 @@ out_err: * the journal. The inode must exist already, must support bmap() and * must have all data blocks preallocated. */ -journal_t * jbd2_journal_init_inode (struct inode *inode) +journal_t *jbd2_journal_init_inode(struct inode *inode) { - struct buffer_head *bh; - journal_t *journal = journal_init_common(); + journal_t *journal; char *p; - int err; - int n; unsigned long long blocknr; + blocknr = bmap(inode, 0); + if (!blocknr) { + pr_err("%s: Cannot locate journal superblock\n", + __func__); + return NULL; + } + + jbd_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n", + inode->i_sb->s_id, inode->i_ino, (long long) inode->i_size, + inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); + + journal = journal_init_common(inode->i_sb->s_bdev, inode->i_sb->s_bdev, + blocknr, inode->i_size >> inode->i_sb->s_blocksize_bits, + inode->i_sb->s_blocksize); if (!journal) return NULL; - journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev; journal->j_inode = inode; bdevname(journal->j_dev, journal->j_devname); p = strreplace(journal->j_devname, '/', '!'); sprintf(p, "-%lu", journal->j_inode->i_ino); - jbd_debug(1, - "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", - journal, inode->i_sb->s_id, inode->i_ino, - (long long) inode->i_size, - inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); - - journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; - journal->j_blocksize = inode->i_sb->s_blocksize; jbd2_stats_proc_init(journal); - /* journal descriptor can store up to n blocks -bzzz */ - n = journal->j_blocksize / sizeof(journal_block_tag_t); - journal->j_wbufsize = n; - journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); - if (!journal->j_wbuf) { - printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", - __func__); - goto out_err; - } - - err = jbd2_journal_bmap(journal, 0, &blocknr); - /* If that failed, give up */ - if (err) { - printk(KERN_ERR "%s: Cannot locate journal superblock\n", - __func__); - goto out_err; - } - - bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize); - if (!bh) { - printk(KERN_ERR - "%s: Cannot get buffer for journal superblock\n", - __func__); - goto out_err; - } - journal->j_sb_buffer = bh; - journal->j_superblock = (journal_superblock_t *)bh->b_data; - return journal; -out_err: - kfree(journal->j_wbuf); - jbd2_stats_proc_exit(journal); - kfree(journal); - return NULL; } /* diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index b5bc3e2..e165266 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -159,6 +159,7 @@ static void wait_transaction_locked(journal_t *journal) read_unlock(&journal->j_state_lock); if (need_to_start) jbd2_log_start_commit(journal, tid); + jbd2_might_wait_for_commit(journal); schedule(); finish_wait(&journal->j_wait_transaction_locked, &wait); } @@ -182,8 +183,6 @@ static int add_transaction_credits(journal_t *journal, int blocks, int needed; int total = blocks + rsv_blocks; - jbd2_might_wait_for_commit(journal); - /* * If the current transaction is locked down for commit, wait * for the lock to be released. @@ -214,6 +213,7 @@ static int add_transaction_credits(journal_t *journal, int blocks, if (atomic_read(&journal->j_reserved_credits) + total > journal->j_max_transaction_buffers) { read_unlock(&journal->j_state_lock); + jbd2_might_wait_for_commit(journal); wait_event(journal->j_wait_reserved, atomic_read(&journal->j_reserved_credits) + total <= journal->j_max_transaction_buffers); @@ -238,6 +238,7 @@ static int add_transaction_credits(journal_t *journal, int blocks, if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) { atomic_sub(total, &t->t_outstanding_credits); read_unlock(&journal->j_state_lock); + jbd2_might_wait_for_commit(journal); write_lock(&journal->j_state_lock); if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) __jbd2_log_wait_for_space(journal); @@ -255,6 +256,7 @@ static int add_transaction_credits(journal_t *journal, int blocks, sub_reserved_credits(journal, rsv_blocks); atomic_sub(total, &t->t_outstanding_credits); read_unlock(&journal->j_state_lock); + jbd2_might_wait_for_commit(journal); wait_event(journal->j_wait_reserved, atomic_read(&journal->j_reserved_credits) + rsv_blocks <= journal->j_max_transaction_buffers / 2); @@ -1147,6 +1149,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) JBUFFER_TRACE(jh, "file as BJ_Reserved"); spin_lock(&journal->j_list_lock); __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); + spin_unlock(&journal->j_list_lock); } else if (jh->b_transaction == journal->j_committing_transaction) { /* first access by this transaction */ jh->b_modified = 0; @@ -1154,8 +1157,8 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) JBUFFER_TRACE(jh, "set next transaction"); spin_lock(&journal->j_list_lock); jh->b_next_transaction = transaction; + spin_unlock(&journal->j_list_lock); } - spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); /* diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index bc2693d..7ebacf1 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -233,22 +233,21 @@ int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) case ACL_TYPE_ACCESS: xprefix = JFFS2_XPREFIX_ACL_ACCESS; if (acl) { - umode_t mode = inode->i_mode; - rc = posix_acl_equiv_mode(acl, &mode); - if (rc < 0) + umode_t mode; + + rc = posix_acl_update_mode(inode, &mode, &acl); + if (rc) return rc; if (inode->i_mode != mode) { struct iattr attr; attr.ia_valid = ATTR_MODE | ATTR_CTIME; attr.ia_mode = mode; - attr.ia_ctime = CURRENT_TIME_SEC; + attr.ia_ctime = current_time(inode); rc = jffs2_do_setattr(inode, &attr); if (rc < 0) return rc; } - if (rc == 0) - acl = NULL; } break; case ACL_TYPE_DEFAULT: diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 30eb33ff..0a754f3 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -35,7 +35,8 @@ static int jffs2_mkdir (struct inode *,struct dentry *,umode_t); static int jffs2_rmdir (struct inode *,struct dentry *); static int jffs2_mknod (struct inode *,struct dentry *,umode_t,dev_t); static int jffs2_rename (struct inode *, struct dentry *, - struct inode *, struct dentry *); + struct inode *, struct dentry *, + unsigned int); const struct file_operations jffs2_dir_operations = { @@ -61,10 +62,7 @@ const struct inode_operations jffs2_dir_inode_operations = .get_acl = jffs2_get_acl, .set_acl = jffs2_set_acl, .setattr = jffs2_setattr, - .setxattr = jffs2_setxattr, - .getxattr = jffs2_getxattr, .listxattr = jffs2_listxattr, - .removexattr = jffs2_removexattr }; /***********************************************************************/ @@ -759,7 +757,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode } static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, - struct inode *new_dir_i, struct dentry *new_dentry) + struct inode *new_dir_i, struct dentry *new_dentry, + unsigned int flags) { int ret; struct jffs2_sb_info *c = JFFS2_SB_INFO(old_dir_i->i_sb); @@ -767,6 +766,9 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, uint8_t type; uint32_t now; + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + /* The VFS will check for us and prevent trying to rename a * file over a directory and vice versa, but if it's a directory, * the VFS can't check whether the victim is empty. The filesystem diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 0e62dec..c12476e 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -66,10 +66,7 @@ const struct inode_operations jffs2_file_inode_operations = .get_acl = jffs2_get_acl, .set_acl = jffs2_set_acl, .setattr = jffs2_setattr, - .setxattr = jffs2_setxattr, - .getxattr = jffs2_getxattr, .listxattr = jffs2_listxattr, - .removexattr = jffs2_removexattr }; const struct address_space_operations jffs2_file_address_operations = diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index ae2ebb2..567653f 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -193,7 +193,7 @@ int jffs2_setattr(struct dentry *dentry, struct iattr *iattr) struct inode *inode = d_inode(dentry); int rc; - rc = inode_change_ok(inode, iattr); + rc = setattr_prepare(dentry, iattr); if (rc) return rc; @@ -472,7 +472,7 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r inode->i_mode = jemode_to_cpu(ri->mode); i_gid_write(inode, je16_to_cpu(ri->gid)); i_uid_write(inode, je16_to_cpu(ri->uid)); - inode->i_atime = inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + inode->i_atime = inode->i_ctime = inode->i_mtime = current_time(inode); ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode->i_mtime)); inode->i_blocks = 0; diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c index 2cabd64..8f3f085 100644 --- a/fs/jffs2/symlink.c +++ b/fs/jffs2/symlink.c @@ -16,8 +16,5 @@ const struct inode_operations jffs2_symlink_inode_operations = .readlink = generic_readlink, .get_link = simple_get_link, .setattr = jffs2_setattr, - .setxattr = jffs2_setxattr, - .getxattr = jffs2_getxattr, .listxattr = jffs2_listxattr, - .removexattr = jffs2_removexattr }; diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h index 467ff37..720007b 100644 --- a/fs/jffs2/xattr.h +++ b/fs/jffs2/xattr.h @@ -99,9 +99,6 @@ extern const struct xattr_handler jffs2_user_xattr_handler; extern const struct xattr_handler jffs2_trusted_xattr_handler; extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t); -#define jffs2_getxattr generic_getxattr -#define jffs2_setxattr generic_setxattr -#define jffs2_removexattr generic_removexattr #else @@ -116,9 +113,6 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t); #define jffs2_xattr_handlers NULL #define jffs2_listxattr NULL -#define jffs2_getxattr NULL -#define jffs2_setxattr NULL -#define jffs2_removexattr NULL #endif /* CONFIG_JFFS2_FS_XATTR */ diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 21fa92b..7bc186f 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -78,13 +78,11 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, case ACL_TYPE_ACCESS: ea_name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { - rc = posix_acl_equiv_mode(acl, &inode->i_mode); - if (rc < 0) + rc = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (rc) return rc; - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); mark_inode_dirty(inode); - if (rc == 0) - acl = NULL; } break; case ACL_TYPE_DEFAULT: diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 7f1a585..739492c 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -103,7 +103,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr) struct inode *inode = d_inode(dentry); int rc; - rc = inode_change_ok(inode, iattr); + rc = setattr_prepare(dentry, iattr); if (rc) return rc; @@ -140,10 +140,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr) } const struct inode_operations jfs_file_inode_operations = { - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = jfs_listxattr, - .removexattr = generic_removexattr, .setattr = jfs_setattr, #ifdef CONFIG_JFS_POSIX_ACL .get_acl = jfs_get_acl, diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index ad3e7b1..054cc76 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -403,7 +403,7 @@ void jfs_truncate_nolock(struct inode *ip, loff_t length) break; } - ip->i_mtime = ip->i_ctime = CURRENT_TIME; + ip->i_mtime = ip->i_ctime = current_time(ip); mark_inode_dirty(ip); txCommit(tid, 1, &ip, 0); diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 5e33cb9..375dd25 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -131,7 +131,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode) jfs_inode->mode2 |= inode->i_mode; inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); jfs_inode->otime = inode->i_ctime.tv_sec; inode->i_generation = JFS_SBI(sb)->gengen++; diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index 2e58978..4d973524 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -2893,8 +2893,7 @@ restart: * on anon_list2. Let's check. */ if (!list_empty(&TxAnchor.anon_list2)) { - list_splice(&TxAnchor.anon_list2, &TxAnchor.anon_list); - INIT_LIST_HEAD(&TxAnchor.anon_list2); + list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list); goto restart; } TXN_UNLOCK(); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 814b0c5..b41596d 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -162,7 +162,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode, mark_inode_dirty(ip); - dip->i_ctime = dip->i_mtime = CURRENT_TIME; + dip->i_ctime = dip->i_mtime = current_time(dip); mark_inode_dirty(dip); @@ -298,7 +298,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) /* update parent directory inode */ inc_nlink(dip); /* for '..' from child directory */ - dip->i_ctime = dip->i_mtime = CURRENT_TIME; + dip->i_ctime = dip->i_mtime = current_time(dip); mark_inode_dirty(dip); rc = txCommit(tid, 2, &iplist[0], 0); @@ -406,7 +406,7 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) /* update parent directory's link count corresponding * to ".." entry of the target directory deleted */ - dip->i_ctime = dip->i_mtime = CURRENT_TIME; + dip->i_ctime = dip->i_mtime = current_time(dip); inode_dec_link_count(dip); /* @@ -528,7 +528,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) ASSERT(ip->i_nlink); - ip->i_ctime = dip->i_ctime = dip->i_mtime = CURRENT_TIME; + ip->i_ctime = dip->i_ctime = dip->i_mtime = current_time(ip); mark_inode_dirty(dip); /* update target's inode */ @@ -838,8 +838,8 @@ static int jfs_link(struct dentry *old_dentry, /* update object inode */ inc_nlink(ip); /* for new link */ - ip->i_ctime = CURRENT_TIME; - dir->i_ctime = dir->i_mtime = CURRENT_TIME; + ip->i_ctime = current_time(ip); + dir->i_ctime = dir->i_mtime = current_time(dir); mark_inode_dirty(dir); ihold(ip); @@ -1039,7 +1039,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, mark_inode_dirty(ip); - dip->i_ctime = dip->i_mtime = CURRENT_TIME; + dip->i_ctime = dip->i_mtime = current_time(dip); mark_inode_dirty(dip); /* * commit update of parent directory and link object @@ -1078,7 +1078,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, * FUNCTION: rename a file or directory */ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct btstack btstack; ino_t ino; @@ -1097,6 +1098,8 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, s64 new_size = 0; int commit_flag; + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; jfs_info("jfs_rename: %pd %pd", old_dentry, new_dentry); @@ -1215,7 +1218,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, tblk->xflag |= COMMIT_DELETE; tblk->u.ip = new_ip; } else { - new_ip->i_ctime = CURRENT_TIME; + new_ip->i_ctime = current_time(new_ip); mark_inode_dirty(new_ip); } } else { @@ -1278,10 +1281,10 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* * Update ctime on changed/moved inodes & mark dirty */ - old_ip->i_ctime = CURRENT_TIME; + old_ip->i_ctime = current_time(old_ip); mark_inode_dirty(old_ip); - new_dir->i_ctime = new_dir->i_mtime = current_fs_time(new_dir->i_sb); + new_dir->i_ctime = new_dir->i_mtime = current_time(new_dir); mark_inode_dirty(new_dir); /* Build list of inodes modified by this transaction */ @@ -1293,7 +1296,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir != new_dir) { iplist[ipcount++] = new_dir; - old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; + old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); mark_inode_dirty(old_dir); } @@ -1426,7 +1429,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, mark_inode_dirty(ip); - dir->i_ctime = dir->i_mtime = CURRENT_TIME; + dir->i_ctime = dir->i_mtime = current_time(dir); mark_inode_dirty(dir); @@ -1537,10 +1540,7 @@ const struct inode_operations jfs_dir_inode_operations = { .rmdir = jfs_rmdir, .mknod = jfs_mknod, .rename = jfs_rename, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = jfs_listxattr, - .removexattr = generic_removexattr, .setattr = jfs_setattr, #ifdef CONFIG_JFS_POSIX_ACL .get_acl = jfs_get_acl, diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c index 90b3bc2..bd9b641 100644 --- a/fs/jfs/resize.c +++ b/fs/jfs/resize.c @@ -379,8 +379,14 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) * cached in meta-data cache, and not written out * by txCommit(); */ - filemap_fdatawait(ipbmap->i_mapping); - filemap_write_and_wait(ipbmap->i_mapping); + rc = filemap_fdatawait(ipbmap->i_mapping); + if (rc) + goto error_out; + + rc = filemap_write_and_wait(ipbmap->i_mapping); + if (rc) + goto error_out; + diWriteSpecial(ipbmap, 0); newPage = nPages; /* first new page number */ diff --git a/fs/jfs/super.c b/fs/jfs/super.c index cec8814..85671f7 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -830,7 +830,7 @@ out: if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); inode->i_version++; - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); mark_inode_dirty(inode); inode_unlock(inode); return len - towrite; diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c index c94c7e4..c82404f 100644 --- a/fs/jfs/symlink.c +++ b/fs/jfs/symlink.c @@ -25,19 +25,13 @@ const struct inode_operations jfs_fast_symlink_inode_operations = { .readlink = generic_readlink, .get_link = simple_get_link, .setattr = jfs_setattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = jfs_listxattr, - .removexattr = generic_removexattr, }; const struct inode_operations jfs_symlink_inode_operations = { .readlink = generic_readlink, .get_link = page_get_link, .setattr = jfs_setattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = jfs_listxattr, - .removexattr = generic_removexattr, }; diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 0bf3c33..c60f3d3 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -658,7 +658,7 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf, if (old_blocks) dquot_free_block(inode, old_blocks); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); return 0; } diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index e57174d..cf4c636 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -110,8 +110,9 @@ static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a, * kn_to: /n1/n2/n3 [depth=3] * result: /../.. * - * return value: length of the string. If greater than buflen, - * then contents of buf are undefined. On error, -1 is returned. + * Returns the length of the full path. If the full length is equal to or + * greater than @buflen, @buf contains the truncated path with the trailing + * '\0'. On error, -errno is returned. */ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, struct kernfs_node *kn_from, @@ -119,9 +120,8 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, { struct kernfs_node *kn, *common; const char parent_str[] = "/.."; - size_t depth_from, depth_to, len = 0, nlen = 0; - char *p; - int i; + size_t depth_from, depth_to, len = 0; + int i, j; if (!kn_from) kn_from = kernfs_root(kn_to)->kn; @@ -131,7 +131,7 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, common = kernfs_common_ancestor(kn_from, kn_to); if (WARN_ON(!common)) - return -1; + return -EINVAL; depth_to = kernfs_depth(common, kn_to); depth_from = kernfs_depth(common, kn_from); @@ -144,22 +144,16 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, len < buflen ? buflen - len : 0); /* Calculate how many bytes we need for the rest */ - for (kn = kn_to; kn != common; kn = kn->parent) - nlen += strlen(kn->name) + 1; - - if (len + nlen >= buflen) - return len + nlen; - - p = buf + len + nlen; - *p = '\0'; - for (kn = kn_to; kn != common; kn = kn->parent) { - size_t tmp = strlen(kn->name); - p -= tmp; - memcpy(p, kn->name, tmp); - *(--p) = '/'; + for (i = depth_to - 1; i >= 0; i--) { + for (kn = kn_to, j = 0; j < i; j++) + kn = kn->parent; + len += strlcpy(buf + len, "/", + len < buflen ? buflen - len : 0); + len += strlcpy(buf + len, kn->name, + len < buflen ? buflen - len : 0); } - return len + nlen; + return len; } /** @@ -186,29 +180,6 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) } /** - * kernfs_path_len - determine the length of the full path of a given node - * @kn: kernfs_node of interest - * - * The returned length doesn't include the space for the terminating '\0'. - */ -size_t kernfs_path_len(struct kernfs_node *kn) -{ - size_t len = 0; - unsigned long flags; - - spin_lock_irqsave(&kernfs_rename_lock, flags); - - do { - len += strlen(kn->name) + 1; - kn = kn->parent; - } while (kn && kn->parent); - - spin_unlock_irqrestore(&kernfs_rename_lock, flags); - - return len; -} - -/** * kernfs_path_from_node - build path of node @to relative to @from. * @from: parent kernfs_node relative to which we need to build the path * @to: kernfs_node of interest @@ -220,8 +191,9 @@ size_t kernfs_path_len(struct kernfs_node *kn) * path (which includes '..'s) as needed to reach from @from to @to is * returned. * - * If @buf isn't long enough, the return value will be greater than @buflen - * and @buf contents are undefined. + * Returns the length of the full path. If the full length is equal to or + * greater than @buflen, @buf contains the truncated path with the trailing + * '\0'. On error, -errno is returned. */ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from, char *buf, size_t buflen) @@ -237,28 +209,6 @@ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from, EXPORT_SYMBOL_GPL(kernfs_path_from_node); /** - * kernfs_path - build full path of a given node - * @kn: kernfs_node of interest - * @buf: buffer to copy @kn's name into - * @buflen: size of @buf - * - * Builds and returns the full path of @kn in @buf of @buflen bytes. The - * path is built from the end of @buf so the returned pointer usually - * doesn't match @buf. If @buf isn't long enough, @buf is nul terminated - * and %NULL is returned. - */ -char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen) -{ - int ret; - - ret = kernfs_path_from_node(kn, NULL, buf, buflen); - if (ret < 0 || ret >= buflen) - return NULL; - return buf; -} -EXPORT_SYMBOL_GPL(kernfs_path); - -/** * pr_cont_kernfs_name - pr_cont name of a kernfs_node * @kn: kernfs_node of interest * @@ -1096,13 +1046,17 @@ static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) } static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct kernfs_node *kn = old_dentry->d_fsdata; struct kernfs_node *new_parent = new_dir->i_private; struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; int ret; + if (flags) + return -EINVAL; + if (!scops || !scops->rename) return -EPERM; @@ -1126,9 +1080,6 @@ const struct inode_operations kernfs_dir_iops = { .permission = kernfs_iop_permission, .setattr = kernfs_iop_setattr, .getattr = kernfs_iop_getattr, - .setxattr = kernfs_iop_setxattr, - .removexattr = kernfs_iop_removexattr, - .getxattr = kernfs_iop_getxattr, .listxattr = kernfs_iop_listxattr, .mkdir = kernfs_iop_mkdir, diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 63b925d..a198211 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -28,9 +28,6 @@ static const struct inode_operations kernfs_iops = { .permission = kernfs_iop_permission, .setattr = kernfs_iop_setattr, .getattr = kernfs_iop_getattr, - .setxattr = kernfs_iop_setxattr, - .removexattr = kernfs_iop_removexattr, - .getxattr = kernfs_iop_getxattr, .listxattr = kernfs_iop_listxattr, }; @@ -122,7 +119,7 @@ int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr) return -EINVAL; mutex_lock(&kernfs_mutex); - error = inode_change_ok(inode, iattr); + error = setattr_prepare(dentry, iattr); if (error) goto out; @@ -138,17 +135,12 @@ out: return error; } -static int kernfs_node_setsecdata(struct kernfs_node *kn, void **secdata, +static int kernfs_node_setsecdata(struct kernfs_iattrs *attrs, void **secdata, u32 *secdata_len) { - struct kernfs_iattrs *attrs; void *old_secdata; size_t old_secdata_len; - attrs = kernfs_iattrs(kn); - if (!attrs) - return -ENOMEM; - old_secdata = attrs->ia_secdata; old_secdata_len = attrs->ia_secdata_len; @@ -160,71 +152,6 @@ static int kernfs_node_setsecdata(struct kernfs_node *kn, void **secdata, return 0; } -int kernfs_iop_setxattr(struct dentry *unused, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) -{ - struct kernfs_node *kn = inode->i_private; - struct kernfs_iattrs *attrs; - void *secdata; - int error; - u32 secdata_len = 0; - - attrs = kernfs_iattrs(kn); - if (!attrs) - return -ENOMEM; - - if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { - const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; - error = security_inode_setsecurity(inode, suffix, - value, size, flags); - if (error) - return error; - error = security_inode_getsecctx(inode, - &secdata, &secdata_len); - if (error) - return error; - - mutex_lock(&kernfs_mutex); - error = kernfs_node_setsecdata(kn, &secdata, &secdata_len); - mutex_unlock(&kernfs_mutex); - - if (secdata) - security_release_secctx(secdata, secdata_len); - return error; - } else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) { - return simple_xattr_set(&attrs->xattrs, name, value, size, - flags); - } - - return -EINVAL; -} - -int kernfs_iop_removexattr(struct dentry *dentry, const char *name) -{ - struct kernfs_node *kn = dentry->d_fsdata; - struct kernfs_iattrs *attrs; - - attrs = kernfs_iattrs(kn); - if (!attrs) - return -ENOMEM; - - return simple_xattr_set(&attrs->xattrs, name, NULL, 0, XATTR_REPLACE); -} - -ssize_t kernfs_iop_getxattr(struct dentry *unused, struct inode *inode, - const char *name, void *buf, size_t size) -{ - struct kernfs_node *kn = inode->i_private; - struct kernfs_iattrs *attrs; - - attrs = kernfs_iattrs(kn); - if (!attrs) - return -ENOMEM; - - return simple_xattr_get(&attrs->xattrs, name, buf, size); -} - ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) { struct kernfs_node *kn = dentry->d_fsdata; @@ -241,7 +168,7 @@ static inline void set_default_inode_attr(struct inode *inode, umode_t mode) { inode->i_mode = mode; inode->i_atime = inode->i_mtime = - inode->i_ctime = current_fs_time(inode->i_sb); + inode->i_ctime = current_time(inode); } static inline void set_inode_attr(struct inode *inode, struct iattr *iattr) @@ -376,3 +303,83 @@ int kernfs_iop_permission(struct inode *inode, int mask) return generic_permission(inode, mask); } + +static int kernfs_xattr_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *suffix, void *value, size_t size) +{ + const char *name = xattr_full_name(handler, suffix); + struct kernfs_node *kn = inode->i_private; + struct kernfs_iattrs *attrs; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + return simple_xattr_get(&attrs->xattrs, name, value, size); +} + +static int kernfs_xattr_set(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *suffix, const void *value, + size_t size, int flags) +{ + const char *name = xattr_full_name(handler, suffix); + struct kernfs_node *kn = inode->i_private; + struct kernfs_iattrs *attrs; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + return simple_xattr_set(&attrs->xattrs, name, value, size, flags); +} + +const struct xattr_handler kernfs_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .get = kernfs_xattr_get, + .set = kernfs_xattr_set, +}; + +static int kernfs_security_xattr_set(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *suffix, const void *value, + size_t size, int flags) +{ + struct kernfs_node *kn = inode->i_private; + struct kernfs_iattrs *attrs; + void *secdata; + u32 secdata_len = 0; + int error; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + error = security_inode_setsecurity(inode, suffix, value, size, flags); + if (error) + return error; + error = security_inode_getsecctx(inode, &secdata, &secdata_len); + if (error) + return error; + + mutex_lock(&kernfs_mutex); + error = kernfs_node_setsecdata(attrs, &secdata, &secdata_len); + mutex_unlock(&kernfs_mutex); + + if (secdata) + security_release_secctx(secdata, secdata_len); + return error; +} + +const struct xattr_handler kernfs_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = kernfs_xattr_get, + .set = kernfs_security_xattr_set, +}; + +const struct xattr_handler *kernfs_xattr_handlers[] = { + &kernfs_trusted_xattr_handler, + &kernfs_security_xattr_handler, + NULL +}; diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 3715923..bfd551b 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -76,17 +76,12 @@ extern struct kmem_cache *kernfs_node_cache; /* * inode.c */ +extern const struct xattr_handler *kernfs_xattr_handlers[]; void kernfs_evict_inode(struct inode *inode); int kernfs_iop_permission(struct inode *inode, int mask); int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr); int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); -int kernfs_iop_setxattr(struct dentry *dentry, struct inode *inode, - const char *name, const void *value, - size_t size, int flags); -int kernfs_iop_removexattr(struct dentry *dentry, const char *name); -ssize_t kernfs_iop_getxattr(struct dentry *dentry, struct inode *inode, - const char *name, void *buf, size_t size); ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); /* diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index b3d73ad..d5b149a 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -158,6 +158,7 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = magic; sb->s_op = &kernfs_sops; + sb->s_xattr = kernfs_xattr_handlers; sb->s_time_gran = 1; /* get root inode, initialize and unlock it */ diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 117b8b3..9b43ca0 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -134,9 +134,6 @@ static const char *kernfs_iop_get_link(struct dentry *dentry, } const struct inode_operations kernfs_symlink_iops = { - .setxattr = kernfs_iop_setxattr, - .removexattr = kernfs_iop_removexattr, - .getxattr = kernfs_iop_getxattr, .listxattr = kernfs_iop_listxattr, .readlink = generic_readlink, .get_link = kernfs_iop_get_link, @@ -236,8 +236,8 @@ static const struct super_operations simple_super_operations = { * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that * will never be mountable) */ -struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name, - const struct super_operations *ops, +struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name, + const struct super_operations *ops, const struct xattr_handler **xattr, const struct dentry_operations *dops, unsigned long magic) { struct super_block *s; @@ -254,6 +254,7 @@ struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name, s->s_blocksize_bits = PAGE_SHIFT; s->s_magic = magic; s->s_op = ops ? ops : &simple_super_operations; + s->s_xattr = xattr; s->s_time_gran = 1; root = new_inode(s); if (!root) @@ -265,7 +266,7 @@ struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name, */ root->i_ino = 1; root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; - root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME; + root->i_atime = root->i_mtime = root->i_ctime = current_time(root); dentry = __d_alloc(s, &d_name); if (!dentry) { iput(root); @@ -281,7 +282,7 @@ Enomem: deactivate_locked_super(s); return ERR_PTR(-ENOMEM); } -EXPORT_SYMBOL(mount_pseudo); +EXPORT_SYMBOL(mount_pseudo_xattr); int simple_open(struct inode *inode, struct file *file) { @@ -295,7 +296,7 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den { struct inode *inode = d_inode(old_dentry); - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); inc_nlink(inode); ihold(inode); dget(dentry); @@ -329,7 +330,7 @@ int simple_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); drop_nlink(inode); dput(dentry); return 0; @@ -349,11 +350,15 @@ int simple_rmdir(struct inode *dir, struct dentry *dentry) EXPORT_SYMBOL(simple_rmdir); int simple_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct inode *inode = d_inode(old_dentry); int they_are_dirs = d_is_dir(old_dentry); + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + if (!simple_empty(new_dentry)) return -ENOTEMPTY; @@ -369,7 +374,7 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry, } old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime = - new_dir->i_mtime = inode->i_ctime = CURRENT_TIME; + new_dir->i_mtime = inode->i_ctime = current_time(old_dir); return 0; } @@ -394,7 +399,7 @@ int simple_setattr(struct dentry *dentry, struct iattr *iattr) struct inode *inode = d_inode(dentry); int error; - error = inode_change_ok(inode, iattr); + error = setattr_prepare(dentry, iattr); if (error) return error; @@ -520,7 +525,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic, */ inode->i_ino = 1; inode->i_mode = S_IFDIR | 0755; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; set_nlink(inode, 2); @@ -546,7 +551,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic, goto out; } inode->i_mode = S_IFREG | files->mode; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inode->i_fop = files->ops; inode->i_ino = i; d_add(dentry, inode); @@ -1092,7 +1097,7 @@ struct inode *alloc_anon_inode(struct super_block *s) inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_flags |= S_PRIVATE; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); return inode; } EXPORT_SYMBOL(alloc_anon_inode); @@ -1149,24 +1154,6 @@ static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr) return -EPERM; } -static int empty_dir_setxattr(struct dentry *dentry, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) -{ - return -EOPNOTSUPP; -} - -static ssize_t empty_dir_getxattr(struct dentry *dentry, struct inode *inode, - const char *name, void *value, size_t size) -{ - return -EOPNOTSUPP; -} - -static int empty_dir_removexattr(struct dentry *dentry, const char *name) -{ - return -EOPNOTSUPP; -} - static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t size) { return -EOPNOTSUPP; @@ -1177,9 +1164,6 @@ static const struct inode_operations empty_dir_inode_operations = { .permission = generic_permission, .setattr = empty_dir_setattr, .getattr = empty_dir_getattr, - .setxattr = empty_dir_setxattr, - .getxattr = empty_dir_getxattr, - .removexattr = empty_dir_removexattr, .listxattr = empty_dir_listxattr, }; @@ -1215,6 +1199,7 @@ void make_empty_dir_inode(struct inode *inode) inode->i_blocks = 0; inode->i_op = &empty_dir_inode_operations; + inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &empty_dir_operations; } diff --git a/fs/lockd/procfs.h b/fs/lockd/procfs.h index 2257a13..184a15e 100644 --- a/fs/lockd/procfs.h +++ b/fs/lockd/procfs.h @@ -6,8 +6,6 @@ #ifndef _LOCKD_PROCFS_H #define _LOCKD_PROCFS_H -#include <linux/kconfig.h> - #if IS_ENABLED(CONFIG_PROC_FS) int lockd_create_procfs(void); void lockd_remove_procfs(void); @@ -127,7 +127,6 @@ #include <linux/pid_namespace.h> #include <linux/hashtable.h> #include <linux/percpu.h> -#include <linux/lglock.h> #define CREATE_TRACE_POINTS #include <trace/events/filelock.h> @@ -139,6 +138,11 @@ #define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) #define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) +static inline bool is_remote_lock(struct file *filp) +{ + return likely(!(filp->f_path.dentry->d_sb->s_flags & MS_NOREMOTELOCK)); +} + static bool lease_breaking(struct file_lock *fl) { return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING); @@ -158,12 +162,18 @@ int lease_break_time = 45; /* * The global file_lock_list is only used for displaying /proc/locks, so we - * keep a list on each CPU, with each list protected by its own spinlock via - * the file_lock_lglock. Note that alterations to the list also require that - * the relevant flc_lock is held. + * keep a list on each CPU, with each list protected by its own spinlock. + * Global serialization is done using file_rwsem. + * + * Note that alterations to the list also require that the relevant flc_lock is + * held. */ -DEFINE_STATIC_LGLOCK(file_lock_lglock); -static DEFINE_PER_CPU(struct hlist_head, file_lock_list); +struct file_lock_list_struct { + spinlock_t lock; + struct hlist_head hlist; +}; +static DEFINE_PER_CPU(struct file_lock_list_struct, file_lock_list); +DEFINE_STATIC_PERCPU_RWSEM(file_rwsem); /* * The blocked_hash is used to find POSIX lock loops for deadlock detection. @@ -587,15 +597,23 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) /* Must be called with the flc_lock held! */ static void locks_insert_global_locks(struct file_lock *fl) { - lg_local_lock(&file_lock_lglock); + struct file_lock_list_struct *fll = this_cpu_ptr(&file_lock_list); + + percpu_rwsem_assert_held(&file_rwsem); + + spin_lock(&fll->lock); fl->fl_link_cpu = smp_processor_id(); - hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list)); - lg_local_unlock(&file_lock_lglock); + hlist_add_head(&fl->fl_link, &fll->hlist); + spin_unlock(&fll->lock); } /* Must be called with the flc_lock held! */ static void locks_delete_global_locks(struct file_lock *fl) { + struct file_lock_list_struct *fll; + + percpu_rwsem_assert_held(&file_rwsem); + /* * Avoid taking lock if already unhashed. This is safe since this check * is done while holding the flc_lock, and new insertions into the list @@ -603,9 +621,11 @@ static void locks_delete_global_locks(struct file_lock *fl) */ if (hlist_unhashed(&fl->fl_link)) return; - lg_local_lock_cpu(&file_lock_lglock, fl->fl_link_cpu); + + fll = per_cpu_ptr(&file_lock_list, fl->fl_link_cpu); + spin_lock(&fll->lock); hlist_del_init(&fl->fl_link); - lg_local_unlock_cpu(&file_lock_lglock, fl->fl_link_cpu); + spin_unlock(&fll->lock); } static unsigned long @@ -791,7 +811,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) { struct file_lock *cfl; struct file_lock_context *ctx; - struct inode *inode = file_inode(filp); + struct inode *inode = locks_inode(filp); ctx = smp_load_acquire(&inode->i_flctx); if (!ctx || list_empty_careful(&ctx->flc_posix)) { @@ -915,6 +935,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request) return -ENOMEM; } + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); if (request->fl_flags & FL_ACCESS) goto find_conflict; @@ -955,6 +976,7 @@ find_conflict: out: spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); if (new_fl) locks_free_lock(new_fl); locks_dispose_list(&dispose); @@ -991,6 +1013,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, new_fl2 = locks_alloc_lock(); } + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); /* * New lock request. Walk all POSIX locks and look for conflicts. If @@ -1162,6 +1185,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, } out: spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); /* * Free any unused locks. */ @@ -1192,7 +1216,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, int posix_lock_file(struct file *filp, struct file_lock *fl, struct file_lock *conflock) { - return posix_lock_inode(file_inode(filp), fl, conflock); + return posix_lock_inode(locks_inode(filp), fl, conflock); } EXPORT_SYMBOL(posix_lock_file); @@ -1232,7 +1256,7 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) int locks_mandatory_locked(struct file *file) { int ret; - struct inode *inode = file_inode(file); + struct inode *inode = locks_inode(file); struct file_lock_context *ctx; struct file_lock *fl; @@ -1436,6 +1460,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) return error; } + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); @@ -1487,9 +1512,13 @@ restart: locks_insert_block(fl, new_fl); trace_break_lease_block(inode, new_fl); spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); + locks_dispose_list(&dispose); error = wait_event_interruptible_timeout(new_fl->fl_wait, !new_fl->fl_next, break_time); + + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); trace_break_lease_unblock(inode, new_fl); locks_delete_block(new_fl); @@ -1506,6 +1535,7 @@ restart: } out: spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); locks_dispose_list(&dispose); locks_free_lock(new_fl); return error; @@ -1539,7 +1569,7 @@ void lease_get_mtime(struct inode *inode, struct timespec *time) } if (has_lease) - *time = current_fs_time(inode->i_sb); + *time = current_time(inode); else *time = inode->i_mtime; } @@ -1572,15 +1602,16 @@ EXPORT_SYMBOL(lease_get_mtime); int fcntl_getlease(struct file *filp) { struct file_lock *fl; - struct inode *inode = file_inode(filp); + struct inode *inode = locks_inode(filp); struct file_lock_context *ctx; int type = F_UNLCK; LIST_HEAD(dispose); ctx = smp_load_acquire(&inode->i_flctx); if (ctx && !list_empty_careful(&ctx->flc_lease)) { + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); - time_out_leases(file_inode(filp), &dispose); + time_out_leases(inode, &dispose); list_for_each_entry(fl, &ctx->flc_lease, fl_list) { if (fl->fl_file != filp) continue; @@ -1588,6 +1619,8 @@ int fcntl_getlease(struct file *filp) break; } spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); + locks_dispose_list(&dispose); } return type; @@ -1613,7 +1646,8 @@ check_conflicting_open(const struct dentry *dentry, const long arg, int flags) if (flags & FL_LAYOUT) return 0; - if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) + if ((arg == F_RDLCK) && + (atomic_read(&d_real_inode(dentry)->i_writecount) > 0)) return -EAGAIN; if ((arg == F_WRLCK) && ((d_count(dentry) > 1) || @@ -1628,7 +1662,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr { struct file_lock *fl, *my_fl = NULL, *lease; struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = file_inode(filp); + struct inode *inode = dentry->d_inode; struct file_lock_context *ctx; bool is_deleg = (*flp)->fl_flags & FL_DELEG; int error; @@ -1660,6 +1694,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr return -EINVAL; } + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); error = check_conflicting_open(dentry, arg, lease->fl_flags); @@ -1730,6 +1765,7 @@ out_setup: lease->fl_lmops->lm_setup(lease, priv); out: spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); locks_dispose_list(&dispose); if (is_deleg) inode_unlock(inode); @@ -1742,7 +1778,7 @@ static int generic_delete_lease(struct file *filp, void *owner) { int error = -EAGAIN; struct file_lock *fl, *victim = NULL; - struct inode *inode = file_inode(filp); + struct inode *inode = locks_inode(filp); struct file_lock_context *ctx; LIST_HEAD(dispose); @@ -1752,6 +1788,7 @@ static int generic_delete_lease(struct file *filp, void *owner) return error; } + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); list_for_each_entry(fl, &ctx->flc_lease, fl_list) { if (fl->fl_file == filp && @@ -1764,6 +1801,7 @@ static int generic_delete_lease(struct file *filp, void *owner) if (victim) error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose); spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); locks_dispose_list(&dispose); return error; } @@ -1782,7 +1820,7 @@ static int generic_delete_lease(struct file *filp, void *owner) int generic_setlease(struct file *filp, long arg, struct file_lock **flp, void **priv) { - struct inode *inode = file_inode(filp); + struct inode *inode = locks_inode(filp); int error; if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE)) @@ -1830,7 +1868,7 @@ EXPORT_SYMBOL(generic_setlease); int vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv) { - if (filp->f_op->setlease) + if (filp->f_op->setlease && is_remote_lock(filp)) return filp->f_op->setlease(filp, arg, lease, priv); else return generic_setlease(filp, arg, lease, priv); @@ -1979,7 +2017,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) if (error) goto out_free; - if (f.file->f_op->flock) + if (f.file->f_op->flock && is_remote_lock(f.file)) error = f.file->f_op->flock(f.file, (can_sleep) ? F_SETLKW : F_SETLK, lock); @@ -2005,7 +2043,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) */ int vfs_test_lock(struct file *filp, struct file_lock *fl) { - if (filp->f_op->lock) + if (filp->f_op->lock && is_remote_lock(filp)) return filp->f_op->lock(filp, F_GETLK, fl); posix_test_lock(filp, fl); return 0; @@ -2129,7 +2167,7 @@ out: */ int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) { - if (filp->f_op->lock) + if (filp->f_op->lock && is_remote_lock(filp)) return filp->f_op->lock(filp, cmd, fl); else return posix_lock_file(filp, fl, conf); @@ -2191,7 +2229,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, if (file_lock == NULL) return -ENOLCK; - inode = file_inode(filp); + inode = locks_inode(filp); /* * This might block, so we do it before checking the inode. @@ -2343,7 +2381,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, if (copy_from_user(&flock, l, sizeof(flock))) goto out; - inode = file_inode(filp); + inode = locks_inode(filp); /* Don't allow mandatory locks on files that may be memory mapped * and shared. @@ -2426,6 +2464,7 @@ out: void locks_remove_posix(struct file *filp, fl_owner_t owner) { int error; + struct inode *inode = locks_inode(filp); struct file_lock lock; struct file_lock_context *ctx; @@ -2434,7 +2473,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) * posix_lock_file(). Another process could be setting a lock on this * file at the same time, but we wouldn't remove that lock anyway. */ - ctx = smp_load_acquire(&file_inode(filp)->i_flctx); + ctx = smp_load_acquire(&inode->i_flctx); if (!ctx || list_empty(&ctx->flc_posix)) return; @@ -2452,7 +2491,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) if (lock.fl_ops && lock.fl_ops->fl_release_private) lock.fl_ops->fl_release_private(&lock); - trace_locks_remove_posix(file_inode(filp), &lock, error); + trace_locks_remove_posix(inode, &lock, error); } EXPORT_SYMBOL(locks_remove_posix); @@ -2469,12 +2508,12 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx) .fl_type = F_UNLCK, .fl_end = OFFSET_MAX, }; - struct inode *inode = file_inode(filp); + struct inode *inode = locks_inode(filp); if (list_empty(&flctx->flc_flock)) return; - if (filp->f_op->flock) + if (filp->f_op->flock && is_remote_lock(filp)) filp->f_op->flock(filp, F_SETLKW, &fl); else flock_lock_inode(inode, &fl); @@ -2493,11 +2532,14 @@ locks_remove_lease(struct file *filp, struct file_lock_context *ctx) if (list_empty(&ctx->flc_lease)) return; + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) if (filp == fl->fl_file) lease_modify(fl, F_UNLCK, &dispose); spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); + locks_dispose_list(&dispose); } @@ -2508,7 +2550,7 @@ void locks_remove_file(struct file *filp) { struct file_lock_context *ctx; - ctx = smp_load_acquire(&file_inode(filp)->i_flctx); + ctx = smp_load_acquire(&locks_inode(filp)->i_flctx); if (!ctx) return; @@ -2552,7 +2594,7 @@ EXPORT_SYMBOL(posix_unblock_lock); */ int vfs_cancel_lock(struct file *filp, struct file_lock *fl) { - if (filp->f_op->lock) + if (filp->f_op->lock && is_remote_lock(filp)) return filp->f_op->lock(filp, F_CANCELLK, fl); return 0; } @@ -2574,13 +2616,24 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, struct inode *inode = NULL; unsigned int fl_pid; - if (fl->fl_nspid) - fl_pid = pid_vnr(fl->fl_nspid); - else + if (fl->fl_nspid) { + struct pid_namespace *proc_pidns = file_inode(f->file)->i_sb->s_fs_info; + + /* Don't let fl_pid change based on who is reading the file */ + fl_pid = pid_nr_ns(fl->fl_nspid, proc_pidns); + + /* + * If there isn't a fl_pid don't display who is waiting on + * the lock if we are called from locks_show, or if we are + * called from __show_fd_info - skip lock entirely + */ + if (fl_pid == 0) + return; + } else fl_pid = fl->fl_pid; if (fl->fl_file != NULL) - inode = file_inode(fl->fl_file); + inode = locks_inode(fl->fl_file); seq_printf(f, "%lld:%s ", id, pfx); if (IS_POSIX(fl)) { @@ -2648,9 +2701,13 @@ static int locks_show(struct seq_file *f, void *v) { struct locks_iterator *iter = f->private; struct file_lock *fl, *bfl; + struct pid_namespace *proc_pidns = file_inode(f->file)->i_sb->s_fs_info; fl = hlist_entry(v, struct file_lock, fl_link); + if (fl->fl_nspid && !pid_nr_ns(fl->fl_nspid, proc_pidns)) + return 0; + lock_get_status(f, fl, iter->li_pos, ""); list_for_each_entry(bfl, &fl->fl_block, fl_block) @@ -2682,7 +2739,7 @@ static void __show_fd_locks(struct seq_file *f, void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files) { - struct inode *inode = file_inode(filp); + struct inode *inode = locks_inode(filp); struct file_lock_context *ctx; int id = 0; @@ -2703,9 +2760,9 @@ static void *locks_start(struct seq_file *f, loff_t *pos) struct locks_iterator *iter = f->private; iter->li_pos = *pos + 1; - lg_global_lock(&file_lock_lglock); + percpu_down_write(&file_rwsem); spin_lock(&blocked_lock_lock); - return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos); + return seq_hlist_start_percpu(&file_lock_list.hlist, &iter->li_cpu, *pos); } static void *locks_next(struct seq_file *f, void *v, loff_t *pos) @@ -2713,14 +2770,14 @@ static void *locks_next(struct seq_file *f, void *v, loff_t *pos) struct locks_iterator *iter = f->private; ++iter->li_pos; - return seq_hlist_next_percpu(v, &file_lock_list, &iter->li_cpu, pos); + return seq_hlist_next_percpu(v, &file_lock_list.hlist, &iter->li_cpu, pos); } static void locks_stop(struct seq_file *f, void *v) __releases(&blocked_lock_lock) { spin_unlock(&blocked_lock_lock); - lg_global_unlock(&file_lock_lglock); + percpu_up_write(&file_rwsem); } static const struct seq_operations locks_seq_operations = { @@ -2761,10 +2818,13 @@ static int __init filelock_init(void) filelock_cache = kmem_cache_create("file_lock_cache", sizeof(struct file_lock), 0, SLAB_PANIC, NULL); - lg_lock_init(&file_lock_lglock, "file_lock_lglock"); - for_each_possible_cpu(i) - INIT_HLIST_HEAD(per_cpu_ptr(&file_lock_list, i)); + for_each_possible_cpu(i) { + struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i); + + spin_lock_init(&fll->lock); + INIT_HLIST_HEAD(&fll->hlist); + } return 0; } diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 9568064..c87ea52 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -226,7 +226,7 @@ static int logfs_unlink(struct inode *dir, struct dentry *dentry) ta->state = UNLINK_1; ta->ino = inode->i_ino; - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); page = logfs_get_dd_page(dir, dentry); if (!page) { @@ -540,7 +540,7 @@ static int logfs_link(struct dentry *old_dentry, struct inode *dir, { struct inode *inode = d_inode(old_dentry); - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); ihold(inode); inc_nlink(inode); mark_inode_dirty_sync(inode); @@ -573,7 +573,7 @@ static int logfs_delete_dd(struct inode *dir, loff_t pos) * (crc-protected) journal. */ BUG_ON(beyond_eof(dir, pos)); - dir->i_ctime = dir->i_mtime = CURRENT_TIME; + dir->i_ctime = dir->i_mtime = current_time(dir); log_dir(" Delete dentry (%lx, %llx)\n", dir->i_ino, pos); return logfs_delete(dir, pos, NULL); } @@ -718,8 +718,12 @@ out: } static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + if (d_really_is_positive(new_dentry)) return logfs_rename_target(old_dir, old_dentry, new_dir, new_dentry); diff --git a/fs/logfs/file.c b/fs/logfs/file.c index f01ddfb..1db0493 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c @@ -211,7 +211,7 @@ long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) li->li_flags = flags; inode_unlock(inode); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); mark_inode_dirty_sync(inode); return 0; @@ -244,7 +244,7 @@ static int logfs_setattr(struct dentry *dentry, struct iattr *attr) struct inode *inode = d_inode(dentry); int err = 0; - err = inode_change_ok(inode, attr); + err = setattr_prepare(dentry, attr); if (err) return err; diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index db9cfc5..f440a15 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -213,8 +213,8 @@ static void logfs_init_inode(struct super_block *sb, struct inode *inode) i_gid_write(inode, 0); inode->i_size = 0; inode->i_blocks = 0; - inode->i_ctime = CURRENT_TIME; - inode->i_mtime = CURRENT_TIME; + inode->i_ctime = current_time(inode); + inode->i_mtime = current_time(inode); li->li_refcount = 1; INIT_LIST_HEAD(&li->li_freeing_list); diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 3fb8c6d..bf19bf4 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -1546,7 +1546,7 @@ static int __logfs_write_buf(struct inode *inode, struct page *page, long flags) int err; flags |= WF_WRITE | WF_DELETE; - inode->i_ctime = inode->i_mtime = CURRENT_TIME; + inode->i_ctime = inode->i_mtime = current_time(inode); logfs_unpack_index(index, &bix, &level); if (logfs_block(page) && logfs_block(page)->reserved_bytes) @@ -1578,7 +1578,7 @@ static int __logfs_delete(struct inode *inode, struct page *page) long flags = WF_DELETE; int err; - inode->i_ctime = inode->i_mtime = CURRENT_TIME; + inode->i_ctime = inode->i_mtime = current_time(inode); if (page->index < I0_BLOCKS) return logfs_write_direct(inode, page, flags); diff --git a/fs/mbcache.c b/fs/mbcache.c index eccda3a..c5bd19f 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -366,7 +366,11 @@ struct mb_cache *mb_cache_create(int bucket_bits) cache->c_shrink.count_objects = mb_cache_count; cache->c_shrink.scan_objects = mb_cache_scan; cache->c_shrink.seeks = DEFAULT_SEEKS; - register_shrinker(&cache->c_shrink); + if (register_shrinker(&cache->c_shrink)) { + kfree(cache->c_hash); + kfree(cache); + goto err_out; + } INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker); diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index 742942a..c2c3fd3 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -253,7 +253,7 @@ struct inode *minix_new_inode(const struct inode *dir, umode_t mode, int *error) } inode_init_owner(inode, dir, mode); inode->i_ino = j; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_blocks = 0; memset(&minix_i(inode)->u, 0, sizeof(minix_i(inode)->u)); insert_inode_hash(inode); diff --git a/fs/minix/dir.c b/fs/minix/dir.c index 31dcd51..7edc9b3 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -274,7 +274,7 @@ got_it: de->inode = inode->i_ino; } err = dir_commit_chunk(page, pos, sbi->s_dirsize); - dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + dir->i_mtime = dir->i_ctime = current_time(dir); mark_inode_dirty(dir); out_put: dir_put_page(page); @@ -306,7 +306,7 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page) unlock_page(page); } dir_put_page(page); - inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + inode->i_ctime = inode->i_mtime = current_time(inode); mark_inode_dirty(inode); return err; } @@ -430,7 +430,7 @@ void minix_set_link(struct minix_dir_entry *de, struct page *page, unlock_page(page); } dir_put_page(page); - dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + dir->i_mtime = dir->i_ctime = current_time(dir); mark_inode_dirty(dir); } diff --git a/fs/minix/file.c b/fs/minix/file.c index 94f0eb9a..a6a4797 100644 --- a/fs/minix/file.c +++ b/fs/minix/file.c @@ -26,7 +26,7 @@ static int minix_setattr(struct dentry *dentry, struct iattr *attr) struct inode *inode = d_inode(dentry); int error; - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); if (error) return error; diff --git a/fs/minix/itree_common.c b/fs/minix/itree_common.c index a731cab..4c57c9a 100644 --- a/fs/minix/itree_common.c +++ b/fs/minix/itree_common.c @@ -124,7 +124,7 @@ static inline int splice_branch(struct inode *inode, /* We are done with atomic stuff, now do the rest of housekeeping */ - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); /* had we spliced it onto indirect block? */ if (where->bh) @@ -343,7 +343,7 @@ do_indirects: } first_whole++; } - inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_mtime = inode->i_ctime = current_time(inode); mark_inode_dirty(inode); } diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 2887d1d..1e0f11f 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -106,7 +106,7 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir, { struct inode *inode = d_inode(old_dentry); - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); inode_inc_link_count(inode); ihold(inode); return add_nondir(dentry, inode); @@ -185,7 +185,8 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry) } static int minix_rename(struct inode * old_dir, struct dentry *old_dentry, - struct inode * new_dir, struct dentry *new_dentry) + struct inode * new_dir, struct dentry *new_dentry, + unsigned int flags) { struct inode * old_inode = d_inode(old_dentry); struct inode * new_inode = d_inode(new_dentry); @@ -195,6 +196,9 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry, struct minix_dir_entry * old_de; int err = -ENOENT; + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + old_de = minix_find_entry(old_dentry, &old_page); if (!old_de) goto out; @@ -219,7 +223,7 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry, if (!new_de) goto out_dir; minix_set_link(new_de, new_page, old_inode); - new_inode->i_ctime = CURRENT_TIME_SEC; + new_inode->i_ctime = current_time(new_inode); if (dir_de) drop_nlink(new_inode); inode_dec_link_count(new_inode); @@ -10,9 +10,12 @@ struct mnt_namespace { struct mount * root; struct list_head list; struct user_namespace *user_ns; + struct ucounts *ucounts; u64 seq; /* Sequence number to prevent loops */ wait_queue_head_t poll; u64 event; + unsigned int mounts; /* # of mounts in the namespace */ + unsigned int pending_mounts; }; struct mnt_pcp { @@ -1015,7 +1015,7 @@ const char *get_link(struct nameidata *nd) if (!(nd->flags & LOOKUP_RCU)) { touch_atime(&last->link); cond_resched(); - } else if (atime_needs_update(&last->link, inode)) { + } else if (atime_needs_update_rcu(&last->link, inode)) { if (unlikely(unlazy_walk(nd, NULL, 0))) return ERR_PTR(-ECHILD); touch_atime(&last->link); @@ -4369,12 +4369,9 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (error) return error; - if (!old_dir->i_op->rename && !old_dir->i_op->rename2) + if (!old_dir->i_op->rename) return -EPERM; - if (flags && !old_dir->i_op->rename2) - return -EINVAL; - /* * If we are going to change the parent - check write permissions, * we'll need to flip '..'. @@ -4428,14 +4425,8 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (error) goto out; } - if (!old_dir->i_op->rename2) { - error = old_dir->i_op->rename(old_dir, old_dentry, - new_dir, new_dentry); - } else { - WARN_ON(old_dir->i_op->rename != NULL); - error = old_dir->i_op->rename2(old_dir, old_dentry, - new_dir, new_dentry, flags); - } + error = old_dir->i_op->rename(old_dir, old_dentry, + new_dir, new_dentry, flags); if (error) goto out; @@ -4677,6 +4668,31 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) } EXPORT_SYMBOL(generic_readlink); +/** + * vfs_get_link - get symlink body + * @dentry: dentry on which to get symbolic link + * @done: caller needs to free returned data with this + * + * Calls security hook and i_op->get_link() on the supplied inode. + * + * It does not touch atime. That's up to the caller if necessary. + * + * Does not work on "special" symlinks like /proc/$$/fd/N + */ +const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done) +{ + const char *res = ERR_PTR(-EINVAL); + struct inode *inode = d_inode(dentry); + + if (d_is_symlink(dentry)) { + res = ERR_PTR(security_inode_readlink(dentry)); + if (!res) + res = inode->i_op->get_link(dentry, inode, done); + } + return res; +} +EXPORT_SYMBOL(vfs_get_link); + /* get the link contents into pagecache */ const char *page_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) diff --git a/fs/namespace.c b/fs/namespace.c index 7bb2cda..e6c234b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -27,6 +27,9 @@ #include "pnode.h" #include "internal.h" +/* Maximum number of mounts in a mount namespace */ +unsigned int sysctl_mount_max __read_mostly = 100000; + static unsigned int m_hash_mask __read_mostly; static unsigned int m_hash_shift __read_mostly; static unsigned int mp_hash_mask __read_mostly; @@ -899,6 +902,9 @@ static void commit_tree(struct mount *mnt, struct mount *shadows) list_splice(&head, n->list.prev); + n->mounts += n->pending_mounts; + n->pending_mounts = 0; + attach_shadowed(mnt, parent, shadows); touch_mnt_namespace(n); } @@ -1419,11 +1425,16 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) propagate_umount(&tmp_list); while (!list_empty(&tmp_list)) { + struct mnt_namespace *ns; bool disconnect; p = list_first_entry(&tmp_list, struct mount, mnt_list); list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); - __touch_mnt_namespace(p->mnt_ns); + ns = p->mnt_ns; + if (ns) { + ns->mounts--; + __touch_mnt_namespace(ns); + } p->mnt_ns = NULL; if (how & UMOUNT_SYNC) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; @@ -1840,6 +1851,28 @@ static int invent_group_ids(struct mount *mnt, bool recurse) return 0; } +int count_mounts(struct mnt_namespace *ns, struct mount *mnt) +{ + unsigned int max = READ_ONCE(sysctl_mount_max); + unsigned int mounts = 0, old, pending, sum; + struct mount *p; + + for (p = mnt; p; p = next_mnt(p, mnt)) + mounts++; + + old = ns->mounts; + pending = ns->pending_mounts; + sum = old + pending; + if ((old > sum) || + (pending > sum) || + (max < sum) || + (mounts > (max - sum))) + return -ENOSPC; + + ns->pending_mounts = pending + mounts; + return 0; +} + /* * @source_mnt : mount tree to be attached * @nd : place the mount tree @source_mnt is attached @@ -1909,10 +1942,18 @@ static int attach_recursive_mnt(struct mount *source_mnt, struct path *parent_path) { HLIST_HEAD(tree_list); + struct mnt_namespace *ns = dest_mnt->mnt_ns; struct mount *child, *p; struct hlist_node *n; int err; + /* Is there space to add these mounts to the mount namespace? */ + if (!parent_path) { + err = count_mounts(ns, source_mnt); + if (err) + goto out; + } + if (IS_MNT_SHARED(dest_mnt)) { err = invent_group_ids(source_mnt, true); if (err) @@ -1949,11 +1990,13 @@ static int attach_recursive_mnt(struct mount *source_mnt, out_cleanup_ids: while (!hlist_empty(&tree_list)) { child = hlist_entry(tree_list.first, struct mount, mnt_hash); + child->mnt_parent->mnt_ns->pending_mounts = 0; umount_tree(child, UMOUNT_SYNC); } unlock_mount_hash(); cleanup_group_ids(source_mnt, NULL); out: + ns->pending_mounts = 0; return err; } @@ -2700,7 +2743,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | - MS_STRICTATIME); + MS_STRICTATIME | MS_NOREMOTELOCK); if (flags & MS_REMOUNT) retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, @@ -2719,9 +2762,20 @@ dput_out: return retval; } +static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES); +} + +static void dec_mnt_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES); +} + static void free_mnt_ns(struct mnt_namespace *ns) { ns_free_inum(&ns->ns); + dec_mnt_namespaces(ns->ucounts); put_user_ns(ns->user_ns); kfree(ns); } @@ -2738,14 +2792,22 @@ static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) { struct mnt_namespace *new_ns; + struct ucounts *ucounts; int ret; + ucounts = inc_mnt_namespaces(user_ns); + if (!ucounts) + return ERR_PTR(-ENOSPC); + new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); - if (!new_ns) + if (!new_ns) { + dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); + } ret = ns_alloc_inum(&new_ns->ns); if (ret) { kfree(new_ns); + dec_mnt_namespaces(ucounts); return ERR_PTR(ret); } new_ns->ns.ops = &mntns_operations; @@ -2756,9 +2818,13 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) init_waitqueue_head(&new_ns->poll); new_ns->event = 0; new_ns->user_ns = get_user_ns(user_ns); + new_ns->ucounts = ucounts; + new_ns->mounts = 0; + new_ns->pending_mounts = 0; return new_ns; } +__latent_entropy struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { @@ -2805,6 +2871,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, q = new; while (p) { q->mnt_ns = new_ns; + new_ns->mounts++; if (new_fs) { if (&p->mnt == new_fs->root.mnt) { new_fs->root.mnt = mntget(&q->mnt); @@ -2843,6 +2910,7 @@ static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) struct mount *mnt = real_mount(m); mnt->mnt_ns = new_ns; new_ns->root = mnt; + new_ns->mounts++; list_add(&mnt->mnt_list, &new_ns->list); } else { mntput(m); @@ -3348,10 +3416,16 @@ static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) return 0; } +static struct user_namespace *mntns_owner(struct ns_common *ns) +{ + return to_mnt_ns(ns)->user_ns; +} + const struct proc_ns_operations mntns_operations = { .name = "mnt", .type = CLONE_NEWNS, .get = mntns_get, .put = mntns_put, .install = mntns_install, + .owner = mntns_owner, }; diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 17de5c1..6df2a38 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -36,7 +36,7 @@ static int ncp_unlink(struct inode *, struct dentry *); static int ncp_mkdir(struct inode *, struct dentry *, umode_t); static int ncp_rmdir(struct inode *, struct dentry *); static int ncp_rename(struct inode *, struct dentry *, - struct inode *, struct dentry *); + struct inode *, struct dentry *, unsigned int); static int ncp_mknod(struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev); #if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) @@ -133,12 +133,11 @@ ncp_hash_dentry(const struct dentry *dentry, struct qstr *this) return 0; if (!ncp_case_sensitive(inode)) { - struct super_block *sb = dentry->d_sb; struct nls_table *t; unsigned long hash; int i; - t = NCP_IO_TABLE(sb); + t = NCP_IO_TABLE(dentry->d_sb); hash = init_name_hash(dentry); for (i=0; i<this->len ; i++) hash = partial_name_hash(ncp_tolower(t, this->name[i]), @@ -1106,13 +1105,17 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry) } static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct ncp_server *server = NCP_SERVER(old_dir); int error; int old_len, new_len; __u8 __old_name[NCP_MAXPATHLEN + 1], __new_name[NCP_MAXPATHLEN + 1]; + if (flags) + return -EINVAL; + ncp_dbg(1, "%pd2 to %pd2\n", old_dentry, new_dentry); ncp_age_dentry(server, old_dentry); diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 1af15fc..f6cf4c7e 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -884,7 +884,7 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr) /* ageing the dentry to force validation */ ncp_age_dentry(server, dentry); - result = inode_change_ok(inode, attr); + result = setattr_prepare(dentry, attr); if (result < 0) goto out; diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 2178476..2905479 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -344,9 +344,10 @@ static void bl_write_cleanup(struct work_struct *work) u64 start = hdr->args.offset & (loff_t)PAGE_MASK; u64 end = (hdr->args.offset + hdr->args.count + PAGE_SIZE - 1) & (loff_t)PAGE_MASK; + u64 lwb = hdr->args.offset + hdr->args.count; ext_tree_mark_written(bl, start >> SECTOR_SHIFT, - (end - start) >> SECTOR_SHIFT, end); + (end - start) >> SECTOR_SHIFT, lwb); } pnfs_ld_write_done(hdr); diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c index 5f7b053..6de1570 100644 --- a/fs/nfs/cache_lib.c +++ b/fs/nfs/cache_lib.c @@ -76,7 +76,7 @@ static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany) dreq = container_of(d, struct nfs_cache_defer_req, deferred_req); - complete_all(&dreq->completion); + complete(&dreq->completion); nfs_cache_defer_req_put(dreq); } diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 52a2831..532d8e2 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -31,8 +31,6 @@ struct nfs_callback_data { unsigned int users; struct svc_serv *serv; - struct svc_rqst *rqst; - struct task_struct *task; }; static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1]; @@ -89,15 +87,6 @@ nfs4_callback_svc(void *vrqstp) return 0; } -/* - * Prepare to bring up the NFSv4 callback service - */ -static struct svc_rqst * -nfs4_callback_up(struct svc_serv *serv) -{ - return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); -} - #if defined(CONFIG_NFS_V4_1) /* * The callback service for NFSv4.1 callbacks @@ -139,29 +128,6 @@ nfs41_callback_svc(void *vrqstp) return 0; } -/* - * Bring up the NFSv4.1 callback service - */ -static struct svc_rqst * -nfs41_callback_up(struct svc_serv *serv) -{ - struct svc_rqst *rqstp; - - INIT_LIST_HEAD(&serv->sv_cb_list); - spin_lock_init(&serv->sv_cb_lock); - init_waitqueue_head(&serv->sv_cb_waitq); - rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); - dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp)); - return rqstp; -} - -static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv, - struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) -{ - *rqstpp = nfs41_callback_up(serv); - *callback_svc = nfs41_callback_svc; -} - static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, struct svc_serv *serv) { @@ -173,13 +139,6 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, xprt->bc_serv = serv; } #else -static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv, - struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) -{ - *rqstpp = ERR_PTR(-ENOTSUPP); - *callback_svc = ERR_PTR(-ENOTSUPP); -} - static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, struct svc_serv *serv) { @@ -189,45 +148,22 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt, struct svc_serv *serv) { - struct svc_rqst *rqstp; - int (*callback_svc)(void *vrqstp); - struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + int nrservs = nfs_callback_nr_threads; int ret; nfs_callback_bc_serv(minorversion, xprt, serv); - if (cb_info->task) - return 0; + if (nrservs < NFS4_MIN_NR_CALLBACK_THREADS) + nrservs = NFS4_MIN_NR_CALLBACK_THREADS; - switch (minorversion) { - case 0: - /* v4.0 callback setup */ - rqstp = nfs4_callback_up(serv); - callback_svc = nfs4_callback_svc; - break; - default: - nfs_minorversion_callback_svc_setup(serv, - &rqstp, &callback_svc); - } - - if (IS_ERR(rqstp)) - return PTR_ERR(rqstp); - - svc_sock_update_bufs(serv); + if (serv->sv_nrthreads-1 == nrservs) + return 0; - cb_info->serv = serv; - cb_info->rqst = rqstp; - cb_info->task = kthread_create(callback_svc, cb_info->rqst, - "nfsv4.%u-svc", minorversion); - if (IS_ERR(cb_info->task)) { - ret = PTR_ERR(cb_info->task); - svc_exit_thread(cb_info->rqst); - cb_info->rqst = NULL; - cb_info->task = NULL; + ret = serv->sv_ops->svo_setup(serv, NULL, nrservs); + if (ret) { + serv->sv_ops->svo_setup(serv, NULL, 0); return ret; } - rqstp->rq_task = cb_info->task; - wake_up_process(cb_info->task); dprintk("nfs_callback_up: service started\n"); return 0; } @@ -281,19 +217,41 @@ err_bind: return ret; } -static struct svc_serv_ops nfs_cb_sv_ops = { +static struct svc_serv_ops nfs40_cb_sv_ops = { + .svo_function = nfs4_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, + .svo_setup = svc_set_num_threads, + .svo_module = THIS_MODULE, +}; +#if defined(CONFIG_NFS_V4_1) +static struct svc_serv_ops nfs41_cb_sv_ops = { + .svo_function = nfs41_callback_svc, + .svo_enqueue_xprt = svc_xprt_do_enqueue, + .svo_setup = svc_set_num_threads, + .svo_module = THIS_MODULE, +}; + +struct svc_serv_ops *nfs4_cb_sv_ops[] = { + [0] = &nfs40_cb_sv_ops, + [1] = &nfs41_cb_sv_ops, +}; +#else +struct svc_serv_ops *nfs4_cb_sv_ops[] = { + [0] = &nfs40_cb_sv_ops, + [1] = NULL, }; +#endif static struct svc_serv *nfs_callback_create_svc(int minorversion) { struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; struct svc_serv *serv; + struct svc_serv_ops *sv_ops; /* * Check whether we're already up and running. */ - if (cb_info->task) { + if (cb_info->serv) { /* * Note: increase service usage, because later in case of error * svc_destroy() will be called. @@ -302,6 +260,17 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) return cb_info->serv; } + switch (minorversion) { + case 0: + sv_ops = nfs4_cb_sv_ops[0]; + break; + default: + sv_ops = nfs4_cb_sv_ops[1]; + } + + if (sv_ops == NULL) + return ERR_PTR(-ENOTSUPP); + /* * Sanity check: if there's no task, * we should be the first user ... @@ -310,11 +279,12 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n", cb_info->users); - serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, &nfs_cb_sv_ops); + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops); if (!serv) { printk(KERN_ERR "nfs_callback_create_svc: create service failed\n"); return ERR_PTR(-ENOMEM); } + cb_info->serv = serv; /* As there is only one thread we need to over-ride the * default maximum of 80 connections */ @@ -357,6 +327,8 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) * thread exits. */ err_net: + if (!cb_info->users) + cb_info->serv = NULL; svc_destroy(serv); err_create: mutex_unlock(&nfs_callback_mutex); @@ -374,18 +346,18 @@ err_start: void nfs_callback_down(int minorversion, struct net *net) { struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + struct svc_serv *serv; mutex_lock(&nfs_callback_mutex); - nfs_callback_down_net(minorversion, cb_info->serv, net); + serv = cb_info->serv; + nfs_callback_down_net(minorversion, serv, net); cb_info->users--; - if (cb_info->users == 0 && cb_info->task != NULL) { - kthread_stop(cb_info->task); - dprintk("nfs_callback_down: service stopped\n"); - svc_exit_thread(cb_info->rqst); + if (cb_info->users == 0) { + svc_get(serv); + serv->sv_ops->svo_setup(serv, NULL, 0); + svc_destroy(serv); dprintk("nfs_callback_down: service destroyed\n"); cb_info->serv = NULL; - cb_info->rqst = NULL; - cb_info->task = NULL; } mutex_unlock(&nfs_callback_mutex); } diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 5fe1cec..c701c30 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -179,6 +179,15 @@ extern __be32 nfs4_callback_devicenotify( struct cb_devicenotifyargs *args, void *dummy, struct cb_process_state *cps); +struct cb_notify_lock_args { + struct nfs_fh cbnl_fh; + struct nfs_lowner cbnl_owner; + bool cbnl_valid; +}; + +extern __be32 nfs4_callback_notify_lock(struct cb_notify_lock_args *args, + void *dummy, + struct cb_process_state *cps); #endif /* CONFIG_NFS_V4_1 */ extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *); extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, @@ -198,6 +207,9 @@ extern void nfs_callback_down(int minorversion, struct net *net); #define NFS41_BC_MIN_CALLBACKS 1 #define NFS41_BC_MAX_CALLBACKS 1 +#define NFS4_MIN_NR_CALLBACK_THREADS 1 + extern unsigned int nfs_callback_set_tcpport; +extern unsigned short nfs_callback_nr_threads; #endif /* __LINUX_FS_NFS_CALLBACK_H */ diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index f953ef6..e9aa235e 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -628,4 +628,20 @@ out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; } + +__be32 nfs4_callback_notify_lock(struct cb_notify_lock_args *args, void *dummy, + struct cb_process_state *cps) +{ + if (!cps->clp) /* set in cb_sequence */ + return htonl(NFS4ERR_OP_NOT_IN_SESSION); + + dprintk_rcu("NFS: CB_NOTIFY_LOCK request from %s\n", + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + + /* Don't wake anybody if the string looked bogus */ + if (args->cbnl_valid) + __wake_up(&cps->clp->cl_lock_waitq, TASK_NORMAL, 0, args); + + return htonl(NFS4_OK); +} #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 656f68f..eb094c6 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -35,6 +35,7 @@ (1 + 3) * 4) // seqid, 3 slotids #define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #define CB_OP_RECALLSLOT_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#define CB_OP_NOTIFY_LOCK_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #endif /* CONFIG_NFS_V4_1 */ #define NFSDBG_FACILITY NFSDBG_CALLBACK @@ -72,7 +73,7 @@ static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) return xdr_ressize_check(rqstp, p); } -static __be32 *read_buf(struct xdr_stream *xdr, int nbytes) +static __be32 *read_buf(struct xdr_stream *xdr, size_t nbytes) { __be32 *p; @@ -534,6 +535,49 @@ static __be32 decode_recallslot_args(struct svc_rqst *rqstp, return 0; } +static __be32 decode_lockowner(struct xdr_stream *xdr, struct cb_notify_lock_args *args) +{ + __be32 *p; + unsigned int len; + + p = read_buf(xdr, 12); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); + + p = xdr_decode_hyper(p, &args->cbnl_owner.clientid); + len = be32_to_cpu(*p); + + p = read_buf(xdr, len); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); + + /* Only try to decode if the length is right */ + if (len == 20) { + p += 2; /* skip "lock id:" */ + args->cbnl_owner.s_dev = be32_to_cpu(*p++); + xdr_decode_hyper(p, &args->cbnl_owner.id); + args->cbnl_valid = true; + } else { + args->cbnl_owner.s_dev = 0; + args->cbnl_owner.id = 0; + args->cbnl_valid = false; + } + return 0; +} + +static __be32 decode_notify_lock_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_notify_lock_args *args) +{ + __be32 status; + + status = decode_fh(xdr, &args->cbnl_fh); + if (unlikely(status != 0)) + goto out; + status = decode_lockowner(xdr, args); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + #endif /* CONFIG_NFS_V4_1 */ static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) @@ -746,6 +790,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) case OP_CB_RECALL_SLOT: case OP_CB_LAYOUTRECALL: case OP_CB_NOTIFY_DEVICEID: + case OP_CB_NOTIFY_LOCK: *op = &callback_ops[op_nr]; break; @@ -753,7 +798,6 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) case OP_CB_PUSH_DELEG: case OP_CB_RECALLABLE_OBJ_AVAIL: case OP_CB_WANTS_CANCELLED: - case OP_CB_NOTIFY_LOCK: return htonl(NFS4ERR_NOTSUPP); default: @@ -1006,6 +1050,11 @@ static struct callback_op callback_ops[] = { .decode_args = (callback_decode_arg_t)decode_recallslot_args, .res_maxsize = CB_OP_RECALLSLOT_RES_MAXSZ, }, + [OP_CB_NOTIFY_LOCK] = { + .process_op = (callback_process_op_t)nfs4_callback_notify_lock, + .decode_args = (callback_decode_arg_t)decode_notify_lock_args, + .res_maxsize = CB_OP_NOTIFY_LOCK_RES_MAXSZ, + }, #endif /* CONFIG_NFS_V4_1 */ }; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 1e10678..7555ba8 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -313,7 +313,10 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat continue; /* Match the full socket address */ if (!rpc_cmp_addr_port(sap, clap)) - continue; + /* Match all xprt_switch full socket addresses */ + if (!rpc_clnt_xprt_switch_has_addr(clp->cl_rpcclient, + sap)) + continue; atomic_inc(&clp->cl_count); return clp; @@ -785,7 +788,8 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs } fsinfo.fattr = fattr; - fsinfo.layouttype = 0; + fsinfo.nlayouttypes = 0; + memset(fsinfo.layouttype, 0, sizeof(fsinfo.layouttype)); error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); if (error < 0) goto out_error; @@ -1078,7 +1082,7 @@ void nfs_clients_init(struct net *net) idr_init(&nn->cb_ident_idr); #endif spin_lock_init(&nn->nfs_client_lock); - nn->boot_time = CURRENT_TIME; + nn->boot_time = ktime_get_real(); } #ifdef CONFIG_PROC_FS diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 322c258..dff600a 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -41,6 +41,17 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags); } +static bool +nfs4_is_valid_delegation(const struct nfs_delegation *delegation, + fmode_t flags) +{ + if (delegation != NULL && (delegation->type & flags) == flags && + !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) && + !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) + return true; + return false; +} + static int nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark) { @@ -50,8 +61,7 @@ nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark) flags &= FMODE_READ|FMODE_WRITE; rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); - if (delegation != NULL && (delegation->type & flags) == flags && - !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { + if (nfs4_is_valid_delegation(delegation, flags)) { if (mark) nfs_mark_delegation_referenced(delegation); ret = 1; @@ -185,15 +195,13 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, rcu_read_unlock(); put_rpccred(oldcred); trace_nfs4_reclaim_delegation(inode, res->delegation_type); - } else { - /* We appear to have raced with a delegation return. */ - spin_unlock(&delegation->lock); - rcu_read_unlock(); - nfs_inode_set_delegation(inode, cred, res); + return; } - } else { - rcu_read_unlock(); + /* We appear to have raced with a delegation return. */ + spin_unlock(&delegation->lock); } + rcu_read_unlock(); + nfs_inode_set_delegation(inode, cred, res); } static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) @@ -642,28 +650,49 @@ static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *cl rcu_read_unlock(); } -static void nfs_revoke_delegation(struct inode *inode) +static void nfs_mark_delegation_revoked(struct nfs_server *server, + struct nfs_delegation *delegation) +{ + set_bit(NFS_DELEGATION_REVOKED, &delegation->flags); + delegation->stateid.type = NFS4_INVALID_STATEID_TYPE; + nfs_mark_return_delegation(server, delegation); +} + +static bool nfs_revoke_delegation(struct inode *inode, + const nfs4_stateid *stateid) { struct nfs_delegation *delegation; + nfs4_stateid tmp; + bool ret = false; + rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); - if (delegation != NULL) { - set_bit(NFS_DELEGATION_REVOKED, &delegation->flags); - nfs_mark_return_delegation(NFS_SERVER(inode), delegation); - } + if (delegation == NULL) + goto out; + if (stateid == NULL) { + nfs4_stateid_copy(&tmp, &delegation->stateid); + stateid = &tmp; + } else if (!nfs4_stateid_match(stateid, &delegation->stateid)) + goto out; + nfs_mark_delegation_revoked(NFS_SERVER(inode), delegation); + ret = true; +out: rcu_read_unlock(); + if (ret) + nfs_inode_find_state_and_recover(inode, stateid); + return ret; } -void nfs_remove_bad_delegation(struct inode *inode) +void nfs_remove_bad_delegation(struct inode *inode, + const nfs4_stateid *stateid) { struct nfs_delegation *delegation; - nfs_revoke_delegation(inode); + if (!nfs_revoke_delegation(inode, stateid)) + return; delegation = nfs_inode_detach_delegation(inode); - if (delegation) { - nfs_inode_find_state_and_recover(inode, &delegation->stateid); + if (delegation) nfs_free_delegation(delegation); - } } EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation); @@ -786,8 +815,15 @@ static void nfs_delegation_mark_reclaim_server(struct nfs_server *server) { struct nfs_delegation *delegation; - list_for_each_entry_rcu(delegation, &server->delegations, super_list) + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + /* + * If the delegation may have been admin revoked, then we + * cannot reclaim it. + */ + if (test_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags)) + continue; set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); + } } /** @@ -851,6 +887,141 @@ restart: rcu_read_unlock(); } +static inline bool nfs4_server_rebooted(const struct nfs_client *clp) +{ + return (clp->cl_state & (BIT(NFS4CLNT_CHECK_LEASE) | + BIT(NFS4CLNT_LEASE_EXPIRED) | + BIT(NFS4CLNT_SESSION_RESET))) != 0; +} + +static void nfs_mark_test_expired_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) +{ + if (delegation->stateid.type == NFS4_INVALID_STATEID_TYPE) + return; + clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); + set_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags); + set_bit(NFS4CLNT_DELEGATION_EXPIRED, &server->nfs_client->cl_state); +} + +static void nfs_inode_mark_test_expired_delegation(struct nfs_server *server, + struct inode *inode) +{ + struct nfs_delegation *delegation; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation) + nfs_mark_test_expired_delegation(server, delegation); + rcu_read_unlock(); + +} + +static void nfs_delegation_mark_test_expired_server(struct nfs_server *server) +{ + struct nfs_delegation *delegation; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) + nfs_mark_test_expired_delegation(server, delegation); +} + +/** + * nfs_mark_test_expired_all_delegations - mark all delegations for testing + * @clp: nfs_client to process + * + * Iterates through all the delegations associated with this server and + * marks them as needing to be checked for validity. + */ +void nfs_mark_test_expired_all_delegations(struct nfs_client *clp) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_delegation_mark_test_expired_server(server); + rcu_read_unlock(); +} + +/** + * nfs_reap_expired_delegations - reap expired delegations + * @clp: nfs_client to process + * + * Iterates through all the delegations associated with this server and + * checks if they have may have been revoked. This function is usually + * expected to be called in cases where the server may have lost its + * lease. + */ +void nfs_reap_expired_delegations(struct nfs_client *clp) +{ + const struct nfs4_minor_version_ops *ops = clp->cl_mvops; + struct nfs_delegation *delegation; + struct nfs_server *server; + struct inode *inode; + struct rpc_cred *cred; + nfs4_stateid stateid; + +restart: + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry_rcu(delegation, &server->delegations, + super_list) { + if (test_bit(NFS_DELEGATION_RETURNING, + &delegation->flags)) + continue; + if (test_bit(NFS_DELEGATION_TEST_EXPIRED, + &delegation->flags) == 0) + continue; + if (!nfs_sb_active(server->super)) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) { + rcu_read_unlock(); + nfs_sb_deactive(server->super); + goto restart; + } + cred = get_rpccred_rcu(delegation->cred); + nfs4_stateid_copy(&stateid, &delegation->stateid); + clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags); + rcu_read_unlock(); + if (cred != NULL && + ops->test_and_free_expired(server, &stateid, cred) < 0) { + nfs_revoke_delegation(inode, &stateid); + nfs_inode_find_state_and_recover(inode, &stateid); + } + put_rpccred(cred); + if (nfs4_server_rebooted(clp)) { + nfs_inode_mark_test_expired_delegation(server,inode); + iput(inode); + nfs_sb_deactive(server->super); + return; + } + iput(inode); + nfs_sb_deactive(server->super); + goto restart; + } + } + rcu_read_unlock(); +} + +void nfs_inode_find_delegation_state_and_recover(struct inode *inode, + const nfs4_stateid *stateid) +{ + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_delegation *delegation; + bool found = false; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation && + nfs4_stateid_match_other(&delegation->stateid, stateid)) { + nfs_mark_test_expired_delegation(NFS_SERVER(inode), delegation); + found = true; + } + rcu_read_unlock(); + if (found) + nfs4_schedule_state_manager(clp); +} + /** * nfs_delegations_present - check for existence of delegations * @clp: client state handle @@ -893,7 +1064,7 @@ bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, flags &= FMODE_READ|FMODE_WRITE; rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); - ret = (delegation != NULL && (delegation->type & flags) == flags); + ret = nfs4_is_valid_delegation(delegation, flags); if (ret) { nfs4_stateid_copy(dst, &delegation->stateid); nfs_mark_delegation_referenced(delegation); diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 64724d2..e9d5557 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -32,6 +32,7 @@ enum { NFS_DELEGATION_REFERENCED, NFS_DELEGATION_RETURNING, NFS_DELEGATION_REVOKED, + NFS_DELEGATION_TEST_EXPIRED, }; int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); @@ -47,11 +48,14 @@ void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags); void nfs_expire_unreferenced_delegations(struct nfs_client *clp); int nfs_client_return_marked_delegations(struct nfs_client *clp); int nfs_delegations_present(struct nfs_client *clp); -void nfs_remove_bad_delegation(struct inode *inode); +void nfs_remove_bad_delegation(struct inode *inode, const nfs4_stateid *stateid); void nfs_delegation_mark_reclaim(struct nfs_client *clp); void nfs_delegation_reap_unclaimed(struct nfs_client *clp); +void nfs_mark_test_expired_all_delegations(struct nfs_client *clp); +void nfs_reap_expired_delegations(struct nfs_client *clp); + /* NFSv4 delegation-related procedures */ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync); int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type); @@ -62,6 +66,8 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); int nfs4_have_delegation(struct inode *inode, fmode_t flags); int nfs4_check_delegation(struct inode *inode, fmode_t flags); bool nfs4_delegation_flush_on_close(const struct inode *inode); +void nfs_inode_find_delegation_state_and_recover(struct inode *inode, + const nfs4_stateid *stateid); #endif diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 177fefb..5f1af4c 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -435,11 +435,11 @@ int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) return 0; nfsi = NFS_I(inode); - if (entry->fattr->fileid == nfsi->fileid) - return 1; - if (nfs_compare_fh(entry->fh, &nfsi->fh) == 0) - return 1; - return 0; + if (entry->fattr->fileid != nfsi->fileid) + return 0; + if (entry->fh->size && nfs_compare_fh(entry->fh, &nfsi->fh) != 0) + return 0; + return 1; } static @@ -496,6 +496,14 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) return; if (!(entry->fattr->valid & NFS_ATTR_FATTR_FSID)) return; + if (filename.len == 0) + return; + /* Validate that the name doesn't contain any illegal '\0' */ + if (strnlen(filename.name, filename.len) != filename.len) + return; + /* ...or '/' */ + if (strnchr(filename.name, filename.len, '/')) + return; if (filename.name[0] == '.') { if (filename.len == 1) return; @@ -517,6 +525,8 @@ again: &entry->fattr->fsid)) goto out; if (nfs_same_file(dentry, entry)) { + if (!entry->fh->size) + goto out; nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); status = nfs_refresh_inode(d_inode(dentry), entry->fattr); if (!status) @@ -529,6 +539,10 @@ again: goto again; } } + if (!entry->fh->size) { + d_lookup_done(dentry); + goto out; + } inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label); alias = d_splice_alias(inode, dentry); @@ -2013,7 +2027,8 @@ EXPORT_SYMBOL_GPL(nfs_link); * the rename. */ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); @@ -2021,6 +2036,9 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct rpc_task *task; int error = -EBUSY; + if (flags) + return -EINVAL; + dfprintk(VFS, "NFS: rename(%pd2 -> %pd2, ct=%d)\n", old_dentry, new_dentry, d_count(new_dentry)); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 72b7d13..bd81bcf 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -387,7 +387,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq) dreq->iocb->ki_complete(dreq->iocb, res, 0); } - complete_all(&dreq->completion); + complete(&dreq->completion); nfs_direct_req_release(dreq); } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index ca699dd..9ea85ae 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -182,29 +182,6 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) } EXPORT_SYMBOL_GPL(nfs_file_read); -ssize_t -nfs_file_splice_read(struct file *filp, loff_t *ppos, - struct pipe_inode_info *pipe, size_t count, - unsigned int flags) -{ - struct inode *inode = file_inode(filp); - ssize_t res; - - dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n", - filp, (unsigned long) count, (unsigned long long) *ppos); - - nfs_start_io_read(inode); - res = nfs_revalidate_mapping(inode, filp->f_mapping); - if (!res) { - res = generic_file_splice_read(filp, ppos, pipe, count, flags); - if (res > 0) - nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res); - } - nfs_end_io_read(inode); - return res; -} -EXPORT_SYMBOL_GPL(nfs_file_splice_read); - int nfs_file_mmap(struct file * file, struct vm_area_struct * vma) { @@ -543,7 +520,9 @@ const struct address_space_operations nfs_file_aops = { .invalidatepage = nfs_invalidate_page, .releasepage = nfs_release_page, .direct_IO = nfs_direct_IO, +#ifdef CONFIG_MIGRATION .migratepage = nfs_migrate_page, +#endif .launder_page = nfs_launder_page, .is_dirty_writeback = nfs_check_dirty_writeback, .error_remove_page = generic_error_remove_page, @@ -708,11 +687,6 @@ out_noconflict: goto out; } -static int do_vfs_lock(struct file *file, struct file_lock *fl) -{ - return locks_lock_file_wait(file, fl); -} - static int do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { @@ -745,7 +719,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) if (!is_local) status = NFS_PROTO(inode)->lock(filp, cmd, fl); else - status = do_vfs_lock(filp, fl); + status = locks_lock_file_wait(filp, fl); return status; } @@ -770,7 +744,7 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) if (!is_local) status = NFS_PROTO(inode)->lock(filp, cmd, fl); else - status = do_vfs_lock(filp, fl); + status = locks_lock_file_wait(filp, fl); if (status < 0) goto out; @@ -871,7 +845,7 @@ const struct file_operations nfs_file_operations = { .fsync = nfs_file_fsync, .lock = nfs_lock, .flock = nfs_flock, - .splice_read = nfs_file_splice_read, + .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, .setlease = simple_nosetlease, diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 51b5136..98ace12 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1080,7 +1080,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, case -NFS4ERR_BAD_STATEID: if (state == NULL) break; - nfs_remove_bad_delegation(state->inode); + nfs_remove_bad_delegation(state->inode, NULL); case -NFS4ERR_OPENMODE: if (state == NULL) break; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 74935a1..80bcc0b 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -359,14 +359,13 @@ int nfs_unlink(struct inode *, struct dentry *); int nfs_symlink(struct inode *, struct dentry *, const char *); int nfs_link(struct dentry *, struct inode *, struct dentry *); int nfs_mknod(struct inode *, struct dentry *, umode_t, dev_t); -int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); +int nfs_rename(struct inode *, struct dentry *, + struct inode *, struct dentry *, unsigned int); /* file.c */ int nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); loff_t nfs_file_llseek(struct file *, loff_t, int); ssize_t nfs_file_read(struct kiocb *, struct iov_iter *); -ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, - size_t, unsigned int); int nfs_file_mmap(struct file *, struct vm_area_struct *); ssize_t nfs_file_write(struct kiocb *, struct iov_iter *); int nfs_file_release(struct inode *, struct file *); @@ -535,12 +534,9 @@ void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) } #endif - #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); -#else -#define nfs_migrate_page NULL #endif static inline int @@ -563,7 +559,6 @@ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); /* nfs4proc.c */ -extern void __nfs4_read_done_cb(struct nfs_pgio_header *); extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, const struct nfs_client_initdata *); extern int nfs40_walk_client_list(struct nfs_client *clp, @@ -572,6 +567,9 @@ extern int nfs40_walk_client_list(struct nfs_client *clp, extern int nfs41_walk_client_list(struct nfs_client *clp, struct nfs_client **result, struct rpc_cred *cred); +extern int nfs4_test_session_trunk(struct rpc_clnt *, + struct rpc_xprt *, + void *); static inline struct inode *nfs_igrab_and_active(struct inode *inode) { @@ -681,11 +679,11 @@ unsigned int nfs_page_length(struct page *page) loff_t i_size = i_size_read(page_file_mapping(page)->host); if (i_size > 0) { - pgoff_t page_index = page_file_index(page); + pgoff_t index = page_index(page); pgoff_t end_index = (i_size - 1) >> PAGE_SHIFT; - if (page_index < end_index) + if (index < end_index) return PAGE_SIZE; - if (page_index == end_index) + if (index == end_index) return ((i_size - 1) & ~PAGE_MASK) + 1; } return 0; diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index f0e06e4..fbce0d8 100644 --- a/fs/nfs/netns.h +++ b/fs/nfs/netns.h @@ -29,7 +29,7 @@ struct nfs_net { int cb_users[NFS4_MAX_MINOR_VERSION + 1]; #endif spinlock_t nfs_client_lock; - struct timespec boot_time; + ktime_t boot_time; #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc_nfsfs; #endif diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 698be93..dc925b5 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -899,9 +899,6 @@ static const struct inode_operations nfs3_dir_inode_operations = { .setattr = nfs_setattr, #ifdef CONFIG_NFS_V3_ACL .listxattr = nfs3_listxattr, - .getxattr = generic_getxattr, - .setxattr = generic_setxattr, - .removexattr = generic_removexattr, .get_acl = nfs3_get_acl, .set_acl = nfs3_set_acl, #endif @@ -913,9 +910,6 @@ static const struct inode_operations nfs3_file_inode_operations = { .setattr = nfs_setattr, #ifdef CONFIG_NFS_V3_ACL .listxattr = nfs3_listxattr, - .getxattr = generic_getxattr, - .setxattr = generic_setxattr, - .removexattr = generic_removexattr, .get_acl = nfs3_get_acl, .set_acl = nfs3_set_acl, #endif diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 64b43b4..6085019 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -443,6 +443,7 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server, task = rpc_run_task(&task_setup); if (IS_ERR(task)) return PTR_ERR(task); + rpc_put_task(task); return 0; } diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 9bf64ea..9b3a82a 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -39,6 +39,7 @@ enum nfs4_client_state { NFS4CLNT_BIND_CONN_TO_SESSION, NFS4CLNT_MOVED, NFS4CLNT_LEASE_MOVED, + NFS4CLNT_DELEGATION_EXPIRED, }; #define NFS4_RENEW_TIMEOUT 0x01 @@ -57,8 +58,11 @@ struct nfs4_minor_version_ops { struct nfs_fsinfo *); void (*free_lock_state)(struct nfs_server *, struct nfs4_lock_state *); + int (*test_and_free_expired)(struct nfs_server *, + nfs4_stateid *, struct rpc_cred *); struct nfs_seqid * (*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); + int (*session_trunk)(struct rpc_clnt *, struct rpc_xprt *, void *); const struct rpc_call_ops *call_sync_ops; const struct nfs4_state_recovery_ops *reboot_recovery_ops; const struct nfs4_state_recovery_ops *nograce_recovery_ops; @@ -156,6 +160,7 @@ enum { NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */ NFS_STATE_POSIX_LOCKS, /* Posix locks are supported */ NFS_STATE_RECOVERY_FAILED, /* OPEN stateid state recovery failed */ + NFS_STATE_MAY_NOTIFY_LOCK, /* server may CB_NOTIFY_LOCK */ }; struct nfs4_state { @@ -203,6 +208,11 @@ struct nfs4_state_recovery_ops { struct rpc_cred *); }; +struct nfs4_add_xprt_data { + struct nfs_client *clp; + struct rpc_cred *cred; +}; + struct nfs4_state_maintenance_ops { int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *, unsigned); struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *); @@ -278,6 +288,8 @@ extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync); +extern int nfs4_detect_session_trunking(struct nfs_client *clp, + struct nfs41_exchange_id_res *res, struct rpc_xprt *xprt); static inline bool is_ds_only_client(struct nfs_client *clp) @@ -439,7 +451,7 @@ extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp); extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); extern int nfs4_schedule_migration_recovery(const struct nfs_server *); extern void nfs4_schedule_lease_moved_recovery(struct nfs_client *); -extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); +extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags, bool); extern void nfs41_handle_server_scope(struct nfs_client *, struct nfs41_server_scope **); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); @@ -471,6 +483,7 @@ extern struct nfs_subversion nfs_v4; struct dentry *nfs4_try_mount(int, const char *, struct nfs_mount_info *, struct nfs_subversion *); extern bool nfs4_disable_idmapping; extern unsigned short max_session_slots; +extern unsigned short max_session_cb_slots; extern unsigned short send_implementation_id; extern bool recover_lost_locks; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index cd3b7cf..074ac71 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -199,6 +199,9 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_minorversion = cl_init->minorversion; clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; clp->cl_mig_gen = 1; +#if IS_ENABLED(CONFIG_NFS_V4_1) + init_waitqueue_head(&clp->cl_lock_waitq); +#endif return clp; error: @@ -562,15 +565,15 @@ out: /* * Returns true if the client IDs match */ -static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b) +static bool nfs4_match_clientids(u64 a, u64 b) { - if (a->cl_clientid != b->cl_clientid) { + if (a != b) { dprintk("NFS: --> %s client ID %llx does not match %llx\n", - __func__, a->cl_clientid, b->cl_clientid); + __func__, a, b); return false; } dprintk("NFS: --> %s client ID %llx matches %llx\n", - __func__, a->cl_clientid, b->cl_clientid); + __func__, a, b); return true; } @@ -578,17 +581,15 @@ static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b) * Returns true if the server major ids match */ static bool -nfs4_check_clientid_trunking(struct nfs_client *a, struct nfs_client *b) +nfs4_check_serverowner_major_id(struct nfs41_server_owner *o1, + struct nfs41_server_owner *o2) { - struct nfs41_server_owner *o1 = a->cl_serverowner; - struct nfs41_server_owner *o2 = b->cl_serverowner; - if (o1->major_id_sz != o2->major_id_sz) goto out_major_mismatch; if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0) goto out_major_mismatch; - dprintk("NFS: --> %s server owners match\n", __func__); + dprintk("NFS: --> %s server owner major IDs match\n", __func__); return true; out_major_mismatch: @@ -597,6 +598,100 @@ out_major_mismatch: return false; } +/* + * Returns true if server minor ids match + */ +static bool +nfs4_check_serverowner_minor_id(struct nfs41_server_owner *o1, + struct nfs41_server_owner *o2) +{ + /* Check eir_server_owner so_minor_id */ + if (o1->minor_id != o2->minor_id) + goto out_minor_mismatch; + + dprintk("NFS: --> %s server owner minor IDs match\n", __func__); + return true; + +out_minor_mismatch: + dprintk("NFS: --> %s server owner minor IDs do not match\n", __func__); + return false; +} + +/* + * Returns true if the server scopes match + */ +static bool +nfs4_check_server_scope(struct nfs41_server_scope *s1, + struct nfs41_server_scope *s2) +{ + if (s1->server_scope_sz != s2->server_scope_sz) + goto out_scope_mismatch; + if (memcmp(s1->server_scope, s2->server_scope, + s1->server_scope_sz) != 0) + goto out_scope_mismatch; + + dprintk("NFS: --> %s server scopes match\n", __func__); + return true; + +out_scope_mismatch: + dprintk("NFS: --> %s server scopes do not match\n", + __func__); + return false; +} + +/** + * nfs4_detect_session_trunking - Checks for session trunking. + * + * Called after a successful EXCHANGE_ID on a multi-addr connection. + * Upon success, add the transport. + * + * @clp: original mount nfs_client + * @res: result structure from an exchange_id using the original mount + * nfs_client with a new multi_addr transport + * + * Returns zero on success, otherwise -EINVAL + * + * Note: since the exchange_id for the new multi_addr transport uses the + * same nfs_client from the original mount, the cl_owner_id is reused, + * so eir_clientowner is the same. + */ +int nfs4_detect_session_trunking(struct nfs_client *clp, + struct nfs41_exchange_id_res *res, + struct rpc_xprt *xprt) +{ + /* Check eir_clientid */ + if (!nfs4_match_clientids(clp->cl_clientid, res->clientid)) + goto out_err; + + /* Check eir_server_owner so_major_id */ + if (!nfs4_check_serverowner_major_id(clp->cl_serverowner, + res->server_owner)) + goto out_err; + + /* Check eir_server_owner so_minor_id */ + if (!nfs4_check_serverowner_minor_id(clp->cl_serverowner, + res->server_owner)) + goto out_err; + + /* Check eir_server_scope */ + if (!nfs4_check_server_scope(clp->cl_serverscope, res->server_scope)) + goto out_err; + + /* Session trunking passed, add the xprt */ + rpc_clnt_xprt_switch_add_xprt(clp->cl_rpcclient, xprt); + + pr_info("NFS: %s: Session trunking succeeded for %s\n", + clp->cl_hostname, + xprt->address_strings[RPC_DISPLAY_ADDR]); + + return 0; +out_err: + pr_info("NFS: %s: Session trunking failed for %s\n", clp->cl_hostname, + xprt->address_strings[RPC_DISPLAY_ADDR]); + + return -EINVAL; +} + /** * nfs41_walk_client_list - Find nfs_client that matches a client/server owner * @@ -650,7 +745,7 @@ int nfs41_walk_client_list(struct nfs_client *new, if (pos->cl_cons_state != NFS_CS_READY) continue; - if (!nfs4_match_clientids(pos, new)) + if (!nfs4_match_clientids(pos->cl_clientid, new->cl_clientid)) continue; /* @@ -658,7 +753,8 @@ int nfs41_walk_client_list(struct nfs_client *new, * client id trunking. In either case, we want to fall back * to using the existing nfs_client. */ - if (!nfs4_check_clientid_trunking(pos, new)) + if (!nfs4_check_serverowner_major_id(pos->cl_serverowner, + new->cl_serverowner)) continue; /* Unlike NFSv4.0, we know that NFSv4.1 always uses the diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index d085ad7..89a7795 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -248,7 +248,7 @@ const struct file_operations nfs4_file_operations = { .fsync = nfs_file_fsync, .lock = nfs_lock, .flock = nfs_flock, - .splice_read = nfs_file_splice_read, + .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, .setlease = simple_nosetlease, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a9dec32..7897826 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -99,8 +99,8 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, #ifdef CONFIG_NFS_V4_1 static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *, struct rpc_cred *); -static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *, - struct rpc_cred *); +static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *, + struct rpc_cred *, bool); #endif #ifdef CONFIG_NFS_V4_SECURITY_LABEL @@ -328,6 +328,33 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent kunmap_atomic(start); } +static void nfs4_test_and_free_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) +{ + const struct nfs4_minor_version_ops *ops = server->nfs_client->cl_mvops; + + ops->test_and_free_expired(server, stateid, cred); +} + +static void __nfs4_free_revoked_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) +{ + stateid->type = NFS4_REVOKED_STATEID_TYPE; + nfs4_test_and_free_stateid(server, stateid, cred); +} + +static void nfs4_free_revoked_stateid(struct nfs_server *server, + const nfs4_stateid *stateid, + struct rpc_cred *cred) +{ + nfs4_stateid tmp; + + nfs4_stateid_copy(&tmp, stateid); + __nfs4_free_revoked_stateid(server, &tmp, cred); +} + static long nfs4_update_delay(long *timeout) { long ret; @@ -370,13 +397,23 @@ static int nfs4_do_handle_exception(struct nfs_server *server, exception->delay = 0; exception->recovering = 0; exception->retry = 0; + + if (stateid == NULL && state != NULL) + stateid = &state->stateid; + switch(errorcode) { case 0: return 0; - case -NFS4ERR_OPENMODE: case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: + if (inode != NULL && stateid != NULL) { + nfs_inode_find_state_and_recover(inode, + stateid); + goto wait_on_recovery; + } + case -NFS4ERR_OPENMODE: if (inode) { int err; @@ -395,12 +432,6 @@ static int nfs4_do_handle_exception(struct nfs_server *server, if (ret < 0) break; goto wait_on_recovery; - case -NFS4ERR_EXPIRED: - if (state != NULL) { - ret = nfs4_schedule_stateid_recovery(server, state); - if (ret < 0) - break; - } case -NFS4ERR_STALE_STATEID: case -NFS4ERR_STALE_CLIENTID: nfs4_schedule_lease_recovery(clp); @@ -616,6 +647,7 @@ int nfs40_setup_sequence(struct nfs4_slot_table *tbl, } spin_unlock(&tbl->slot_tbl_lock); + slot->privileged = args->sa_privileged ? 1 : 0; args->sa_slot = slot; res->sr_slot = slot; @@ -723,12 +755,20 @@ static int nfs41_sequence_process(struct rpc_task *task, /* Check the SEQUENCE operation status */ switch (res->sr_status) { case 0: + /* If previous op on slot was interrupted and we reused + * the seq# and got a reply from the cache, then retry + */ + if (task->tk_status == -EREMOTEIO && interrupted) { + ++slot->seq_nr; + goto retry_nowait; + } /* Update the slot's sequence and clientid lease timer */ slot->seq_done = 1; clp = session->clp; do_renew_lease(clp, res->sr_timestamp); /* Check sequence flags */ - nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags, + !!slot->privileged); nfs41_update_target_slotid(slot->table, slot, res); break; case 1: @@ -875,6 +915,7 @@ int nfs41_setup_sequence(struct nfs4_session *session, } spin_unlock(&tbl->slot_tbl_lock); + slot->privileged = args->sa_privileged ? 1 : 0; args->sa_slot = slot; dprintk("<-- %s slotid=%u seqid=%u\n", __func__, @@ -1353,6 +1394,19 @@ static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode) nfs4_state_set_mode_locked(state, state->state | fmode); } +#ifdef CONFIG_NFS_V4_1 +static bool nfs_open_stateid_recover_openmode(struct nfs4_state *state) +{ + if (state->n_rdonly && !test_bit(NFS_O_RDONLY_STATE, &state->flags)) + return true; + if (state->n_wronly && !test_bit(NFS_O_WRONLY_STATE, &state->flags)) + return true; + if (state->n_rdwr && !test_bit(NFS_O_RDWR_STATE, &state->flags)) + return true; + return false; +} +#endif /* CONFIG_NFS_V4_1 */ + static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state) { struct nfs_client *clp = state->owner->so_server->nfs_client; @@ -1369,11 +1423,12 @@ static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state) } static bool nfs_need_update_open_stateid(struct nfs4_state *state, - nfs4_stateid *stateid) + const nfs4_stateid *stateid, nfs4_stateid *freeme) { if (test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0) return true; if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) { + nfs4_stateid_copy(freeme, &state->open_stateid); nfs_test_and_clear_all_open_stateid(state); return true; } @@ -1437,7 +1492,9 @@ static void nfs_clear_open_stateid(struct nfs4_state *state, nfs4_schedule_state_manager(state->owner->so_server->nfs_client); } -static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +static void nfs_set_open_stateid_locked(struct nfs4_state *state, + const nfs4_stateid *stateid, fmode_t fmode, + nfs4_stateid *freeme) { switch (fmode) { case FMODE_READ: @@ -1449,14 +1506,18 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid * case FMODE_READ|FMODE_WRITE: set_bit(NFS_O_RDWR_STATE, &state->flags); } - if (!nfs_need_update_open_stateid(state, stateid)) + if (!nfs_need_update_open_stateid(state, stateid, freeme)) return; if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) nfs4_stateid_copy(&state->stateid, stateid); nfs4_stateid_copy(&state->open_stateid, stateid); } -static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode) +static void __update_open_stateid(struct nfs4_state *state, + const nfs4_stateid *open_stateid, + const nfs4_stateid *deleg_stateid, + fmode_t fmode, + nfs4_stateid *freeme) { /* * Protect the call to nfs4_state_set_mode_locked and @@ -1469,16 +1530,22 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s set_bit(NFS_DELEGATED_STATE, &state->flags); } if (open_stateid != NULL) - nfs_set_open_stateid_locked(state, open_stateid, fmode); + nfs_set_open_stateid_locked(state, open_stateid, fmode, freeme); write_sequnlock(&state->seqlock); update_open_stateflags(state, fmode); spin_unlock(&state->owner->so_lock); } -static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *delegation, fmode_t fmode) +static int update_open_stateid(struct nfs4_state *state, + const nfs4_stateid *open_stateid, + const nfs4_stateid *delegation, + fmode_t fmode) { + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs_client *clp = server->nfs_client; struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs_delegation *deleg_cur; + nfs4_stateid freeme = { }; int ret = 0; fmode &= (FMODE_READ|FMODE_WRITE); @@ -1500,7 +1567,8 @@ static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stat goto no_delegation_unlock; nfs_mark_delegation_referenced(deleg_cur); - __update_open_stateid(state, open_stateid, &deleg_cur->stateid, fmode); + __update_open_stateid(state, open_stateid, &deleg_cur->stateid, + fmode, &freeme); ret = 1; no_delegation_unlock: spin_unlock(&deleg_cur->lock); @@ -1508,11 +1576,14 @@ no_delegation: rcu_read_unlock(); if (!ret && open_stateid != NULL) { - __update_open_stateid(state, open_stateid, NULL, fmode); + __update_open_stateid(state, open_stateid, NULL, fmode, &freeme); ret = 1; } if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) - nfs4_schedule_state_manager(state->owner->so_server->nfs_client); + nfs4_schedule_state_manager(clp); + if (freeme.type != 0) + nfs4_test_and_free_stateid(server, &freeme, + state->owner->so_cred); return ret; } @@ -1889,7 +1960,6 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: set_bit(NFS_DELEGATED_STATE, &state->flags); - case -NFS4ERR_EXPIRED: /* Don't recall a delegation if it was lost */ nfs4_schedule_lease_recovery(server->nfs_client); return -EAGAIN; @@ -1901,6 +1971,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct return -EAGAIN; case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: case -NFS4ERR_OPENMODE: nfs_inode_find_state_and_recover(state->inode, @@ -2382,9 +2453,10 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta return ret; } -static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state) +static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state, + const nfs4_stateid *stateid) { - nfs_remove_bad_delegation(state->inode); + nfs_remove_bad_delegation(state->inode, stateid); write_seqlock(&state->seqlock); nfs4_stateid_copy(&state->stateid, &state->open_stateid); write_sequnlock(&state->seqlock); @@ -2394,7 +2466,7 @@ static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state) static void nfs40_clear_delegation_stateid(struct nfs4_state *state) { if (rcu_access_pointer(NFS_I(state->inode)->delegation) != NULL) - nfs_finish_clear_delegation_stateid(state); + nfs_finish_clear_delegation_stateid(state, NULL); } static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) @@ -2404,7 +2476,45 @@ static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st return nfs4_open_expired(sp, state); } +static int nfs40_test_and_free_expired_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) +{ + return -NFS4ERR_BAD_STATEID; +} + #if defined(CONFIG_NFS_V4_1) +static int nfs41_test_and_free_expired_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) +{ + int status; + + switch (stateid->type) { + default: + break; + case NFS4_INVALID_STATEID_TYPE: + case NFS4_SPECIAL_STATEID_TYPE: + return -NFS4ERR_BAD_STATEID; + case NFS4_REVOKED_STATEID_TYPE: + goto out_free; + } + + status = nfs41_test_stateid(server, stateid, cred); + switch (status) { + case -NFS4ERR_EXPIRED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_DELEG_REVOKED: + break; + default: + return status; + } +out_free: + /* Ack the revoked state to the server */ + nfs41_free_stateid(server, stateid, cred, true); + return -NFS4ERR_EXPIRED; +} + static void nfs41_check_delegation_stateid(struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(state->inode); @@ -2422,23 +2532,68 @@ static void nfs41_check_delegation_stateid(struct nfs4_state *state) } nfs4_stateid_copy(&stateid, &delegation->stateid); + if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) { + rcu_read_unlock(); + nfs_finish_clear_delegation_stateid(state, &stateid); + return; + } + + if (!test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags)) { + rcu_read_unlock(); + return; + } + cred = get_rpccred(delegation->cred); rcu_read_unlock(); - status = nfs41_test_stateid(server, &stateid, cred); + status = nfs41_test_and_free_expired_stateid(server, &stateid, cred); trace_nfs4_test_delegation_stateid(state, NULL, status); - - if (status != NFS_OK) { - /* Free the stateid unless the server explicitly - * informs us the stateid is unrecognized. */ - if (status != -NFS4ERR_BAD_STATEID) - nfs41_free_stateid(server, &stateid, cred); - nfs_finish_clear_delegation_stateid(state); - } + if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID) + nfs_finish_clear_delegation_stateid(state, &stateid); put_rpccred(cred); } /** + * nfs41_check_expired_locks - possibly free a lock stateid + * + * @state: NFSv4 state for an inode + * + * Returns NFS_OK if recovery for this stateid is now finished. + * Otherwise a negative NFS4ERR value is returned. + */ +static int nfs41_check_expired_locks(struct nfs4_state *state) +{ + int status, ret = NFS_OK; + struct nfs4_lock_state *lsp; + struct nfs_server *server = NFS_SERVER(state->inode); + + if (!test_bit(LK_STATE_IN_USE, &state->flags)) + goto out; + list_for_each_entry(lsp, &state->lock_states, ls_locks) { + if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { + struct rpc_cred *cred = lsp->ls_state->owner->so_cred; + + status = nfs41_test_and_free_expired_stateid(server, + &lsp->ls_stateid, + cred); + trace_nfs4_test_lock_stateid(state, lsp, status); + if (status == -NFS4ERR_EXPIRED || + status == -NFS4ERR_BAD_STATEID) { + clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags); + lsp->ls_stateid.type = NFS4_INVALID_STATEID_TYPE; + if (!recover_lost_locks) + set_bit(NFS_LOCK_LOST, &lsp->ls_flags); + } else if (status != NFS_OK) { + ret = status; + break; + } + } + }; +out: + return ret; +} + +/** * nfs41_check_open_stateid - possibly free an open stateid * * @state: NFSv4 state for an inode @@ -2453,26 +2608,28 @@ static int nfs41_check_open_stateid(struct nfs4_state *state) struct rpc_cred *cred = state->owner->so_cred; int status; - /* If a state reset has been done, test_stateid is unneeded */ - if ((test_bit(NFS_O_RDONLY_STATE, &state->flags) == 0) && - (test_bit(NFS_O_WRONLY_STATE, &state->flags) == 0) && - (test_bit(NFS_O_RDWR_STATE, &state->flags) == 0)) + if (test_bit(NFS_OPEN_STATE, &state->flags) == 0) { + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) { + if (nfs4_have_delegation(state->inode, state->state)) + return NFS_OK; + return -NFS4ERR_OPENMODE; + } return -NFS4ERR_BAD_STATEID; - - status = nfs41_test_stateid(server, stateid, cred); + } + status = nfs41_test_and_free_expired_stateid(server, stateid, cred); trace_nfs4_test_open_stateid(state, NULL, status); - if (status != NFS_OK) { - /* Free the stateid unless the server explicitly - * informs us the stateid is unrecognized. */ - if (status != -NFS4ERR_BAD_STATEID) - nfs41_free_stateid(server, stateid, cred); - + if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID) { clear_bit(NFS_O_RDONLY_STATE, &state->flags); clear_bit(NFS_O_WRONLY_STATE, &state->flags); clear_bit(NFS_O_RDWR_STATE, &state->flags); clear_bit(NFS_OPEN_STATE, &state->flags); + stateid->type = NFS4_INVALID_STATEID_TYPE; } - return status; + if (status != NFS_OK) + return status; + if (nfs_open_stateid_recover_openmode(state)) + return -NFS4ERR_OPENMODE; + return NFS_OK; } static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) @@ -2480,6 +2637,9 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st int status; nfs41_check_delegation_stateid(state); + status = nfs41_check_expired_locks(state); + if (status != NFS_OK) + return status; status = nfs41_check_open_stateid(state); if (status != NFS_OK) status = nfs4_open_expired(sp, state); @@ -2537,6 +2697,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, goto out; if (server->caps & NFS_CAP_POSIX_LOCK) set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); + if (opendata->o_res.rflags & NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK) + set_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags); dentry = opendata->dentry; if (d_really_is_negative(dentry)) { @@ -2899,9 +3061,12 @@ static void nfs4_close_done(struct rpc_task *task, void *data) break; case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + nfs4_free_revoked_stateid(server, + &calldata->arg.stateid, + task->tk_msg.rpc_cred); case -NFS4ERR_OLD_STATEID: case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_EXPIRED: if (!nfs4_stateid_match(&calldata->arg.stateid, &state->open_stateid)) { rpc_restart_call_prepare(task); @@ -4312,7 +4477,7 @@ static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, s if (error == 0) { /* block layout checks this! */ server->pnfs_blksize = fsinfo->blksize; - set_pnfs_layoutdriver(server, fhandle, fsinfo->layouttype); + set_pnfs_layoutdriver(server, fhandle, fsinfo); } return error; @@ -4399,24 +4564,25 @@ static bool nfs4_error_stateid_expired(int err) return false; } -void __nfs4_read_done_cb(struct nfs_pgio_header *hdr) -{ - nfs_invalidate_atime(hdr->inode); -} - static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { struct nfs_server *server = NFS_SERVER(hdr->inode); trace_nfs4_read(hdr, task->tk_status); - if (nfs4_async_handle_error(task, server, - hdr->args.context->state, - NULL) == -EAGAIN) { - rpc_restart_call_prepare(task); - return -EAGAIN; + if (task->tk_status < 0) { + struct nfs4_exception exception = { + .inode = hdr->inode, + .state = hdr->args.context->state, + .stateid = &hdr->args.stateid, + }; + task->tk_status = nfs4_async_handle_exception(task, + server, task->tk_status, &exception); + if (exception.retry) { + rpc_restart_call_prepare(task); + return -EAGAIN; + } } - __nfs4_read_done_cb(hdr); if (task->tk_status > 0) renew_lease(server, hdr->timestamp); return 0; @@ -4445,6 +4611,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) return -EAGAIN; if (nfs4_read_stateid_changed(task, &hdr->args)) return -EAGAIN; + if (task->tk_status > 0) + nfs_invalidate_atime(hdr->inode); return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) : nfs4_read_done_cb(task, hdr); } @@ -4482,11 +4650,19 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct inode *inode = hdr->inode; trace_nfs4_write(hdr, task->tk_status); - if (nfs4_async_handle_error(task, NFS_SERVER(inode), - hdr->args.context->state, - NULL) == -EAGAIN) { - rpc_restart_call_prepare(task); - return -EAGAIN; + if (task->tk_status < 0) { + struct nfs4_exception exception = { + .inode = hdr->inode, + .state = hdr->args.context->state, + .stateid = &hdr->args.stateid, + }; + task->tk_status = nfs4_async_handle_exception(task, + NFS_SERVER(inode), task->tk_status, + &exception); + if (exception.retry) { + rpc_restart_call_prepare(task); + return -EAGAIN; + } } if (task->tk_status >= 0) { renew_lease(NFS_SERVER(inode), hdr->timestamp); @@ -5123,12 +5299,14 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp, if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) { /* An impossible timestamp guarantees this value * will never match a generated boot time. */ - verf[0] = 0; - verf[1] = cpu_to_be32(NSEC_PER_SEC + 1); + verf[0] = cpu_to_be32(U32_MAX); + verf[1] = cpu_to_be32(U32_MAX); } else { struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); - verf[0] = cpu_to_be32(nn->boot_time.tv_sec); - verf[1] = cpu_to_be32(nn->boot_time.tv_nsec); + u64 ns = ktime_to_ns(nn->boot_time); + + verf[0] = cpu_to_be32(ns >> 32); + verf[1] = cpu_to_be32(ns); } memcpy(bootverf->data, verf, sizeof(bootverf->data)); } @@ -5393,10 +5571,13 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) renew_lease(data->res.server, data->timestamp); case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_EXPIRED: + nfs4_free_revoked_stateid(data->res.server, + data->args.stateid, + task->tk_msg.rpc_cred); case -NFS4ERR_BAD_STATEID: case -NFS4ERR_OLD_STATEID: case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_EXPIRED: task->tk_status = 0; if (data->roc) pnfs_roc_set_barrier(data->inode, data->roc_barrier); @@ -5528,22 +5709,6 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4 return err; } -#define NFS4_LOCK_MINTIMEOUT (1 * HZ) -#define NFS4_LOCK_MAXTIMEOUT (30 * HZ) - -/* - * sleep, with exponential backoff, and retry the LOCK operation. - */ -static unsigned long -nfs4_set_lock_task_retry(unsigned long timeout) -{ - freezable_schedule_timeout_killable_unsafe(timeout); - timeout <<= 1; - if (timeout > NFS4_LOCK_MAXTIMEOUT) - return NFS4_LOCK_MAXTIMEOUT; - return timeout; -} - static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request) { struct inode *inode = state->inode; @@ -5600,11 +5765,6 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock * return err; } -static int do_vfs_lock(struct inode *inode, struct file_lock *fl) -{ - return locks_lock_inode_wait(inode, fl); -} - struct nfs4_unlockdata { struct nfs_locku_args arg; struct nfs_locku_res res; @@ -5657,14 +5817,18 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) switch (task->tk_status) { case 0: renew_lease(calldata->server, calldata->timestamp); - do_vfs_lock(calldata->lsp->ls_state->inode, &calldata->fl); + locks_lock_inode_wait(calldata->lsp->ls_state->inode, &calldata->fl); if (nfs4_update_lock_stateid(calldata->lsp, &calldata->res.stateid)) break; + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_EXPIRED: + nfs4_free_revoked_stateid(calldata->server, + &calldata->arg.stateid, + task->tk_msg.rpc_cred); case -NFS4ERR_BAD_STATEID: case -NFS4ERR_OLD_STATEID: case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_EXPIRED: if (!nfs4_stateid_match(&calldata->arg.stateid, &calldata->lsp->ls_stateid)) rpc_restart_call_prepare(task); @@ -5765,7 +5929,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * mutex_lock(&sp->so_delegreturn_mutex); /* Exclude nfs4_reclaim_open_stateid() - note nesting! */ down_read(&nfsi->rwsem); - if (do_vfs_lock(inode, request) == -ENOENT) { + if (locks_lock_inode_wait(inode, request) == -ENOENT) { up_read(&nfsi->rwsem); mutex_unlock(&sp->so_delegreturn_mutex); goto out; @@ -5906,7 +6070,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) data->timestamp); if (data->arg.new_lock) { data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS); - if (do_vfs_lock(lsp->ls_state->inode, &data->fl) < 0) { + if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0) { rpc_restart_call_prepare(task); break; } @@ -5965,6 +6129,7 @@ static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_ { switch (error) { case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; if (new_lock_owner != 0 || @@ -5973,7 +6138,6 @@ static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_ break; case -NFS4ERR_STALE_STATEID: lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; - case -NFS4ERR_EXPIRED: nfs4_schedule_lease_recovery(server->nfs_client); }; } @@ -6083,52 +6247,19 @@ out: } #if defined(CONFIG_NFS_V4_1) -/** - * nfs41_check_expired_locks - possibly free a lock stateid - * - * @state: NFSv4 state for an inode - * - * Returns NFS_OK if recovery for this stateid is now finished. - * Otherwise a negative NFS4ERR value is returned. - */ -static int nfs41_check_expired_locks(struct nfs4_state *state) -{ - int status, ret = -NFS4ERR_BAD_STATEID; - struct nfs4_lock_state *lsp; - struct nfs_server *server = NFS_SERVER(state->inode); - - list_for_each_entry(lsp, &state->lock_states, ls_locks) { - if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { - struct rpc_cred *cred = lsp->ls_state->owner->so_cred; - - status = nfs41_test_stateid(server, - &lsp->ls_stateid, - cred); - trace_nfs4_test_lock_stateid(state, lsp, status); - if (status != NFS_OK) { - /* Free the stateid unless the server - * informs us the stateid is unrecognized. */ - if (status != -NFS4ERR_BAD_STATEID) - nfs41_free_stateid(server, - &lsp->ls_stateid, - cred); - clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags); - ret = status; - } - } - }; - - return ret; -} - static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request) { - int status = NFS_OK; + struct nfs4_lock_state *lsp; + int status; - if (test_bit(LK_STATE_IN_USE, &state->flags)) - status = nfs41_check_expired_locks(state); - if (status != NFS_OK) - status = nfs4_lock_expired(state, request); + status = nfs4_set_lock_state(state, request); + if (status != 0) + return status; + lsp = request->fl_u.nfs4_fl.owner; + if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) || + test_bit(NFS_LOCK_LOST, &lsp->ls_flags)) + return 0; + status = nfs4_lock_expired(state, request); return status; } #endif @@ -6138,17 +6269,10 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs4_state_owner *sp = state->owner; unsigned char fl_flags = request->fl_flags; - int status = -ENOLCK; + int status; - if ((fl_flags & FL_POSIX) && - !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags)) - goto out; - /* Is this a delegated open? */ - status = nfs4_set_lock_state(state, request); - if (status != 0) - goto out; request->fl_flags |= FL_ACCESS; - status = do_vfs_lock(state->inode, request); + status = locks_lock_inode_wait(state->inode, request); if (status < 0) goto out; mutex_lock(&sp->so_delegreturn_mutex); @@ -6157,7 +6281,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock /* Yes: cache locks! */ /* ...but avoid races with delegation recall... */ request->fl_flags = fl_flags & ~FL_SLEEP; - status = do_vfs_lock(state->inode, request); + status = locks_lock_inode_wait(state->inode, request); up_read(&nfsi->rwsem); mutex_unlock(&sp->so_delegreturn_mutex); goto out; @@ -6188,12 +6312,124 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock * return err; } +#define NFS4_LOCK_MINTIMEOUT (1 * HZ) +#define NFS4_LOCK_MAXTIMEOUT (30 * HZ) + +static int +nfs4_retry_setlk_simple(struct nfs4_state *state, int cmd, + struct file_lock *request) +{ + int status = -ERESTARTSYS; + unsigned long timeout = NFS4_LOCK_MINTIMEOUT; + + while(!signalled()) { + status = nfs4_proc_setlk(state, cmd, request); + if ((status != -EAGAIN) || IS_SETLK(cmd)) + break; + freezable_schedule_timeout_interruptible(timeout); + timeout *= 2; + timeout = min_t(unsigned long, NFS4_LOCK_MAXTIMEOUT, timeout); + status = -ERESTARTSYS; + } + return status; +} + +#ifdef CONFIG_NFS_V4_1 +struct nfs4_lock_waiter { + struct task_struct *task; + struct inode *inode; + struct nfs_lowner *owner; + bool notified; +}; + +static int +nfs4_wake_lock_waiter(wait_queue_t *wait, unsigned int mode, int flags, void *key) +{ + int ret; + struct cb_notify_lock_args *cbnl = key; + struct nfs4_lock_waiter *waiter = wait->private; + struct nfs_lowner *lowner = &cbnl->cbnl_owner, + *wowner = waiter->owner; + + /* Only wake if the callback was for the same owner */ + if (lowner->clientid != wowner->clientid || + lowner->id != wowner->id || + lowner->s_dev != wowner->s_dev) + return 0; + + /* Make sure it's for the right inode */ + if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh)) + return 0; + + waiter->notified = true; + + /* override "private" so we can use default_wake_function */ + wait->private = waiter->task; + ret = autoremove_wake_function(wait, mode, flags, key); + wait->private = waiter; + return ret; +} + +static int +nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) +{ + int status = -ERESTARTSYS; + unsigned long flags; + struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner; + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs_client *clp = server->nfs_client; + wait_queue_head_t *q = &clp->cl_lock_waitq; + struct nfs_lowner owner = { .clientid = clp->cl_clientid, + .id = lsp->ls_seqid.owner_id, + .s_dev = server->s_dev }; + struct nfs4_lock_waiter waiter = { .task = current, + .inode = state->inode, + .owner = &owner, + .notified = false }; + wait_queue_t wait; + + /* Don't bother with waitqueue if we don't expect a callback */ + if (!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags)) + return nfs4_retry_setlk_simple(state, cmd, request); + + init_wait(&wait); + wait.private = &waiter; + wait.func = nfs4_wake_lock_waiter; + add_wait_queue(q, &wait); + + while(!signalled()) { + status = nfs4_proc_setlk(state, cmd, request); + if ((status != -EAGAIN) || IS_SETLK(cmd)) + break; + + status = -ERESTARTSYS; + spin_lock_irqsave(&q->lock, flags); + if (waiter.notified) { + spin_unlock_irqrestore(&q->lock, flags); + continue; + } + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&q->lock, flags); + + freezable_schedule_timeout_interruptible(NFS4_LOCK_MAXTIMEOUT); + } + + finish_wait(q, &wait); + return status; +} +#else /* !CONFIG_NFS_V4_1 */ +static inline int +nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) +{ + return nfs4_retry_setlk_simple(state, cmd, request); +} +#endif + static int nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) { struct nfs_open_context *ctx; struct nfs4_state *state; - unsigned long timeout = NFS4_LOCK_MINTIMEOUT; int status; /* verify open state */ @@ -6220,6 +6456,11 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) if (state == NULL) return -ENOLCK; + + if ((request->fl_flags & FL_POSIX) && + !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags)) + return -ENOLCK; + /* * Don't rely on the VFS having checked the file open mode, * since it won't do this for flock() locks. @@ -6234,16 +6475,11 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) return -EBADF; } - do { - status = nfs4_proc_setlk(state, cmd, request); - if ((status != -EAGAIN) || IS_SETLK(cmd)) - break; - timeout = nfs4_set_lock_task_retry(timeout); - status = -ERESTARTSYS; - if (signalled()) - break; - } while(status < 0); - return status; + status = nfs4_set_lock_state(state, request); + if (status != 0) + return status; + + return nfs4_retry_setlk(state, cmd, request); } int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid) @@ -7104,75 +7340,161 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp, return 0; } +struct nfs41_exchange_id_data { + struct nfs41_exchange_id_res res; + struct nfs41_exchange_id_args args; + struct rpc_xprt *xprt; + int rpc_status; +}; + +static void nfs4_exchange_id_done(struct rpc_task *task, void *data) +{ + struct nfs41_exchange_id_data *cdata = + (struct nfs41_exchange_id_data *)data; + struct nfs_client *clp = cdata->args.client; + int status = task->tk_status; + + trace_nfs4_exchange_id(clp, status); + + if (status == 0) + status = nfs4_check_cl_exchange_flags(cdata->res.flags); + + if (cdata->xprt && status == 0) { + status = nfs4_detect_session_trunking(clp, &cdata->res, + cdata->xprt); + goto out; + } + + if (status == 0) + status = nfs4_sp4_select_mode(clp, &cdata->res.state_protect); + + if (status == 0) { + clp->cl_clientid = cdata->res.clientid; + clp->cl_exchange_flags = cdata->res.flags; + /* Client ID is not confirmed */ + if (!(cdata->res.flags & EXCHGID4_FLAG_CONFIRMED_R)) { + clear_bit(NFS4_SESSION_ESTABLISHED, + &clp->cl_session->session_state); + clp->cl_seqid = cdata->res.seqid; + } + + kfree(clp->cl_serverowner); + clp->cl_serverowner = cdata->res.server_owner; + cdata->res.server_owner = NULL; + + /* use the most recent implementation id */ + kfree(clp->cl_implid); + clp->cl_implid = cdata->res.impl_id; + cdata->res.impl_id = NULL; + + if (clp->cl_serverscope != NULL && + !nfs41_same_server_scope(clp->cl_serverscope, + cdata->res.server_scope)) { + dprintk("%s: server_scope mismatch detected\n", + __func__); + set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state); + kfree(clp->cl_serverscope); + clp->cl_serverscope = NULL; + } + + if (clp->cl_serverscope == NULL) { + clp->cl_serverscope = cdata->res.server_scope; + cdata->res.server_scope = NULL; + } + /* Save the EXCHANGE_ID verifier session trunk tests */ + memcpy(clp->cl_confirm.data, cdata->args.verifier->data, + sizeof(clp->cl_confirm.data)); + } +out: + cdata->rpc_status = status; + return; +} + +static void nfs4_exchange_id_release(void *data) +{ + struct nfs41_exchange_id_data *cdata = + (struct nfs41_exchange_id_data *)data; + + nfs_put_client(cdata->args.client); + if (cdata->xprt) { + xprt_put(cdata->xprt); + rpc_clnt_xprt_switch_put(cdata->args.client->cl_rpcclient); + } + kfree(cdata->res.impl_id); + kfree(cdata->res.server_scope); + kfree(cdata->res.server_owner); + kfree(cdata); +} + +static const struct rpc_call_ops nfs4_exchange_id_call_ops = { + .rpc_call_done = nfs4_exchange_id_done, + .rpc_release = nfs4_exchange_id_release, +}; + /* * _nfs4_proc_exchange_id() * * Wrapper for EXCHANGE_ID operation. */ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, - u32 sp4_how) + u32 sp4_how, struct rpc_xprt *xprt) { nfs4_verifier verifier; - struct nfs41_exchange_id_args args = { - .verifier = &verifier, - .client = clp, -#ifdef CONFIG_NFS_V4_1_MIGRATION - .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | - EXCHGID4_FLAG_BIND_PRINC_STATEID | - EXCHGID4_FLAG_SUPP_MOVED_MIGR, -#else - .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | - EXCHGID4_FLAG_BIND_PRINC_STATEID, -#endif - }; - struct nfs41_exchange_id_res res = { - 0 - }; - int status; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_EXCHANGE_ID], - .rpc_argp = &args, - .rpc_resp = &res, .rpc_cred = cred, }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clp->cl_rpcclient, + .callback_ops = &nfs4_exchange_id_call_ops, + .rpc_message = &msg, + .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, + }; + struct nfs41_exchange_id_data *calldata; + struct rpc_task *task; + int status = -EIO; + + if (!atomic_inc_not_zero(&clp->cl_count)) + goto out; + + status = -ENOMEM; + calldata = kzalloc(sizeof(*calldata), GFP_NOFS); + if (!calldata) + goto out; - nfs4_init_boot_verifier(clp, &verifier); + if (!xprt) + nfs4_init_boot_verifier(clp, &verifier); status = nfs4_init_uniform_client_string(clp); if (status) - goto out; + goto out_calldata; dprintk("NFS call exchange_id auth=%s, '%s'\n", clp->cl_rpcclient->cl_auth->au_ops->au_name, clp->cl_owner_id); - res.server_owner = kzalloc(sizeof(struct nfs41_server_owner), - GFP_NOFS); - if (unlikely(res.server_owner == NULL)) { - status = -ENOMEM; - goto out; - } + calldata->res.server_owner = kzalloc(sizeof(struct nfs41_server_owner), + GFP_NOFS); + status = -ENOMEM; + if (unlikely(calldata->res.server_owner == NULL)) + goto out_calldata; - res.server_scope = kzalloc(sizeof(struct nfs41_server_scope), + calldata->res.server_scope = kzalloc(sizeof(struct nfs41_server_scope), GFP_NOFS); - if (unlikely(res.server_scope == NULL)) { - status = -ENOMEM; + if (unlikely(calldata->res.server_scope == NULL)) goto out_server_owner; - } - res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_NOFS); - if (unlikely(res.impl_id == NULL)) { - status = -ENOMEM; + calldata->res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_NOFS); + if (unlikely(calldata->res.impl_id == NULL)) goto out_server_scope; - } switch (sp4_how) { case SP4_NONE: - args.state_protect.how = SP4_NONE; + calldata->args.state_protect.how = SP4_NONE; break; case SP4_MACH_CRED: - args.state_protect = nfs4_sp4_mach_cred_request; + calldata->args.state_protect = nfs4_sp4_mach_cred_request; break; default: @@ -7181,56 +7503,42 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, status = -EINVAL; goto out_impl_id; } + if (xprt) { + calldata->xprt = xprt; + task_setup_data.rpc_xprt = xprt; + task_setup_data.flags = + RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC; + calldata->args.verifier = &clp->cl_confirm; + } else { + calldata->args.verifier = &verifier; + } + calldata->args.client = clp; +#ifdef CONFIG_NFS_V4_1_MIGRATION + calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | + EXCHGID4_FLAG_BIND_PRINC_STATEID | + EXCHGID4_FLAG_SUPP_MOVED_MIGR, +#else + calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | + EXCHGID4_FLAG_BIND_PRINC_STATEID, +#endif + msg.rpc_argp = &calldata->args; + msg.rpc_resp = &calldata->res; + task_setup_data.callback_data = calldata; - status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); - trace_nfs4_exchange_id(clp, status); - if (status == 0) - status = nfs4_check_cl_exchange_flags(res.flags); - - if (status == 0) - status = nfs4_sp4_select_mode(clp, &res.state_protect); - - if (status == 0) { - clp->cl_clientid = res.clientid; - clp->cl_exchange_flags = res.flags; - /* Client ID is not confirmed */ - if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R)) { - clear_bit(NFS4_SESSION_ESTABLISHED, - &clp->cl_session->session_state); - clp->cl_seqid = res.seqid; - } - - kfree(clp->cl_serverowner); - clp->cl_serverowner = res.server_owner; - res.server_owner = NULL; - - /* use the most recent implementation id */ - kfree(clp->cl_implid); - clp->cl_implid = res.impl_id; - res.impl_id = NULL; - - if (clp->cl_serverscope != NULL && - !nfs41_same_server_scope(clp->cl_serverscope, - res.server_scope)) { - dprintk("%s: server_scope mismatch detected\n", - __func__); - set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state); - kfree(clp->cl_serverscope); - clp->cl_serverscope = NULL; - } - - if (clp->cl_serverscope == NULL) { - clp->cl_serverscope = res.server_scope; - res.server_scope = NULL; - } + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) { + status = PTR_ERR(task); + goto out_impl_id; } -out_impl_id: - kfree(res.impl_id); -out_server_scope: - kfree(res.server_scope); -out_server_owner: - kfree(res.server_owner); + if (!xprt) { + status = rpc_wait_for_completion_task(task); + if (!status) + status = calldata->rpc_status; + } else /* session trunking test */ + status = calldata->rpc_status; + + rpc_put_task(task); out: if (clp->cl_implid != NULL) dprintk("NFS reply exchange_id: Server Implementation ID: " @@ -7240,6 +7548,16 @@ out: clp->cl_implid->date.nseconds); dprintk("NFS reply exchange_id: %d\n", status); return status; + +out_impl_id: + kfree(calldata->res.impl_id); +out_server_scope: + kfree(calldata->res.server_scope); +out_server_owner: + kfree(calldata->res.server_owner); +out_calldata: + kfree(calldata); + goto out; } /* @@ -7262,14 +7580,45 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) /* try SP4_MACH_CRED if krb5i/p */ if (authflavor == RPC_AUTH_GSS_KRB5I || authflavor == RPC_AUTH_GSS_KRB5P) { - status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED); + status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED, NULL); if (!status) return 0; } /* try SP4_NONE */ - return _nfs4_proc_exchange_id(clp, cred, SP4_NONE); + return _nfs4_proc_exchange_id(clp, cred, SP4_NONE, NULL); +} + +/** + * nfs4_test_session_trunk + * + * This is an add_xprt_test() test function called from + * rpc_clnt_setup_test_and_add_xprt. + * + * The rpc_xprt_switch is referrenced by rpc_clnt_setup_test_and_add_xprt + * and is dereferrenced in nfs4_exchange_id_release + * + * Upon success, add the new transport to the rpc_clnt + * + * @clnt: struct rpc_clnt to get new transport + * @xprt: the rpc_xprt to test + * @data: call data for _nfs4_proc_exchange_id. + */ +int nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt, + void *data) +{ + struct nfs4_add_xprt_data *adata = (struct nfs4_add_xprt_data *)data; + u32 sp4_how; + + dprintk("--> %s try %s\n", __func__, + xprt->address_strings[RPC_DISPLAY_ADDR]); + + sp4_how = (adata->clp->cl_sp4_flags == 0 ? SP4_NONE : SP4_MACH_CRED); + + /* Test connection for session trunking. Async exchange_id call */ + return _nfs4_proc_exchange_id(adata->clp, adata->cred, sp4_how, xprt); } +EXPORT_SYMBOL_GPL(nfs4_test_session_trunk); static int _nfs4_proc_destroy_clientid(struct nfs_client *clp, struct rpc_cred *cred) @@ -7463,7 +7812,7 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args, args->bc_attrs.max_resp_sz = max_bc_payload; args->bc_attrs.max_resp_sz_cached = 0; args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS; - args->bc_attrs.max_reqs = NFS41_BC_MAX_CALLBACKS; + args->bc_attrs.max_reqs = min_t(unsigned short, max_session_cb_slots, 1); dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u " "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n", @@ -7510,10 +7859,9 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args return -EINVAL; if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached) return -EINVAL; - /* These would render the backchannel useless: */ - if (rcvd->max_ops != sent->max_ops) + if (rcvd->max_ops > sent->max_ops) return -EINVAL; - if (rcvd->max_reqs != sent->max_reqs) + if (rcvd->max_reqs > sent->max_reqs) return -EINVAL; out: return 0; @@ -7982,6 +8330,8 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, case -NFS4ERR_RECALLCONFLICT: status = -ERECALLCONFLICT; break; + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: exception->timeout = 0; @@ -7993,6 +8343,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, &lgp->args.ctx->state->stateid)) { spin_unlock(&inode->i_lock); exception->state = lgp->args.ctx->state; + exception->stateid = &lgp->args.stateid; break; } @@ -8591,6 +8942,24 @@ static int _nfs41_test_stateid(struct nfs_server *server, return -res.status; } +static void nfs4_handle_delay_or_session_error(struct nfs_server *server, + int err, struct nfs4_exception *exception) +{ + exception->retry = 0; + switch(err) { + case -NFS4ERR_DELAY: + case -NFS4ERR_RETRY_UNCACHED_REP: + nfs4_handle_exception(server, err, exception); + break; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_DEADSESSION: + nfs4_do_handle_exception(server, err, exception); + } +} + /** * nfs41_test_stateid - perform a TEST_STATEID operation * @@ -8610,9 +8979,7 @@ static int nfs41_test_stateid(struct nfs_server *server, int err; do { err = _nfs41_test_stateid(server, stateid, cred); - if (err != -NFS4ERR_DELAY) - break; - nfs4_handle_exception(server, err, &exception); + nfs4_handle_delay_or_session_error(server, err, &exception); } while (exception.retry); return err; } @@ -8657,7 +9024,7 @@ static const struct rpc_call_ops nfs41_free_stateid_ops = { }; static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, - nfs4_stateid *stateid, + const nfs4_stateid *stateid, struct rpc_cred *cred, bool privileged) { @@ -8687,7 +9054,7 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, msg.rpc_argp = &data->args; msg.rpc_resp = &data->res; - nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); if (privileged) nfs4_set_sequence_privileged(&data->args.seq_args); @@ -8700,38 +9067,31 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, * @server: server / transport on which to perform the operation * @stateid: state ID to release * @cred: credential + * @is_recovery: set to true if this call needs to be privileged * - * Returns NFS_OK if the server freed "stateid". Otherwise a - * negative NFS4ERR value is returned. + * Note: this function is always asynchronous. */ static int nfs41_free_stateid(struct nfs_server *server, - nfs4_stateid *stateid, - struct rpc_cred *cred) + const nfs4_stateid *stateid, + struct rpc_cred *cred, + bool is_recovery) { struct rpc_task *task; - int ret; - task = _nfs41_free_stateid(server, stateid, cred, true); + task = _nfs41_free_stateid(server, stateid, cred, is_recovery); if (IS_ERR(task)) return PTR_ERR(task); - ret = rpc_wait_for_completion_task(task); - if (!ret) - ret = task->tk_status; rpc_put_task(task); - return ret; + return 0; } static void nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) { - struct rpc_task *task; struct rpc_cred *cred = lsp->ls_state->owner->so_cred; - task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false); + nfs41_free_stateid(server, &lsp->ls_stateid, cred, false); nfs4_free_lock_state(server, lsp); - if (IS_ERR(task)) - return; - rpc_put_task(task); } static bool nfs41_match_stateid(const nfs4_stateid *s1, @@ -8835,6 +9195,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { .match_stateid = nfs4_match_stateid, .find_root_sec = nfs4_find_root_sec, .free_lock_state = nfs4_release_lockowner, + .test_and_free_expired = nfs40_test_and_free_expired_stateid, .alloc_seqid = nfs_alloc_seqid, .call_sync_ops = &nfs40_call_sync_ops, .reboot_recovery_ops = &nfs40_reboot_recovery_ops, @@ -8862,7 +9223,9 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { .match_stateid = nfs41_match_stateid, .find_root_sec = nfs41_find_root_sec, .free_lock_state = nfs41_free_lock_state, + .test_and_free_expired = nfs41_test_and_free_expired_stateid, .alloc_seqid = nfs_alloc_no_seqid, + .session_trunk = nfs4_test_session_trunk, .call_sync_ops = &nfs41_call_sync_ops, .reboot_recovery_ops = &nfs41_reboot_recovery_ops, .nograce_recovery_ops = &nfs41_nograce_recovery_ops, @@ -8891,7 +9254,9 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { .find_root_sec = nfs41_find_root_sec, .free_lock_state = nfs41_free_lock_state, .call_sync_ops = &nfs41_call_sync_ops, + .test_and_free_expired = nfs41_test_and_free_expired_stateid, .alloc_seqid = nfs_alloc_no_seqid, + .session_trunk = nfs4_test_session_trunk, .reboot_recovery_ops = &nfs41_reboot_recovery_ops, .nograce_recovery_ops = &nfs41_nograce_recovery_ops, .state_renewal_ops = &nfs41_state_renewal_ops, @@ -8941,20 +9306,14 @@ static const struct inode_operations nfs4_dir_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, .setattr = nfs_setattr, - .getxattr = generic_getxattr, - .setxattr = generic_setxattr, .listxattr = nfs4_listxattr, - .removexattr = generic_removexattr, }; static const struct inode_operations nfs4_file_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, .setattr = nfs_setattr, - .getxattr = generic_getxattr, - .setxattr = generic_setxattr, .listxattr = nfs4_listxattr, - .removexattr = generic_removexattr, }; const struct nfs_rpc_ops nfs_v4_clientops = { diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index f703b75..dae3855 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -9,6 +9,7 @@ /* maximum number of slots to use */ #define NFS4_DEF_SLOT_TABLE_SIZE (64U) +#define NFS4_DEF_CB_SLOT_TABLE_SIZE (1U) #define NFS4_MAX_SLOT_TABLE (1024U) #define NFS4_NO_SLOT ((u32)-1) @@ -22,6 +23,7 @@ struct nfs4_slot { u32 slot_nr; u32 seq_nr; unsigned int interrupted : 1, + privileged : 1, seq_done : 1; }; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index cada00a..5f4281e 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -991,6 +991,8 @@ int nfs4_select_rw_stateid(struct nfs4_state *state, { int ret; + if (!nfs4_valid_open_stateid(state)) + return -EIO; if (cred != NULL) *cred = NULL; ret = nfs4_copy_lock_stateid(dst, state, lockowner); @@ -1303,6 +1305,8 @@ void nfs4_schedule_path_down_recovery(struct nfs_client *clp) static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) { + if (!nfs4_valid_open_stateid(state)) + return 0; set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); /* Don't recover state that expired before the reboot */ if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) { @@ -1316,6 +1320,8 @@ static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_st int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) { + if (!nfs4_valid_open_stateid(state)) + return 0; set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); set_bit(NFS_OWNER_RECLAIM_NOGRACE, &state->owner->so_flags); @@ -1327,9 +1333,8 @@ int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_ { struct nfs_client *clp = server->nfs_client; - if (!nfs4_valid_open_stateid(state)) + if (!nfs4_state_mark_reclaim_nograce(clp, state)) return -EBADF; - nfs4_state_mark_reclaim_nograce(clp, state); dprintk("%s: scheduling stateid recovery for server %s\n", __func__, clp->cl_hostname); nfs4_schedule_state_manager(clp); @@ -1337,6 +1342,35 @@ int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_ } EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery); +static struct nfs4_lock_state * +nfs_state_find_lock_state_by_stateid(struct nfs4_state *state, + const nfs4_stateid *stateid) +{ + struct nfs4_lock_state *pos; + + list_for_each_entry(pos, &state->lock_states, ls_locks) { + if (!test_bit(NFS_LOCK_INITIALIZED, &pos->ls_flags)) + continue; + if (nfs4_stateid_match_other(&pos->ls_stateid, stateid)) + return pos; + } + return NULL; +} + +static bool nfs_state_lock_state_matches_stateid(struct nfs4_state *state, + const nfs4_stateid *stateid) +{ + bool found = false; + + if (test_bit(LK_STATE_IN_USE, &state->flags)) { + spin_lock(&state->state_lock); + if (nfs_state_find_lock_state_by_stateid(state, stateid)) + found = true; + spin_unlock(&state->state_lock); + } + return found; +} + void nfs_inode_find_state_and_recover(struct inode *inode, const nfs4_stateid *stateid) { @@ -1351,14 +1385,18 @@ void nfs_inode_find_state_and_recover(struct inode *inode, state = ctx->state; if (state == NULL) continue; - if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) + if (nfs4_stateid_match_other(&state->stateid, stateid) && + nfs4_state_mark_reclaim_nograce(clp, state)) { + found = true; continue; - if (!nfs4_stateid_match(&state->stateid, stateid)) - continue; - nfs4_state_mark_reclaim_nograce(clp, state); - found = true; + } + if (nfs_state_lock_state_matches_stateid(state, stateid) && + nfs4_state_mark_reclaim_nograce(clp, state)) + found = true; } spin_unlock(&inode->i_lock); + + nfs_inode_find_delegation_state_and_recover(inode, stateid); if (found) nfs4_schedule_state_manager(clp); } @@ -1498,6 +1536,9 @@ restart: __func__, status); case -ENOENT: case -ENOMEM: + case -EACCES: + case -EROFS: + case -EIO: case -ESTALE: /* Open state on this file cannot be recovered */ nfs4_state_mark_recovery_failed(state, status); @@ -1656,15 +1697,9 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) put_rpccred(cred); } -static void nfs_delegation_clear_all(struct nfs_client *clp) -{ - nfs_delegation_mark_reclaim(clp); - nfs_delegation_reap_unclaimed(clp); -} - static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp) { - nfs_delegation_clear_all(clp); + nfs_mark_test_expired_all_delegations(clp); nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); } @@ -2195,7 +2230,7 @@ static void nfs41_handle_all_state_revoked(struct nfs_client *clp) static void nfs41_handle_some_state_revoked(struct nfs_client *clp) { - nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); + nfs4_state_start_reclaim_nograce(clp); nfs4_schedule_state_manager(clp); dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname); @@ -2227,13 +2262,22 @@ static void nfs41_handle_cb_path_down(struct nfs_client *clp) nfs4_schedule_state_manager(clp); } -void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) +void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags, + bool recovery) { if (!flags) return; dprintk("%s: \"%s\" (client ID %llx) flags=0x%08x\n", __func__, clp->cl_hostname, clp->cl_clientid, flags); + /* + * If we're called from the state manager thread, then assume we're + * already handling the RECLAIM_NEEDED and/or STATE_REVOKED. + * Those flags are expected to remain set until we're done + * recovering (see RFC5661, section 18.46.3). + */ + if (recovery) + goto out_recovery; if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) nfs41_handle_server_reboot(clp); @@ -2246,6 +2290,7 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) nfs4_schedule_lease_moved_recovery(clp); if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) nfs41_handle_recallable_state_revoked(clp); +out_recovery: if (flags & SEQ4_STATUS_BACKCHANNEL_FAULT) nfs41_handle_backchannel_fault(clp); else if (flags & (SEQ4_STATUS_CB_PATH_DOWN | @@ -2410,6 +2455,13 @@ static void nfs4_state_manager(struct nfs_client *clp) nfs4_state_end_reclaim_reboot(clp); } + /* Detect expired delegations... */ + if (test_and_clear_bit(NFS4CLNT_DELEGATION_EXPIRED, &clp->cl_state)) { + section = "detect expired delegations"; + nfs_reap_expired_delegations(clp); + continue; + } + /* Now recover expired state... */ if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { section = "reclaim nograce"; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 7bd3a5c..fc89e5e 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1850,7 +1850,7 @@ static void encode_create_session(struct xdr_stream *xdr, *p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */ /* authsys_parms rfc1831 */ - *p++ = cpu_to_be32(nn->boot_time.tv_nsec); /* stamp */ + *p++ = cpu_to_be32(ktime_to_ns(nn->boot_time)); /* stamp */ p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen); *p++ = cpu_to_be32(0); /* UID */ *p++ = cpu_to_be32(0); /* GID */ @@ -4725,34 +4725,37 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, } /* - * Decode potentially multiple layout types. Currently we only support - * one layout driver per file system. + * Decode potentially multiple layout types. */ -static int decode_first_pnfs_layout_type(struct xdr_stream *xdr, - uint32_t *layouttype) +static int decode_pnfs_layout_types(struct xdr_stream *xdr, + struct nfs_fsinfo *fsinfo) { __be32 *p; - int num; + uint32_t i; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - num = be32_to_cpup(p); + fsinfo->nlayouttypes = be32_to_cpup(p); /* pNFS is not supported by the underlying file system */ - if (num == 0) { - *layouttype = 0; + if (fsinfo->nlayouttypes == 0) return 0; - } - if (num > 1) - printk(KERN_INFO "NFS: %s: Warning: Multiple pNFS layout " - "drivers per filesystem not supported\n", __func__); /* Decode and set first layout type, move xdr->p past unused types */ - p = xdr_inline_decode(xdr, num * 4); + p = xdr_inline_decode(xdr, fsinfo->nlayouttypes * 4); if (unlikely(!p)) goto out_overflow; - *layouttype = be32_to_cpup(p); + + /* If we get too many, then just cap it at the max */ + if (fsinfo->nlayouttypes > NFS_MAX_LAYOUT_TYPES) { + printk(KERN_INFO "NFS: %s: Warning: Too many (%u) pNFS layout types\n", + __func__, fsinfo->nlayouttypes); + fsinfo->nlayouttypes = NFS_MAX_LAYOUT_TYPES; + } + + for(i = 0; i < fsinfo->nlayouttypes; ++i) + fsinfo->layouttype[i] = be32_to_cpup(p++); return 0; out_overflow: print_overflow_msg(__func__, xdr); @@ -4764,7 +4767,7 @@ out_overflow: * Note we must ensure that layouttype is set in any non-error case. */ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, - uint32_t *layouttype) + struct nfs_fsinfo *fsinfo) { int status = 0; @@ -4772,10 +4775,9 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U))) return -EIO; if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) { - status = decode_first_pnfs_layout_type(xdr, layouttype); + status = decode_pnfs_layout_types(xdr, fsinfo); bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES; - } else - *layouttype = 0; + } return status; } @@ -4856,7 +4858,7 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta); if (status != 0) goto xdr_error; - status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); + status = decode_attr_pnfstype(xdr, bitmap, fsinfo); if (status != 0) goto xdr_error; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 174dd4c..965db47 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -342,7 +342,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page, * update_nfs_request below if the region is not locked. */ req->wb_page = page; if (page) { - req->wb_index = page_file_index(page); + req->wb_index = page_index(page); get_page(page); } req->wb_offset = offset; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 2c93a85..56b2d96 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -30,6 +30,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_page.h> #include <linux/module.h> +#include <linux/sort.h> #include "internal.h" #include "pnfs.h" #include "iostat.h" @@ -99,35 +100,79 @@ unset_pnfs_layoutdriver(struct nfs_server *nfss) } /* + * When the server sends a list of layout types, we choose one in the order + * given in the list below. + * + * FIXME: should this list be configurable in some fashion? module param? + * mount option? something else? + */ +static const u32 ld_prefs[] = { + LAYOUT_SCSI, + LAYOUT_BLOCK_VOLUME, + LAYOUT_OSD2_OBJECTS, + LAYOUT_FLEX_FILES, + LAYOUT_NFSV4_1_FILES, + 0 +}; + +static int +ld_cmp(const void *e1, const void *e2) +{ + u32 ld1 = *((u32 *)e1); + u32 ld2 = *((u32 *)e2); + int i; + + for (i = 0; ld_prefs[i] != 0; i++) { + if (ld1 == ld_prefs[i]) + return -1; + + if (ld2 == ld_prefs[i]) + return 1; + } + return 0; +} + +/* * Try to set the server's pnfs module to the pnfs layout type specified by id. * Currently only one pNFS layout driver per filesystem is supported. * - * @id layout type. Zero (illegal layout type) indicates pNFS not in use. + * @ids array of layout types supported by MDS. */ void set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, - u32 id) + struct nfs_fsinfo *fsinfo) { struct pnfs_layoutdriver_type *ld_type = NULL; + u32 id; + int i; - if (id == 0) - goto out_no_driver; if (!(server->nfs_client->cl_exchange_flags & (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) { - printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n", - __func__, id, server->nfs_client->cl_exchange_flags); + printk(KERN_ERR "NFS: %s: cl_exchange_flags 0x%x\n", + __func__, server->nfs_client->cl_exchange_flags); goto out_no_driver; } - ld_type = find_pnfs_driver(id); - if (!ld_type) { - request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id); + + sort(fsinfo->layouttype, fsinfo->nlayouttypes, + sizeof(*fsinfo->layouttype), ld_cmp, NULL); + + for (i = 0; i < fsinfo->nlayouttypes; i++) { + id = fsinfo->layouttype[i]; ld_type = find_pnfs_driver(id); if (!ld_type) { - dprintk("%s: No pNFS module found for %u.\n", - __func__, id); - goto out_no_driver; + request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, + id); + ld_type = find_pnfs_driver(id); } + if (ld_type) + break; + } + + if (!ld_type) { + dprintk("%s: No pNFS module found!\n", __func__); + goto out_no_driver; } + server->pnfs_curr_ld = ld_type; if (ld_type->set_layoutdriver && ld_type->set_layoutdriver(server, mntfh)) { @@ -2185,10 +2230,8 @@ static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr) */ void pnfs_ld_read_done(struct nfs_pgio_header *hdr) { - if (likely(!hdr->pnfs_error)) { - __nfs4_read_done_cb(hdr); + if (likely(!hdr->pnfs_error)) hdr->mds_ops->rpc_call_done(&hdr->task, hdr); - } trace_nfs4_pnfs_read(hdr, hdr->pnfs_error); if (unlikely(hdr->pnfs_error)) pnfs_ld_handle_read_error(hdr); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 31d99b2..5c29551 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -236,7 +236,7 @@ void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_put_lseg(struct pnfs_layout_segment *lseg); void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg); -void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32); +void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *); void unset_pnfs_layoutdriver(struct nfs_server *); void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); @@ -657,7 +657,8 @@ pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task) } static inline void set_pnfs_layoutdriver(struct nfs_server *s, - const struct nfs_fh *mntfh, u32 id) + const struct nfs_fh *mntfh, + struct nfs_fsinfo *fsinfo) { } diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index f3468b5..53b4705 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -690,13 +690,50 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, dprintk("%s: DS %s: trying address %s\n", __func__, ds->ds_remotestr, da->da_remotestr); - clp = nfs4_set_ds_client(mds_srv, - (struct sockaddr *)&da->da_addr, - da->da_addrlen, IPPROTO_TCP, - timeo, retrans, minor_version, - au_flavor); - if (!IS_ERR(clp)) - break; + if (!IS_ERR(clp) && clp->cl_mvops->session_trunk) { + struct xprt_create xprt_args = { + .ident = XPRT_TRANSPORT_TCP, + .net = clp->cl_net, + .dstaddr = (struct sockaddr *)&da->da_addr, + .addrlen = da->da_addrlen, + .servername = clp->cl_hostname, + }; + struct nfs4_add_xprt_data xprtdata = { + .clp = clp, + .cred = nfs4_get_clid_cred(clp), + }; + struct rpc_add_xprt_test rpcdata = { + .add_xprt_test = clp->cl_mvops->session_trunk, + .data = &xprtdata, + }; + + /** + * Test this address for session trunking and + * add as an alias + */ + rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, + rpc_clnt_setup_test_and_add_xprt, + &rpcdata); + if (xprtdata.cred) + put_rpccred(xprtdata.cred); + } else { + clp = nfs4_set_ds_client(mds_srv, + (struct sockaddr *)&da->da_addr, + da->da_addrlen, IPPROTO_TCP, + timeo, retrans, minor_version, + au_flavor); + if (IS_ERR(clp)) + continue; + + status = nfs4_init_ds_session(clp, + mds_srv->nfs_client->cl_lease_time); + if (status) { + nfs_put_client(clp); + clp = ERR_PTR(-EIO); + continue; + } + + } } if (IS_ERR(clp)) { @@ -704,18 +741,11 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, goto out; } - status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time); - if (status) - goto out_put; - smp_wmb(); ds->ds_clp = clp; dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr); out: return status; -out_put: - nfs_put_client(clp); - goto out; } /* diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 572e5b3..defc923 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -295,7 +295,7 @@ int nfs_readpage(struct file *file, struct page *page) int error; dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", - page, PAGE_SIZE, page_file_index(page)); + page, PAGE_SIZE, page_index(page)); nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); nfs_add_stats(inode, NFSIOS_READPAGES, 1); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index d396013..001796b 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2848,19 +2848,23 @@ out_invalid_transport_udp: * NFS client for backwards compatibility */ unsigned int nfs_callback_set_tcpport; +unsigned short nfs_callback_nr_threads; /* Default cache timeout is 10 minutes */ unsigned int nfs_idmap_cache_timeout = 600; /* Turn off NFSv4 uid/gid mapping when using AUTH_SYS */ bool nfs4_disable_idmapping = true; unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE; +unsigned short max_session_cb_slots = NFS4_DEF_CB_SLOT_TABLE_SIZE; unsigned short send_implementation_id = 1; char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = ""; bool recover_lost_locks = false; +EXPORT_SYMBOL_GPL(nfs_callback_nr_threads); EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport); EXPORT_SYMBOL_GPL(nfs_idmap_cache_timeout); EXPORT_SYMBOL_GPL(nfs4_disable_idmapping); EXPORT_SYMBOL_GPL(max_session_slots); +EXPORT_SYMBOL_GPL(max_session_cb_slots); EXPORT_SYMBOL_GPL(send_implementation_id); EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier); EXPORT_SYMBOL_GPL(recover_lost_locks); @@ -2887,6 +2891,9 @@ static const struct kernel_param_ops param_ops_portnr = { #define param_check_portnr(name, p) __param_check(name, p, unsigned int); module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); +module_param_named(callback_nr_threads, nfs_callback_nr_threads, ushort, 0644); +MODULE_PARM_DESC(callback_nr_threads, "Number of threads that will be " + "assigned to the NFSv4 callback channels."); module_param(nfs_idmap_cache_timeout, int, 0644); module_param(nfs4_disable_idmapping, bool, 0644); module_param_string(nfs4_unique_id, nfs4_client_id_uniquifier, @@ -2896,6 +2903,9 @@ MODULE_PARM_DESC(nfs4_disable_idmapping, module_param(max_session_slots, ushort, 0644); MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 " "requests the client will negotiate"); +module_param(max_session_cb_slots, ushort, 0644); +MODULE_PARM_DESC(max_session_slots, "Maximum number of parallel NFSv4.1 " + "callbacks the client will process for a given server"); module_param(send_implementation_id, ushort, 0644); MODULE_PARM_DESC(send_implementation_id, "Send implementation ID with NFSv4.1 exchange_id"); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 3a6724c..5321183 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -151,7 +151,7 @@ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int c spin_lock(&inode->i_lock); i_size = i_size_read(inode); end_index = (i_size - 1) >> PAGE_SHIFT; - if (i_size > 0 && page_file_index(page) < end_index) + if (i_size > 0 && page_index(page) < end_index) goto out; end = page_file_offset(page) + ((loff_t)offset+count); if (i_size >= end) @@ -603,7 +603,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, { int ret; - nfs_pageio_cond_complete(pgio, page_file_index(page)); + nfs_pageio_cond_complete(pgio, page_index(page)); ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE, launder); if (ret == -EAGAIN) { diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 9d46a0b..62469c6 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -55,10 +55,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) goto oom; for (i = 0; i < rqgi->ngroups; i++) { - if (gid_eq(GLOBAL_ROOT_GID, GROUP_AT(rqgi, i))) - GROUP_AT(gi, i) = exp->ex_anon_gid; + if (gid_eq(GLOBAL_ROOT_GID, rqgi->gid[i])) + gi->gid[i] = exp->ex_anon_gid; else - GROUP_AT(gi, i) = GROUP_AT(rqgi, i); + gi->gid[i] = rqgi->gid[i]; } } else { gi = get_group_info(rqgi); diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 5a17084..0780ff8 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -123,7 +123,7 @@ nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp, if (lcp->lc_mtime.tv_nsec == UTIME_NOW || timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0) - lcp->lc_mtime = current_fs_time(inode->i_sb); + lcp->lc_mtime = current_time(inode); iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME; iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime; diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c index df880e9..b672873 100644 --- a/fs/nfsd/flexfilelayout.c +++ b/fs/nfsd/flexfilelayout.c @@ -126,6 +126,7 @@ nfsd4_ff_proc_getdeviceinfo(struct super_block *sb, struct svc_rqst *rqstp, const struct nfsd4_layout_ops ff_layout_ops = { .notify_types = NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE, + .disable_recalls = true, .proc_getdeviceinfo = nfsd4_ff_proc_getdeviceinfo, .encode_getdeviceinfo = nfsd4_ff_encode_getdeviceinfo, .proc_layoutget = nfsd4_ff_proc_layoutget, diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 5fbf3bb..b10d557 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -84,6 +84,7 @@ struct nfsd_net { struct list_head client_lru; struct list_head close_lru; struct list_head del_recall_lru; + struct list_head blocked_locks_lru; struct delayed_work laundromat_work; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 04c68d9..211dc2a 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -448,7 +448,7 @@ static int decode_cb_sequence4res(struct xdr_stream *xdr, { int status; - if (cb->cb_minorversion == 0) + if (cb->cb_clp->cl_minorversion == 0) return 0; status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &cb->cb_seq_status); @@ -485,7 +485,7 @@ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, const struct nfs4_delegation *dp = cb_to_delegation(cb); struct nfs4_cb_compound_hdr hdr = { .ident = cb->cb_clp->cl_cb_ident, - .minorversion = cb->cb_minorversion, + .minorversion = cb->cb_clp->cl_minorversion, }; encode_cb_compound4args(xdr, &hdr); @@ -594,7 +594,7 @@ static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req, container_of(cb, struct nfs4_layout_stateid, ls_recall); struct nfs4_cb_compound_hdr hdr = { .ident = 0, - .minorversion = cb->cb_minorversion, + .minorversion = cb->cb_clp->cl_minorversion, }; encode_cb_compound4args(xdr, &hdr); @@ -623,6 +623,62 @@ static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, } #endif /* CONFIG_NFSD_PNFS */ +static void encode_stateowner(struct xdr_stream *xdr, struct nfs4_stateowner *so) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 8 + 4 + so->so_owner.len); + p = xdr_encode_opaque_fixed(p, &so->so_client->cl_clientid, 8); + xdr_encode_opaque(p, so->so_owner.data, so->so_owner.len); +} + +static void nfs4_xdr_enc_cb_notify_lock(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfsd4_callback *cb) +{ + const struct nfsd4_blocked_lock *nbl = + container_of(cb, struct nfsd4_blocked_lock, nbl_cb); + struct nfs4_lockowner *lo = (struct nfs4_lockowner *)nbl->nbl_lock.fl_owner; + struct nfs4_cb_compound_hdr hdr = { + .ident = 0, + .minorversion = cb->cb_clp->cl_minorversion, + }; + + __be32 *p; + + BUG_ON(hdr.minorversion == 0); + + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(OP_CB_NOTIFY_LOCK); + encode_nfs_fh4(xdr, &nbl->nbl_fh); + encode_stateowner(xdr, &lo->lo_owner); + hdr.nops++; + + encode_cb_nops(&hdr); +} + +static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfsd4_callback *cb) +{ + struct nfs4_cb_compound_hdr hdr; + int status; + + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) + return status; + + if (cb) { + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + } + return decode_cb_op_status(xdr, OP_CB_NOTIFY_LOCK, &cb->cb_status); +} + /* * RPC procedure tables */ @@ -643,6 +699,7 @@ static struct rpc_procinfo nfs4_cb_procedures[] = { #ifdef CONFIG_NFSD_PNFS PROC(CB_LAYOUT, COMPOUND, cb_layout, cb_layout), #endif + PROC(CB_NOTIFY_LOCK, COMPOUND, cb_notify_lock, cb_notify_lock), }; static struct rpc_version nfs_cb_version4 = { @@ -862,7 +919,6 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) struct nfs4_client *clp = cb->cb_clp; u32 minorversion = clp->cl_minorversion; - cb->cb_minorversion = minorversion; /* * cb_seq_status is only set in decode_cb_sequence4res, * and so will remain 1 if an rpc level failure occurs. diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 2be9602..42aace4 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -174,7 +174,8 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid) list_del_init(&ls->ls_perfile); spin_unlock(&fp->fi_lock); - vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls); + if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) + vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls); fput(ls->ls_file); if (ls->ls_recalled) @@ -189,6 +190,9 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) struct file_lock *fl; int status; + if (nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) + return 0; + fl = locks_alloc_lock(); if (!fl) return -ENOMEM; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 1fb2227..abb09b5 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1010,47 +1010,97 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } static __be32 -nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, - struct nfsd4_clone *clone) +nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + stateid_t *src_stateid, struct file **src, + stateid_t *dst_stateid, struct file **dst) { - struct file *src, *dst; __be32 status; status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh, - &clone->cl_src_stateid, RD_STATE, - &src, NULL); + src_stateid, RD_STATE, src, NULL); if (status) { dprintk("NFSD: %s: couldn't process src stateid!\n", __func__); goto out; } status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, - &clone->cl_dst_stateid, WR_STATE, - &dst, NULL); + dst_stateid, WR_STATE, dst, NULL); if (status) { dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__); goto out_put_src; } /* fix up for NFS-specific error code */ - if (!S_ISREG(file_inode(src)->i_mode) || - !S_ISREG(file_inode(dst)->i_mode)) { + if (!S_ISREG(file_inode(*src)->i_mode) || + !S_ISREG(file_inode(*dst)->i_mode)) { status = nfserr_wrong_type; goto out_put_dst; } +out: + return status; +out_put_dst: + fput(*dst); +out_put_src: + fput(*src); + goto out; +} + +static __be32 +nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_clone *clone) +{ + struct file *src, *dst; + __be32 status; + + status = nfsd4_verify_copy(rqstp, cstate, &clone->cl_src_stateid, &src, + &clone->cl_dst_stateid, &dst); + if (status) + goto out; + status = nfsd4_clone_file_range(src, clone->cl_src_pos, dst, clone->cl_dst_pos, clone->cl_count); -out_put_dst: fput(dst); -out_put_src: fput(src); out: return status; } static __be32 +nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_copy *copy) +{ + struct file *src, *dst; + __be32 status; + ssize_t bytes; + + status = nfsd4_verify_copy(rqstp, cstate, ©->cp_src_stateid, &src, + ©->cp_dst_stateid, &dst); + if (status) + goto out; + + bytes = nfsd_copy_file_range(src, copy->cp_src_pos, + dst, copy->cp_dst_pos, copy->cp_count); + + if (bytes < 0) + status = nfserrno(bytes); + else { + copy->cp_res.wr_bytes_written = bytes; + copy->cp_res.wr_stable_how = NFS_UNSTABLE; + copy->cp_consecutive = 1; + copy->cp_synchronous = 1; + gen_boot_verifier(©->cp_res.wr_verifier, SVC_NET(rqstp)); + status = nfs_ok; + } + + fput(src); + fput(dst); +out: + return status; +} + +static __be32 nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_fallocate *fallocate, int flags) { @@ -1966,6 +2016,18 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd op_encode_channel_attrs_maxsz) * sizeof(__be32); } +static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 1 /* wr_callback */ + + op_encode_stateid_maxsz /* wr_callback */ + + 2 /* wr_count */ + + 1 /* wr_committed */ + + op_encode_verifier_maxsz + + 1 /* cr_consecutive */ + + 1 /* cr_synchronous */) * sizeof(__be32); +} + #ifdef CONFIG_NFSD_PNFS /* * At this stage we don't really know what layout driver will handle the request, @@ -2328,6 +2390,12 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_name = "OP_CLONE", .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, + [OP_COPY] = { + .op_func = (nfsd4op_func)nfsd4_copy, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_COPY", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_copy_rsize, + }, [OP_SEEK] = { .op_func = (nfsd4op_func)nfsd4_seek, .op_name = "OP_SEEK", diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a204d7e..9752beb 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -99,6 +99,7 @@ static struct kmem_cache *odstate_slab; static void free_session(struct nfsd4_session *); static const struct nfsd4_callback_ops nfsd4_cb_recall_ops; +static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops; static bool is_session_dead(struct nfsd4_session *ses) { @@ -210,6 +211,85 @@ static void nfsd4_put_session(struct nfsd4_session *ses) spin_unlock(&nn->client_lock); } +static struct nfsd4_blocked_lock * +find_blocked_lock(struct nfs4_lockowner *lo, struct knfsd_fh *fh, + struct nfsd_net *nn) +{ + struct nfsd4_blocked_lock *cur, *found = NULL; + + spin_lock(&nn->client_lock); + list_for_each_entry(cur, &lo->lo_blocked, nbl_list) { + if (fh_match(fh, &cur->nbl_fh)) { + list_del_init(&cur->nbl_list); + list_del_init(&cur->nbl_lru); + found = cur; + break; + } + } + spin_unlock(&nn->client_lock); + if (found) + posix_unblock_lock(&found->nbl_lock); + return found; +} + +static struct nfsd4_blocked_lock * +find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh, + struct nfsd_net *nn) +{ + struct nfsd4_blocked_lock *nbl; + + nbl = find_blocked_lock(lo, fh, nn); + if (!nbl) { + nbl= kmalloc(sizeof(*nbl), GFP_KERNEL); + if (nbl) { + fh_copy_shallow(&nbl->nbl_fh, fh); + locks_init_lock(&nbl->nbl_lock); + nfsd4_init_cb(&nbl->nbl_cb, lo->lo_owner.so_client, + &nfsd4_cb_notify_lock_ops, + NFSPROC4_CLNT_CB_NOTIFY_LOCK); + } + } + return nbl; +} + +static void +free_blocked_lock(struct nfsd4_blocked_lock *nbl) +{ + locks_release_private(&nbl->nbl_lock); + kfree(nbl); +} + +static int +nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task) +{ + /* + * Since this is just an optimization, we don't try very hard if it + * turns out not to succeed. We'll requeue it on NFS4ERR_DELAY, and + * just quit trying on anything else. + */ + switch (task->tk_status) { + case -NFS4ERR_DELAY: + rpc_delay(task, 1 * HZ); + return 0; + default: + return 1; + } +} + +static void +nfsd4_cb_notify_lock_release(struct nfsd4_callback *cb) +{ + struct nfsd4_blocked_lock *nbl = container_of(cb, + struct nfsd4_blocked_lock, nbl_cb); + + free_blocked_lock(nbl); +} + +static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = { + .done = nfsd4_cb_notify_lock_done, + .release = nfsd4_cb_notify_lock_release, +}; + static inline struct nfs4_stateowner * nfs4_get_stateowner(struct nfs4_stateowner *sop) { @@ -1903,7 +1983,7 @@ static bool groups_equal(struct group_info *g1, struct group_info *g2) if (g1->ngroups != g2->ngroups) return false; for (i=0; i<g1->ngroups; i++) - if (!gid_eq(GROUP_AT(g1, i), GROUP_AT(g2, i))) + if (!gid_eq(g1->gid[i], g2->gid[i])) return false; return true; } @@ -3224,9 +3304,10 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, goto out; /* cases below refer to rfc 3530 section 14.2.34: */ if (!unconf || !same_verf(&confirm, &unconf->cl_confirm)) { - if (conf && !unconf) /* case 2: probable retransmit */ + if (conf && same_verf(&confirm, &conf->cl_confirm)) { + /* case 2: probable retransmit */ status = nfs_ok; - else /* case 4: client hasn't noticed we rebooted yet? */ + } else /* case 4: client hasn't noticed we rebooted yet? */ status = nfserr_stale_clientid; goto out; } @@ -4410,9 +4491,11 @@ out: * To finish the open response, we just need to set the rflags. */ open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; - if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED) && - !nfsd4_has_session(&resp->cstate)) + if (nfsd4_has_session(&resp->cstate)) + open->op_rflags |= NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK; + else if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED)) open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; + if (dp) nfs4_put_stid(&dp->dl_stid); if (stp) @@ -4501,6 +4584,7 @@ nfs4_laundromat(struct nfsd_net *nn) struct nfs4_openowner *oo; struct nfs4_delegation *dp; struct nfs4_ol_stateid *stp; + struct nfsd4_blocked_lock *nbl; struct list_head *pos, *next, reaplist; time_t cutoff = get_seconds() - nn->nfsd4_lease; time_t t, new_timeo = nn->nfsd4_lease; @@ -4569,6 +4653,41 @@ nfs4_laundromat(struct nfsd_net *nn) } spin_unlock(&nn->client_lock); + /* + * It's possible for a client to try and acquire an already held lock + * that is being held for a long time, and then lose interest in it. + * So, we clean out any un-revisited request after a lease period + * under the assumption that the client is no longer interested. + * + * RFC5661, sec. 9.6 states that the client must not rely on getting + * notifications and must continue to poll for locks, even when the + * server supports them. Thus this shouldn't lead to clients blocking + * indefinitely once the lock does become free. + */ + BUG_ON(!list_empty(&reaplist)); + spin_lock(&nn->client_lock); + while (!list_empty(&nn->blocked_locks_lru)) { + nbl = list_first_entry(&nn->blocked_locks_lru, + struct nfsd4_blocked_lock, nbl_lru); + if (time_after((unsigned long)nbl->nbl_time, + (unsigned long)cutoff)) { + t = nbl->nbl_time - cutoff; + new_timeo = min(new_timeo, t); + break; + } + list_move(&nbl->nbl_lru, &reaplist); + list_del_init(&nbl->nbl_list); + } + spin_unlock(&nn->client_lock); + + while (!list_empty(&reaplist)) { + nbl = list_first_entry(&nn->blocked_locks_lru, + struct nfsd4_blocked_lock, nbl_lru); + list_del_init(&nbl->nbl_lru); + posix_unblock_lock(&nbl->nbl_lock); + free_blocked_lock(nbl); + } + new_timeo = max_t(time_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); return new_timeo; } @@ -5309,7 +5428,31 @@ nfsd4_fl_put_owner(fl_owner_t owner) nfs4_put_stateowner(&lo->lo_owner); } +static void +nfsd4_lm_notify(struct file_lock *fl) +{ + struct nfs4_lockowner *lo = (struct nfs4_lockowner *)fl->fl_owner; + struct net *net = lo->lo_owner.so_client->net; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd4_blocked_lock *nbl = container_of(fl, + struct nfsd4_blocked_lock, nbl_lock); + bool queue = false; + + /* An empty list means that something else is going to be using it */ + spin_lock(&nn->client_lock); + if (!list_empty(&nbl->nbl_list)) { + list_del_init(&nbl->nbl_list); + list_del_init(&nbl->nbl_lru); + queue = true; + } + spin_unlock(&nn->client_lock); + + if (queue) + nfsd4_run_cb(&nbl->nbl_cb); +} + static const struct lock_manager_operations nfsd_posix_mng_ops = { + .lm_notify = nfsd4_lm_notify, .lm_get_owner = nfsd4_fl_get_owner, .lm_put_owner = nfsd4_fl_put_owner, }; @@ -5407,6 +5550,7 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp); if (!lo) return NULL; + INIT_LIST_HEAD(&lo->lo_blocked); INIT_LIST_HEAD(&lo->lo_owner.so_stateids); lo->lo_owner.so_is_open_owner = 0; lo->lo_owner.so_seqid = lock->lk_new_lock_seqid; @@ -5588,12 +5732,15 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *open_stp = NULL; struct nfs4_file *fp; struct file *filp = NULL; + struct nfsd4_blocked_lock *nbl = NULL; struct file_lock *file_lock = NULL; struct file_lock *conflock = NULL; __be32 status = 0; int lkflg; int err; bool new = false; + unsigned char fl_type; + unsigned int fl_flags = FL_POSIX; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -5658,46 +5805,55 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!locks_in_grace(net) && lock->lk_reclaim) goto out; - file_lock = locks_alloc_lock(); - if (!file_lock) { - dprintk("NFSD: %s: unable to allocate lock!\n", __func__); - status = nfserr_jukebox; - goto out; - } - fp = lock_stp->st_stid.sc_file; switch (lock->lk_type) { - case NFS4_READ_LT: case NFS4_READW_LT: + if (nfsd4_has_session(cstate)) + fl_flags |= FL_SLEEP; + /* Fallthrough */ + case NFS4_READ_LT: spin_lock(&fp->fi_lock); filp = find_readable_file_locked(fp); if (filp) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ); spin_unlock(&fp->fi_lock); - file_lock->fl_type = F_RDLCK; + fl_type = F_RDLCK; break; - case NFS4_WRITE_LT: case NFS4_WRITEW_LT: + if (nfsd4_has_session(cstate)) + fl_flags |= FL_SLEEP; + /* Fallthrough */ + case NFS4_WRITE_LT: spin_lock(&fp->fi_lock); filp = find_writeable_file_locked(fp); if (filp) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE); spin_unlock(&fp->fi_lock); - file_lock->fl_type = F_WRLCK; + fl_type = F_WRLCK; break; default: status = nfserr_inval; goto out; } + if (!filp) { status = nfserr_openmode; goto out; } + nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn); + if (!nbl) { + dprintk("NFSD: %s: unable to allocate block!\n", __func__); + status = nfserr_jukebox; + goto out; + } + + file_lock = &nbl->nbl_lock; + file_lock->fl_type = fl_type; file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner)); file_lock->fl_pid = current->tgid; file_lock->fl_file = filp; - file_lock->fl_flags = FL_POSIX; + file_lock->fl_flags = fl_flags; file_lock->fl_lmops = &nfsd_posix_mng_ops; file_lock->fl_start = lock->lk_offset; file_lock->fl_end = last_byte_offset(lock->lk_offset, lock->lk_length); @@ -5710,18 +5866,29 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } + if (fl_flags & FL_SLEEP) { + nbl->nbl_time = jiffies; + spin_lock(&nn->client_lock); + list_add_tail(&nbl->nbl_list, &lock_sop->lo_blocked); + list_add_tail(&nbl->nbl_lru, &nn->blocked_locks_lru); + spin_unlock(&nn->client_lock); + } + err = vfs_lock_file(filp, F_SETLK, file_lock, conflock); - switch (-err) { + switch (err) { case 0: /* success! */ nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid); status = 0; break; - case (EAGAIN): /* conflock holds conflicting lock */ + case FILE_LOCK_DEFERRED: + nbl = NULL; + /* Fallthrough */ + case -EAGAIN: /* conflock holds conflicting lock */ status = nfserr_denied; dprintk("NFSD: nfsd4_lock: conflicting lock found!\n"); nfs4_set_lock_denied(conflock, &lock->lk_denied); break; - case (EDEADLK): + case -EDEADLK: status = nfserr_deadlock; break; default: @@ -5730,6 +5897,16 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, break; } out: + if (nbl) { + /* dequeue it if we queued it before */ + if (fl_flags & FL_SLEEP) { + spin_lock(&nn->client_lock); + list_del_init(&nbl->nbl_list); + list_del_init(&nbl->nbl_lru); + spin_unlock(&nn->client_lock); + } + free_blocked_lock(nbl); + } if (filp) fput(filp); if (lock_stp) { @@ -5753,8 +5930,6 @@ out: if (open_stp) nfs4_put_stid(&open_stp->st_stid); nfsd4_bump_seqid(cstate, status); - if (file_lock) - locks_free_lock(file_lock); if (conflock) locks_free_lock(conflock); return status; @@ -6768,6 +6943,7 @@ static int nfs4_state_create_net(struct net *net) INIT_LIST_HEAD(&nn->client_lru); INIT_LIST_HEAD(&nn->close_lru); INIT_LIST_HEAD(&nn->del_recall_lru); + INIT_LIST_HEAD(&nn->blocked_locks_lru); spin_lock_init(&nn->client_lock); INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main); @@ -6865,6 +7041,7 @@ nfs4_state_shutdown_net(struct net *net) struct nfs4_delegation *dp = NULL; struct list_head *pos, *next, reaplist; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd4_blocked_lock *nbl; cancel_delayed_work_sync(&nn->laundromat_work); locks_end_grace(&nn->nfsd4_manager); @@ -6885,6 +7062,24 @@ nfs4_state_shutdown_net(struct net *net) nfs4_put_stid(&dp->dl_stid); } + BUG_ON(!list_empty(&reaplist)); + spin_lock(&nn->client_lock); + while (!list_empty(&nn->blocked_locks_lru)) { + nbl = list_first_entry(&nn->blocked_locks_lru, + struct nfsd4_blocked_lock, nbl_lru); + list_move(&nbl->nbl_lru, &reaplist); + list_del_init(&nbl->nbl_list); + } + spin_unlock(&nn->client_lock); + + while (!list_empty(&reaplist)) { + nbl = list_first_entry(&nn->blocked_locks_lru, + struct nfsd4_blocked_lock, nbl_lru); + list_del_init(&nbl->nbl_lru); + posix_unblock_lock(&nbl->nbl_lock); + free_blocked_lock(nbl); + } + nfsd4_client_tracking_exit(net); nfs4_state_destroy_net(net); } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 0aa0236..c2d2895 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1694,6 +1694,30 @@ nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone) } static __be32 +nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) +{ + DECODE_HEAD; + unsigned int tmp; + + status = nfsd4_decode_stateid(argp, ©->cp_src_stateid); + if (status) + return status; + status = nfsd4_decode_stateid(argp, ©->cp_dst_stateid); + if (status) + return status; + + READ_BUF(8 + 8 + 8 + 4 + 4 + 4); + p = xdr_decode_hyper(p, ©->cp_src_pos); + p = xdr_decode_hyper(p, ©->cp_dst_pos); + p = xdr_decode_hyper(p, ©->cp_count); + copy->cp_consecutive = be32_to_cpup(p++); + copy->cp_synchronous = be32_to_cpup(p++); + tmp = be32_to_cpup(p); /* Source server list not supported */ + + DECODE_TAIL; +} + +static __be32 nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) { DECODE_HEAD; @@ -1793,7 +1817,7 @@ static nfsd4_dec nfsd4_dec_ops[] = { /* new operations for NFSv4.2 */ [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, - [OP_COPY] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_COPY] = (nfsd4_dec)nfsd4_decode_copy, [OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp, @@ -4062,7 +4086,7 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, u32 starting_len = xdr->buf->len, needed_len; __be32 *p; - dprintk("%s: err %d\n", __func__, nfserr); + dprintk("%s: err %d\n", __func__, be32_to_cpu(nfserr)); if (nfserr) goto out; @@ -4202,6 +4226,41 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, #endif /* CONFIG_NFSD_PNFS */ static __be32 +nfsd42_encode_write_res(struct nfsd4_compoundres *resp, struct nfsd42_write_res *write) +{ + __be32 *p; + + p = xdr_reserve_space(&resp->xdr, 4 + 8 + 4 + NFS4_VERIFIER_SIZE); + if (!p) + return nfserr_resource; + + *p++ = cpu_to_be32(0); + p = xdr_encode_hyper(p, write->wr_bytes_written); + *p++ = cpu_to_be32(write->wr_stable_how); + p = xdr_encode_opaque_fixed(p, write->wr_verifier.data, + NFS4_VERIFIER_SIZE); + return nfs_ok; +} + +static __be32 +nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_copy *copy) +{ + __be32 *p; + + if (!nfserr) { + nfserr = nfsd42_encode_write_res(resp, ©->cp_res); + if (nfserr) + return nfserr; + + p = xdr_reserve_space(&resp->xdr, 4 + 4); + *p++ = cpu_to_be32(copy->cp_consecutive); + *p++ = cpu_to_be32(copy->cp_synchronous); + } + return nfserr; +} + +static __be32 nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_seek *seek) { @@ -4300,7 +4359,7 @@ static nfsd4_enc nfsd4_enc_ops[] = { /* NFSv4.2 operations */ [OP_ALLOCATE] = (nfsd4_enc)nfsd4_encode_noop, - [OP_COPY] = (nfsd4_enc)nfsd4_encode_noop, + [OP_COPY] = (nfsd4_enc)nfsd4_encode_copy, [OP_COPY_NOTIFY] = (nfsd4_enc)nfsd4_encode_noop, [OP_DEALLOCATE] = (nfsd4_enc)nfsd4_encode_noop, [OP_IO_ADVISE] = (nfsd4_enc)nfsd4_encode_noop, diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 65ad016..36b2af9 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1216,6 +1216,8 @@ static __net_init int nfsd_init_net(struct net *net) goto out_idmap_error; nn->nfsd4_lease = 90; /* default lease time */ nn->nfsd4_grace = 90; + nn->clverifier_counter = prandom_u32(); + nn->clientid_counter = prandom_u32(); return 0; out_idmap_error: diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index e921476..010aff5 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -74,10 +74,10 @@ nfsd_proc_setattr(struct svc_rqst *rqstp, struct nfsd_sattrargs *argp, * which only requires access, and "set-[ac]time-to-X" which * requires ownership. * So if it looks like it might be "set both to the same time which - * is close to now", and if inode_change_ok fails, then we + * is close to now", and if setattr_prepare fails, then we * convert to "set to now" instead of "set to explicit time" * - * We only call inode_change_ok as the last test as technically + * We only call setattr_prepare as the last test as technically * it is not an interface that we should be using. */ #define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET) @@ -92,17 +92,15 @@ nfsd_proc_setattr(struct svc_rqst *rqstp, struct nfsd_sattrargs *argp, * request is. We require it be within 30 minutes of now. */ time_t delta = iap->ia_atime.tv_sec - get_seconds(); - struct inode *inode; nfserr = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP); if (nfserr) goto done; - inode = d_inode(fhp->fh_dentry); if (delta < 0) delta = -delta; if (delta < MAX_TOUCH_TIME_ERROR && - inode_change_ok(inode, iap) != 0) { + setattr_prepare(fhp->fh_dentry, iap) != 0) { /* * Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME. * This will cause notify_change to set these times @@ -791,6 +789,7 @@ nfserrno (int errno) { nfserr_toosmall, -ETOOSMALL }, { nfserr_serverfault, -ESERVERFAULT }, { nfserr_serverfault, -ENFILE }, + { nfserr_io, -EUCLEAN }, }; int i; @@ -798,7 +797,7 @@ nfserrno (int errno) if (nfs_errtbl[i].syserr == errno) return nfs_errtbl[i].nfserr; } - WARN(1, "nfsd: non-standard errno: %d\n", errno); + WARN_ONCE(1, "nfsd: non-standard errno: %d\n", errno); return nfserr_io; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 45007ac..a2b65fc 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -366,14 +366,21 @@ static struct notifier_block nfsd_inet6addr_notifier = { }; #endif +/* Only used under nfsd_mutex, so this atomic may be overkill: */ +static atomic_t nfsd_notifier_refcount = ATOMIC_INIT(0); + static void nfsd_last_thread(struct svc_serv *serv, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - unregister_inetaddr_notifier(&nfsd_inetaddr_notifier); + /* check if the notifier still has clients */ + if (atomic_dec_return(&nfsd_notifier_refcount) == 0) { + unregister_inetaddr_notifier(&nfsd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) - unregister_inet6addr_notifier(&nfsd_inet6addr_notifier); + unregister_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif + } + /* * write_ports can create the server without actually starting * any threads--if we get shut down before any threads are @@ -488,10 +495,13 @@ int nfsd_create_serv(struct net *net) } set_max_drc(); - register_inetaddr_notifier(&nfsd_inetaddr_notifier); + /* check if the notifier is already set */ + if (atomic_inc_return(&nfsd_notifier_refcount) == 1) { + register_inetaddr_notifier(&nfsd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) - register_inet6addr_notifier(&nfsd_inet6addr_notifier); + register_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif + } do_gettimeofday(&nn->nfssvc_boot); /* record boot time */ return 0; } diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h index 0c2a716..d27a5aa 100644 --- a/fs/nfsd/pnfs.h +++ b/fs/nfsd/pnfs.h @@ -19,6 +19,7 @@ struct nfsd4_deviceid_map { struct nfsd4_layout_ops { u32 notify_types; + bool disable_recalls; __be32 (*proc_getdeviceinfo)(struct super_block *sb, struct svc_rqst *rqstp, diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index b95adf9..c939936 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -63,7 +63,6 @@ typedef struct { struct nfsd4_callback { struct nfs4_client *cb_clp; - u32 cb_minorversion; struct rpc_message cb_msg; const struct nfsd4_callback_ops *cb_ops; struct work_struct cb_work; @@ -441,11 +440,11 @@ struct nfs4_openowner { /* * Represents a generic "lockowner". Similar to an openowner. References to it * are held by the lock stateids that are created on its behalf. This object is - * a superset of the nfs4_stateowner struct (or would be if it needed any extra - * fields). + * a superset of the nfs4_stateowner struct. */ struct nfs4_lockowner { - struct nfs4_stateowner lo_owner; /* must be first element */ + struct nfs4_stateowner lo_owner; /* must be first element */ + struct list_head lo_blocked; /* blocked file_locks */ }; static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so) @@ -572,6 +571,7 @@ enum nfsd4_cb_op { NFSPROC4_CLNT_CB_RECALL, NFSPROC4_CLNT_CB_LAYOUT, NFSPROC4_CLNT_CB_SEQUENCE, + NFSPROC4_CLNT_CB_NOTIFY_LOCK, }; /* Returns true iff a is later than b: */ @@ -580,6 +580,20 @@ static inline bool nfsd4_stateid_generation_after(stateid_t *a, stateid_t *b) return (s32)(a->si_generation - b->si_generation) > 0; } +/* + * When a client tries to get a lock on a file, we set one of these objects + * on the blocking lock. When the lock becomes free, we can then issue a + * CB_NOTIFY_LOCK to the server. + */ +struct nfsd4_blocked_lock { + struct list_head nbl_list; + struct list_head nbl_lru; + unsigned long nbl_time; + struct file_lock nbl_lock; + struct knfsd_fh nbl_fh; + struct nfsd4_callback nbl_cb; +}; + struct nfsd4_compound_state; struct nfsd_net; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index ff476e6..8ca642f 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -513,6 +513,22 @@ __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst, count)); } +ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, + u64 dst_pos, u64 count) +{ + + /* + * Limit copy to 4MB to prevent indefinitely blocking an nfsd + * thread and client rpc slot. The choice of 4MB is somewhat + * arbitrary. We might instead base this on r/wsize, or make it + * tunable, or use a time instead of a byte limit, or implement + * asynchronous copy. In theory a client could also recognize a + * limit like this and pipeline multiple COPY requests. + */ + count = min_t(u64, count, 1 << 22); + return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0); +} + __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, loff_t len, int flags) diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 3cbb1b3..0bf9e7b 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -96,6 +96,8 @@ __be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *, struct svc_fh *res); __be32 nfsd_link(struct svc_rqst *, struct svc_fh *, char *, int, struct svc_fh *); +ssize_t nfsd_copy_file_range(struct file *, u64, + struct file *, u64, u64); __be32 nfsd_rename(struct svc_rqst *, struct svc_fh *, char *, int, struct svc_fh *, char *, int); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index beea0c5..8fda4ab 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -503,6 +503,28 @@ struct nfsd4_clone { u64 cl_count; }; +struct nfsd42_write_res { + u64 wr_bytes_written; + u32 wr_stable_how; + nfs4_verifier wr_verifier; +}; + +struct nfsd4_copy { + /* request */ + stateid_t cp_src_stateid; + stateid_t cp_dst_stateid; + u64 cp_src_pos; + u64 cp_dst_pos; + u64 cp_count; + + /* both */ + bool cp_consecutive; + bool cp_synchronous; + + /* response */ + struct nfsd42_write_res cp_res; +}; + struct nfsd4_seek { /* request */ stateid_t seek_stateid; @@ -568,6 +590,7 @@ struct nfsd4_op { struct nfsd4_fallocate allocate; struct nfsd4_fallocate deallocate; struct nfsd4_clone clone; + struct nfsd4_copy copy; struct nfsd4_seek seek; } u; struct nfs4_replay * replay; diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h index c47f6fd..49b719d 100644 --- a/fs/nfsd/xdr4cb.h +++ b/fs/nfsd/xdr4cb.h @@ -28,3 +28,12 @@ #define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \ cb_sequence_dec_sz + \ op_dec_sz) + +#define NFS4_enc_cb_notify_lock_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + 2 + 1 + \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ + enc_nfs4_fh_sz) +#define NFS4_dec_cb_notify_lock_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 908ebbf..582831a 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -438,7 +438,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, nilfs_set_de_type(de, inode); nilfs_commit_chunk(page, mapping, from, to); nilfs_put_page(page); - dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->i_mtime = dir->i_ctime = current_time(dir); } /* @@ -528,7 +528,7 @@ got_it: de->inode = cpu_to_le64(inode->i_ino); nilfs_set_de_type(de, inode); nilfs_commit_chunk(page, page->mapping, from, to); - dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->i_mtime = dir->i_ctime = current_time(dir); nilfs_mark_inode_dirty(dir); /* OFFSET_CACHE */ out_put: @@ -576,7 +576,7 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) pde->rec_len = nilfs_rec_len_to_disk(to - from); dir->inode = 0; nilfs_commit_chunk(page, mapping, from, to); - inode->i_ctime = inode->i_mtime = CURRENT_TIME; + inode->i_ctime = inode->i_mtime = current_time(inode); out: nilfs_put_page(page); return err; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index af04f55..c7f4fef 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -367,7 +367,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) atomic64_inc(&root->inodes_count); inode_init_owner(inode, dir, mode); inode->i_ino = ino; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { err = nilfs_bmap_read(ii->i_bmap, NULL); @@ -749,7 +749,7 @@ void nilfs_truncate(struct inode *inode) nilfs_truncate_bmap(ii, blkoff); - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); if (IS_SYNC(inode)) nilfs_set_transaction_flag(NILFS_TI_SYNC); @@ -829,7 +829,7 @@ int nilfs_setattr(struct dentry *dentry, struct iattr *iattr) struct super_block *sb = inode->i_sb; int err; - err = inode_change_ok(inode, iattr); + err = setattr_prepare(dentry, iattr); if (err) return err; diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index f1d7989..1d2c3d7 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -174,7 +174,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, (flags & FS_FL_USER_MODIFIABLE); nilfs_set_inode_flags(inode); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); if (IS_SYNC(inode)) nilfs_set_transaction_flag(NILFS_TI_SYNC); diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index dbcf1dc..2b71c60 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -194,7 +194,7 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir, if (err) return err; - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); inode_inc_link_count(inode); ihold(inode); @@ -350,7 +350,8 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry) } static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); @@ -361,6 +362,9 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct nilfs_transaction_info ti; int err; + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1); if (unlikely(err)) return err; @@ -391,7 +395,7 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_dir; nilfs_set_link(new_dir, new_de, new_page, old_inode); nilfs_mark_inode_dirty(new_dir); - new_inode->i_ctime = CURRENT_TIME; + new_inode->i_ctime = current_time(new_inode); if (dir_de) drop_nlink(new_inode); drop_nlink(new_inode); @@ -410,7 +414,7 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old_inode->i_ctime = CURRENT_TIME; + old_inode->i_ctime = current_time(old_inode); nilfs_delete_entry(old_de, old_page); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index a643138..7ebfca6 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -49,12 +49,12 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly; * enough to fit in "count". Return an error pointer if the count * is not large enough. * - * Called with the group->notification_mutex held. + * Called with the group->notification_lock held. */ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, size_t count) { - BUG_ON(!mutex_is_locked(&group->notification_mutex)); + assert_spin_locked(&group->notification_lock); pr_debug("%s: group=%p count=%zd\n", __func__, group, count); @@ -64,7 +64,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, if (FAN_EVENT_METADATA_LEN > count) return ERR_PTR(-EINVAL); - /* held the notification_mutex the whole time, so this is the + /* held the notification_lock the whole time, so this is the * same event we peeked above */ return fsnotify_remove_first_event(group); } @@ -147,7 +147,7 @@ static struct fanotify_perm_event_info *dequeue_event( { struct fanotify_perm_event_info *event, *return_e = NULL; - spin_lock(&group->fanotify_data.access_lock); + spin_lock(&group->notification_lock); list_for_each_entry(event, &group->fanotify_data.access_list, fae.fse.list) { if (event->fd != fd) @@ -157,7 +157,7 @@ static struct fanotify_perm_event_info *dequeue_event( return_e = event; break; } - spin_unlock(&group->fanotify_data.access_lock); + spin_unlock(&group->notification_lock); pr_debug("%s: found return_re=%p\n", __func__, return_e); @@ -244,10 +244,10 @@ static unsigned int fanotify_poll(struct file *file, poll_table *wait) int ret = 0; poll_wait(file, &group->notification_waitq, wait); - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); if (!fsnotify_notify_queue_is_empty(group)) ret = POLLIN | POLLRDNORM; - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); return ret; } @@ -268,9 +268,9 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, add_wait_queue(&group->notification_waitq, &wait); while (1) { - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); kevent = get_one_event(group, count); - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); if (IS_ERR(kevent)) { ret = PTR_ERR(kevent); @@ -309,10 +309,10 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, wake_up(&group->fanotify_data.access_waitq); break; } - spin_lock(&group->fanotify_data.access_lock); + spin_lock(&group->notification_lock); list_add_tail(&kevent->list, &group->fanotify_data.access_list); - spin_unlock(&group->fanotify_data.access_lock); + spin_unlock(&group->notification_lock); #endif } buf += ret; @@ -371,7 +371,7 @@ static int fanotify_release(struct inode *ignored, struct file *file) * Process all permission events on access_list and notification queue * and simulate reply from userspace. */ - spin_lock(&group->fanotify_data.access_lock); + spin_lock(&group->notification_lock); list_for_each_entry_safe(event, next, &group->fanotify_data.access_list, fae.fse.list) { pr_debug("%s: found group=%p event=%p\n", __func__, group, @@ -380,22 +380,22 @@ static int fanotify_release(struct inode *ignored, struct file *file) list_del_init(&event->fae.fse.list); event->response = FAN_ALLOW; } - spin_unlock(&group->fanotify_data.access_lock); /* * Destroy all non-permission events. For permission events just * dequeue them and set the response. They will be freed once the * response is consumed and fanotify_get_response() returns. */ - mutex_lock(&group->notification_mutex); while (!fsnotify_notify_queue_is_empty(group)) { fsn_event = fsnotify_remove_first_event(group); - if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS)) + if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS)) { + spin_unlock(&group->notification_lock); fsnotify_destroy_event(group, fsn_event); - else + spin_lock(&group->notification_lock); + } else FANOTIFY_PE(fsn_event)->response = FAN_ALLOW; } - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); /* Response for all permission events it set, wakeup waiters */ wake_up(&group->fanotify_data.access_waitq); @@ -421,10 +421,10 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar switch (cmd) { case FIONREAD: - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); list_for_each_entry(fsn_event, &group->notification_list, list) send_len += FAN_EVENT_METADATA_LEN; - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); ret = put_user(send_len, (int __user *) p); break; } @@ -765,7 +765,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) event_f_flags |= O_LARGEFILE; group->fanotify_data.f_flags = event_f_flags; #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - spin_lock_init(&group->fanotify_data.access_lock); init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); #endif diff --git a/fs/notify/group.c b/fs/notify/group.c index b47f7cf..fbe3cbe 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -45,9 +45,9 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) */ void fsnotify_group_stop_queueing(struct fsnotify_group *group) { - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); group->shutdown = true; - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); } /* @@ -125,7 +125,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) atomic_set(&group->refcnt, 1); atomic_set(&group->num_marks, 0); - mutex_init(&group->notification_mutex); + spin_lock_init(&group->notification_lock); INIT_LIST_HEAD(&group->notification_list); init_waitqueue_head(&group->notification_waitq); group->max_events = UINT_MAX; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index b8d08d0..69d1ea3 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -115,10 +115,10 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait) int ret = 0; poll_wait(file, &group->notification_waitq, wait); - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); if (!fsnotify_notify_queue_is_empty(group)) ret = POLLIN | POLLRDNORM; - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); return ret; } @@ -138,7 +138,7 @@ static int round_event_name_len(struct fsnotify_event *fsn_event) * enough to fit in "count". Return an error pointer if * not large enough. * - * Called with the group->notification_mutex held. + * Called with the group->notification_lock held. */ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, size_t count) @@ -157,7 +157,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, if (event_size > count) return ERR_PTR(-EINVAL); - /* held the notification_mutex the whole time, so this is the + /* held the notification_lock the whole time, so this is the * same event we peeked above */ fsnotify_remove_first_event(group); @@ -234,9 +234,9 @@ static ssize_t inotify_read(struct file *file, char __user *buf, add_wait_queue(&group->notification_waitq, &wait); while (1) { - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); kevent = get_one_event(group, count); - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); pr_debug("%s: group=%p kevent=%p\n", __func__, group, kevent); @@ -300,13 +300,13 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case FIONREAD: - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); list_for_each_entry(fsn_event, &group->notification_list, list) { send_len += sizeof(struct inotify_event); send_len += round_event_name_len(fsn_event); } - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); ret = put_user(send_len, (int __user *) p); break; } diff --git a/fs/notify/notification.c b/fs/notify/notification.c index e455e83..66f85c6 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -63,7 +63,7 @@ EXPORT_SYMBOL_GPL(fsnotify_get_cookie); /* return true if the notify queue is empty, false otherwise */ bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) { - BUG_ON(!mutex_is_locked(&group->notification_mutex)); + assert_spin_locked(&group->notification_lock); return list_empty(&group->notification_list) ? true : false; } @@ -73,8 +73,17 @@ void fsnotify_destroy_event(struct fsnotify_group *group, /* Overflow events are per-group and we don't want to free them */ if (!event || event->mask == FS_Q_OVERFLOW) return; - /* If the event is still queued, we have a problem... */ - WARN_ON(!list_empty(&event->list)); + /* + * If the event is still queued, we have a problem... Do an unreliable + * lockless check first to avoid locking in the common case. The + * locking may be necessary for permission events which got removed + * from the list by a different CPU than the one freeing the event. + */ + if (!list_empty(&event->list)) { + spin_lock(&group->notification_lock); + WARN_ON(!list_empty(&event->list)); + spin_unlock(&group->notification_lock); + } group->ops->free_event(event); } @@ -95,10 +104,10 @@ int fsnotify_add_event(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); if (group->shutdown) { - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); return 2; } @@ -106,7 +115,7 @@ int fsnotify_add_event(struct fsnotify_group *group, ret = 2; /* Queue overflow event only if it isn't already queued */ if (!list_empty(&group->overflow_event->list)) { - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); return ret; } event = group->overflow_event; @@ -116,7 +125,7 @@ int fsnotify_add_event(struct fsnotify_group *group, if (!list_empty(list) && merge) { ret = merge(list, event); if (ret) { - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); return ret; } } @@ -124,7 +133,7 @@ int fsnotify_add_event(struct fsnotify_group *group, queue: group->q_len++; list_add_tail(&event->list, list); - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); wake_up(&group->notification_waitq); kill_fasync(&group->fsn_fa, SIGIO, POLL_IN); @@ -139,7 +148,7 @@ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group) { struct fsnotify_event *event; - BUG_ON(!mutex_is_locked(&group->notification_mutex)); + assert_spin_locked(&group->notification_lock); pr_debug("%s: group=%p\n", __func__, group); @@ -161,7 +170,7 @@ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group) */ struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group) { - BUG_ON(!mutex_is_locked(&group->notification_mutex)); + assert_spin_locked(&group->notification_lock); return list_first_entry(&group->notification_list, struct fsnotify_event, list); @@ -175,12 +184,14 @@ void fsnotify_flush_notify(struct fsnotify_group *group) { struct fsnotify_event *event; - mutex_lock(&group->notification_mutex); + spin_lock(&group->notification_lock); while (!fsnotify_notify_queue_is_empty(group)) { event = fsnotify_remove_first_event(group); + spin_unlock(&group->notification_lock); fsnotify_destroy_event(group, event); + spin_lock(&group->notification_lock); } - mutex_unlock(&group->notification_mutex); + spin_unlock(&group->notification_lock); } /* @@ -5,11 +5,16 @@ #include <linux/magic.h> #include <linux/ktime.h> #include <linux/seq_file.h> +#include <linux/user_namespace.h> +#include <linux/nsfs.h> static struct vfsmount *nsfs_mnt; +static long ns_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg); static const struct file_operations ns_file_operations = { .llseek = no_llseek, + .unlocked_ioctl = ns_ioctl, }; static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) @@ -44,22 +49,14 @@ static void nsfs_evict(struct inode *inode) ns->ops->put(ns); } -void *ns_get_path(struct path *path, struct task_struct *task, - const struct proc_ns_operations *ns_ops) +static void *__ns_get_path(struct path *path, struct ns_common *ns) { - struct vfsmount *mnt = mntget(nsfs_mnt); + struct vfsmount *mnt = nsfs_mnt; struct qstr qname = { .name = "", }; struct dentry *dentry; struct inode *inode; - struct ns_common *ns; unsigned long d; -again: - ns = ns_ops->get(task); - if (!ns) { - mntput(mnt); - return ERR_PTR(-ENOENT); - } rcu_read_lock(); d = atomic_long_read(&ns->stashed); if (!d) @@ -68,21 +65,20 @@ again: if (!lockref_get_not_dead(&dentry->d_lockref)) goto slow; rcu_read_unlock(); - ns_ops->put(ns); + ns->ops->put(ns); got_it: - path->mnt = mnt; + path->mnt = mntget(mnt); path->dentry = dentry; return NULL; slow: rcu_read_unlock(); inode = new_inode_pseudo(mnt->mnt_sb); if (!inode) { - ns_ops->put(ns); - mntput(mnt); + ns->ops->put(ns); return ERR_PTR(-ENOMEM); } inode->i_ino = ns->inum; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_flags |= S_IMMUTABLE; inode->i_mode = S_IFREG | S_IRUGO; inode->i_fop = &ns_file_operations; @@ -91,21 +87,96 @@ slow: dentry = d_alloc_pseudo(mnt->mnt_sb, &qname); if (!dentry) { iput(inode); - mntput(mnt); return ERR_PTR(-ENOMEM); } d_instantiate(dentry, inode); - dentry->d_fsdata = (void *)ns_ops; + dentry->d_fsdata = (void *)ns->ops; d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry); if (d) { d_delete(dentry); /* make sure ->d_prune() does nothing */ dput(dentry); cpu_relax(); - goto again; + return ERR_PTR(-EAGAIN); } goto got_it; } +void *ns_get_path(struct path *path, struct task_struct *task, + const struct proc_ns_operations *ns_ops) +{ + struct ns_common *ns; + void *ret; + +again: + ns = ns_ops->get(task); + if (!ns) + return ERR_PTR(-ENOENT); + + ret = __ns_get_path(path, ns); + if (IS_ERR(ret) && PTR_ERR(ret) == -EAGAIN) + goto again; + return ret; +} + +static int open_related_ns(struct ns_common *ns, + struct ns_common *(*get_ns)(struct ns_common *ns)) +{ + struct path path = {}; + struct file *f; + void *err; + int fd; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) + return fd; + + while (1) { + struct ns_common *relative; + + relative = get_ns(ns); + if (IS_ERR(relative)) { + put_unused_fd(fd); + return PTR_ERR(relative); + } + + err = __ns_get_path(&path, relative); + if (IS_ERR(err) && PTR_ERR(err) == -EAGAIN) + continue; + break; + } + if (IS_ERR(err)) { + put_unused_fd(fd); + return PTR_ERR(err); + } + + f = dentry_open(&path, O_RDONLY, current_cred()); + path_put(&path); + if (IS_ERR(f)) { + put_unused_fd(fd); + fd = PTR_ERR(f); + } else + fd_install(fd, f); + + return fd; +} + +static long ns_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + struct ns_common *ns = get_proc_ns(file_inode(filp)); + + switch (ioctl) { + case NS_GET_USERNS: + return open_related_ns(ns, ns_get_owner); + case NS_GET_PARENT: + if (!ns->ops->get_parent) + return -EINVAL; + return open_related_ns(ns, ns->ops->get_parent); + default: + return -ENOTTY; + } +} + int ns_get_name(char *buf, size_t size, struct task_struct *task, const struct proc_ns_operations *ns_ops) { diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index f548629..bf72a2c 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1850,7 +1850,7 @@ again: * pages being swapped out between us bringing them into memory * and doing the actual copying. */ - if (unlikely(iov_iter_fault_in_multipages_readable(i, bytes))) { + if (unlikely(iov_iter_fault_in_readable(i, bytes))) { status = -EFAULT; break; } diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index e01287c..7c410f8 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2813,7 +2813,7 @@ done: * for real. */ if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) { - struct timespec now = current_fs_time(VFS_I(base_ni)->i_sb); + struct timespec now = current_time(VFS_I(base_ni)); int sync_it = 0; if (!timespec_equal(&VFS_I(base_ni)->i_mtime, &now) || @@ -2893,7 +2893,7 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr) int err; unsigned int ia_valid = attr->ia_valid; - err = inode_change_ok(vi, attr); + err = setattr_prepare(dentry, attr); if (err) goto out; /* We do not support NTFS ACLs yet. */ diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index d15d492..d3c0096 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -2692,7 +2692,7 @@ mft_rec_already_initialized: /* Set the inode times to the current time. */ vi->i_atime = vi->i_mtime = vi->i_ctime = - current_fs_time(vi->i_sb); + current_time(vi); /* * Set the file size to 0, the ntfs inode sizes are set to 0 by * the call to ntfs_init_big_inode() below. diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 2162434..bed1fcb 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -201,7 +201,7 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh, } inode->i_mode = new_mode; - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); di->i_mode = cpu_to_le16(inode->i_mode); di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); @@ -241,13 +241,11 @@ int ocfs2_set_acl(handle_t *handle, case ACL_TYPE_ACCESS: name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - umode_t mode = inode->i_mode; - ret = posix_acl_equiv_mode(acl, &mode); - if (ret < 0) - return ret; + umode_t mode; - if (ret == 0) - acl = NULL; + ret = posix_acl_update_mode(inode, &mode, &acl); + if (ret) + return ret; ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index f165f86..f72712f 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7293,7 +7293,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, } inode->i_blocks = ocfs2_inode_sector_count(inode); - inode->i_ctime = inode->i_mtime = CURRENT_TIME; + inode->i_ctime = inode->i_mtime = current_time(inode); di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 98d3654..c5c5b97 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1842,6 +1842,16 @@ out_commit: ocfs2_commit_trans(osb, handle); out: + /* + * The mmapped page won't be unlocked in ocfs2_free_write_ctxt(), + * even in case of error here like ENOSPC and ENOMEM. So, we need + * to unlock the target page manually to prevent deadlocks when + * retrying again on ENOSPC, or when returning non-VM_FAULT_LOCKED + * to VM code. + */ + if (wc->w_target_locked) + unlock_page(mmap_page); + ocfs2_free_write_ctxt(inode, wc); if (data_ac) { @@ -2020,7 +2030,7 @@ out_write_size: } inode->i_blocks = ocfs2_inode_sector_count(inode); di->i_size = cpu_to_le64((u64)i_size_read(inode)); - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); ocfs2_update_inode_fsync_trans(handle, inode, 1); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 1d67fcb..8abab16 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -2104,7 +2104,7 @@ int o2net_start_listening(struct o2nm_node *node) BUG_ON(o2net_listen_sock != NULL); mlog(ML_KTHREAD, "starting o2net thread...\n"); - o2net_wq = create_singlethread_workqueue("o2net"); + o2net_wq = alloc_ordered_workqueue("o2net", WQ_MEM_RECLAIM); if (o2net_wq == NULL) { mlog(ML_ERROR, "unable to launch o2net thread\n"); return -ENOMEM; /* ? */ diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index e1adf28..e7054e2 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1677,7 +1677,7 @@ int __ocfs2_add_entry(handle_t *handle, offset, ocfs2_dir_trailer_blk_off(dir->i_sb)); if (ocfs2_dirent_would_fit(de, rec_len)) { - dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->i_mtime = dir->i_ctime = current_time(dir); retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); if (retval < 0) { mlog_errno(retval); @@ -2990,7 +2990,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, ocfs2_dinode_new_extent_list(dir, di); i_size_write(dir, sb->s_blocksize); - dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->i_mtime = dir->i_ctime = current_time(dir); di->i_size = cpu_to_le64(sb->s_blocksize); di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec); diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 533bd52..733e4e7 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1904,7 +1904,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) } snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name); - dlm->dlm_worker = create_singlethread_workqueue(wq_name); + dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM, 0); if (!dlm->dlm_worker) { status = -ENOMEM; mlog_errno(status); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 6ea06f8..3f828a1 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -3188,6 +3188,9 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, migrate->new_master, migrate->master); + if (ret < 0) + kmem_cache_free(dlm_mle_cache, mle); + spin_unlock(&dlm->master_lock); unlock: spin_unlock(&dlm->spinlock); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index ef474cd..1079fae 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -211,7 +211,7 @@ static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr) struct inode *inode = d_inode(dentry); attr->ia_valid &= ~ATTR_SIZE; - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); if (error) return error; @@ -398,7 +398,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb) if (inode) { inode->i_ino = get_next_ino(); inode_init_owner(inode, NULL, mode); - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inc_nlink(inode); inode->i_fop = &simple_dir_operations; @@ -421,7 +421,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent, inode->i_ino = get_next_ino(); inode_init_owner(inode, parent, mode); - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); ip = DLMFS_I(inode); ip->ip_conn = DLMFS_I(parent)->ip_conn; @@ -646,7 +646,7 @@ static int __init init_dlmfs_fs(void) } cleanup_inode = 1; - user_dlm_worker = create_singlethread_workqueue("user_dlm"); + user_dlm_worker = alloc_workqueue("user_dlm", WQ_MEM_RECLAIM, 0); if (!user_dlm_worker) { status = -ENOMEM; goto bail; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 0b055bf..000c234 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -253,7 +253,7 @@ int ocfs2_should_update_atime(struct inode *inode, return 0; } - now = CURRENT_TIME; + now = current_time(inode); if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) return 0; else @@ -287,7 +287,7 @@ int ocfs2_update_inode_atime(struct inode *inode, * have i_mutex to guard against concurrent changes to other * inode fields. */ - inode->i_atime = CURRENT_TIME; + inode->i_atime = current_time(inode); di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); ocfs2_update_inode_fsync_trans(handle, inode, 0); @@ -308,7 +308,7 @@ int ocfs2_set_inode_size(handle_t *handle, i_size_write(inode, new_i_size); inode->i_blocks = ocfs2_inode_sector_count(inode); - inode->i_ctime = inode->i_mtime = CURRENT_TIME; + inode->i_ctime = inode->i_mtime = current_time(inode); status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); if (status < 0) { @@ -429,7 +429,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, } i_size_write(inode, new_i_size); - inode->i_ctime = inode->i_mtime = CURRENT_TIME; + inode->i_ctime = inode->i_mtime = current_time(inode); di = (struct ocfs2_dinode *) fe_bh->b_data; di->i_size = cpu_to_le64(new_i_size); @@ -840,7 +840,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, i_size_write(inode, abs_to); inode->i_blocks = ocfs2_inode_sector_count(inode); di->i_size = cpu_to_le64((u64)i_size_read(inode)); - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); di->i_mtime_nsec = di->i_ctime_nsec; @@ -1155,7 +1155,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) return 0; - status = inode_change_ok(inode, attr); + status = setattr_prepare(dentry, attr); if (status) return status; @@ -1950,7 +1950,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, if (change_size && i_size_read(inode) < size) i_size_write(inode, size); - inode->i_ctime = inode->i_mtime = CURRENT_TIME; + inode->i_ctime = inode->i_mtime = current_time(inode); ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); if (ret < 0) mlog_errno(ret); @@ -2321,36 +2321,6 @@ out_mutex: return ret; } -static ssize_t ocfs2_file_splice_read(struct file *in, - loff_t *ppos, - struct pipe_inode_info *pipe, - size_t len, - unsigned int flags) -{ - int ret = 0, lock_level = 0; - struct inode *inode = file_inode(in); - - trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry, - (unsigned long long)OCFS2_I(inode)->ip_blkno, - in->f_path.dentry->d_name.len, - in->f_path.dentry->d_name.name, len); - - /* - * See the comment in ocfs2_file_read_iter() - */ - ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level); - if (ret < 0) { - mlog_errno(ret); - goto bail; - } - ocfs2_inode_unlock(inode, lock_level); - - ret = generic_file_splice_read(in, ppos, pipe, len, flags); - -bail: - return ret; -} - static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { @@ -2474,10 +2444,7 @@ const struct inode_operations ocfs2_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, .permission = ocfs2_permission, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = ocfs2_listxattr, - .removexattr = generic_removexattr, .fiemap = ocfs2_fiemap, .get_acl = ocfs2_iop_get_acl, .set_acl = ocfs2_iop_set_acl, @@ -2509,7 +2476,7 @@ const struct file_operations ocfs2_fops = { #endif .lock = ocfs2_lock, .flock = ocfs2_flock, - .splice_read = ocfs2_file_splice_read, + .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, }; @@ -2554,7 +2521,7 @@ const struct file_operations ocfs2_fops_no_plocks = { .compat_ioctl = ocfs2_compat_ioctl, #endif .flock = ocfs2_flock, - .splice_read = ocfs2_file_splice_read, + .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, }; diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 50cc550..5af68fc 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -123,8 +123,6 @@ static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) #define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL) #define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL) -extern struct kmem_cache *ocfs2_inode_cache; - extern const struct address_space_operations ocfs2_aops; extern const struct ocfs2_caching_operations ocfs2_inode_caching_ops; diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index e3d05d9..4e8f32eb 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -953,7 +953,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) } di = (struct ocfs2_dinode *)di_bh->b_data; - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); ocfs2_update_inode_fsync_trans(handle, inode, 0); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index a8f1225..8d887c7 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -798,7 +798,7 @@ static int ocfs2_link(struct dentry *old_dentry, } inc_nlink(inode); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); ocfs2_set_links_count(fe, inode->i_nlink); fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); @@ -1000,7 +1000,7 @@ static int ocfs2_unlink(struct inode *dir, ocfs2_set_links_count(fe, inode->i_nlink); ocfs2_journal_dirty(handle, fe_bh); - dir->i_ctime = dir->i_mtime = CURRENT_TIME; + dir->i_ctime = dir->i_mtime = current_time(dir); if (S_ISDIR(inode->i_mode)) drop_nlink(dir); @@ -1203,7 +1203,8 @@ static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2) static int ocfs2_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, - struct dentry *new_dentry) + struct dentry *new_dentry, + unsigned int flags) { int status = 0, rename_lock = 0, parents_locked = 0, target_exists = 0; int old_child_locked = 0, new_child_locked = 0, update_dot_dot = 0; @@ -1228,6 +1229,9 @@ static int ocfs2_rename(struct inode *old_dir, struct ocfs2_dir_lookup_result target_insert = { NULL, }; bool should_add_orphan = false; + if (flags) + return -EINVAL; + /* At some point it might be nice to break this function up a * bit. */ @@ -1537,7 +1541,7 @@ static int ocfs2_rename(struct inode *old_dir, new_dir_bh, &target_insert); } - old_inode->i_ctime = CURRENT_TIME; + old_inode->i_ctime = current_time(old_inode); mark_inode_dirty(old_inode); status = ocfs2_journal_access_di(handle, INODE_CACHE(old_inode), @@ -1586,9 +1590,9 @@ static int ocfs2_rename(struct inode *old_dir, if (new_inode) { drop_nlink(new_inode); - new_inode->i_ctime = CURRENT_TIME; + new_inode->i_ctime = current_time(new_inode); } - old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; + old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); if (update_dot_dot) { status = ocfs2_update_entry(old_inode, handle, @@ -2913,10 +2917,7 @@ const struct inode_operations ocfs2_dir_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, .permission = ocfs2_permission, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = ocfs2_listxattr, - .removexattr = generic_removexattr, .fiemap = ocfs2_fiemap, .get_acl = ocfs2_iop_get_acl, .set_acl = ocfs2_iop_set_acl, diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index f8f5fc5..0b58abc 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -1314,8 +1314,6 @@ DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_write); DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_write); -DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_read); - DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_read); DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_truncate_file); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 92bbe93..1923851 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3778,7 +3778,7 @@ static int ocfs2_change_ctime(struct inode *inode, goto out_commit; } - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); @@ -4094,7 +4094,7 @@ static int ocfs2_complete_reflink(struct inode *s_inode, * we want mtime to appear identical to the source and * update ctime. */ - t_inode->i_ctime = CURRENT_TIME; + t_inode->i_ctime = current_time(t_inode); di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec); di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 603b28d..f56fe39 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2329,7 +2329,7 @@ static int ocfs2_initialize_super(struct super_block *sb, } cleancache_init_shared_fs(sb); - osb->ocfs2_wq = create_singlethread_workqueue("ocfs2_wq"); + osb->ocfs2_wq = alloc_ordered_workqueue("ocfs2_wq", WQ_MEM_RECLAIM); if (!osb->ocfs2_wq) { status = -ENOMEM; mlog_errno(status); diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index 6c2a3e3..6ad8eec 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -91,9 +91,6 @@ const struct inode_operations ocfs2_symlink_inode_operations = { .get_link = page_get_link, .getattr = ocfs2_getattr, .setattr = ocfs2_setattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = ocfs2_listxattr, - .removexattr = generic_removexattr, .fiemap = ocfs2_fiemap, }; diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 5bb44f7..cb157a3 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -3431,7 +3431,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode, goto out; } - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); ocfs2_journal_dirty(ctxt->handle, xis->inode_bh); diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index c8cbf3b..b714652 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -143,7 +143,7 @@ static int omfs_add_link(struct dentry *dentry, struct inode *inode) mark_buffer_dirty(bh); brelse(bh); - dir->i_ctime = CURRENT_TIME_SEC; + dir->i_ctime = current_time(dir); /* mark affected inodes dirty to rebuild checksums */ mark_inode_dirty(dir); @@ -371,12 +371,16 @@ static bool omfs_fill_chain(struct inode *dir, struct dir_context *ctx, } static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct inode *new_inode = d_inode(new_dentry); struct inode *old_inode = d_inode(old_dentry); int err; + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + if (new_inode) { /* overwriting existing file/dir */ err = omfs_remove(new_dir, new_dentry); @@ -395,7 +399,7 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto out; - old_inode->i_ctime = CURRENT_TIME_SEC; + old_inode->i_ctime = current_time(old_inode); mark_inode_dirty(old_inode); out: return err; diff --git a/fs/omfs/file.c b/fs/omfs/file.c index d9e26cf..bf83e66 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -349,7 +349,7 @@ static int omfs_setattr(struct dentry *dentry, struct iattr *attr) struct inode *inode = d_inode(dentry); int error; - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); if (error) return error; diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 3d935c8..df7ea85 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -49,7 +49,7 @@ struct inode *omfs_new_inode(struct inode *dir, umode_t mode) inode_init_owner(inode, NULL, mode); inode->i_mapping->a_ops = &omfs_aops; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); switch (mode & S_IFMT) { case S_IFDIR: inode->i_op = &omfs_dir_inops; @@ -68,6 +68,7 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, long vfs_truncate(const struct path *path, loff_t length) { struct inode *inode; + struct dentry *upperdentry; long error; inode = path->dentry->d_inode; @@ -90,7 +91,17 @@ long vfs_truncate(const struct path *path, loff_t length) if (IS_APPEND(inode)) goto mnt_drop_write_and_out; - error = get_write_access(inode); + /* + * If this is an overlayfs then do as if opening the file so we get + * write access on the upper inode, not on the overlay inode. For + * non-overlay filesystems d_real() is an identity function. + */ + upperdentry = d_real(path->dentry, NULL, O_WRONLY); + error = PTR_ERR(upperdentry); + if (IS_ERR(upperdentry)) + goto mnt_drop_write_and_out; + + error = get_write_access(upperdentry->d_inode); if (error) goto mnt_drop_write_and_out; @@ -109,7 +120,7 @@ long vfs_truncate(const struct path *path, loff_t length) error = do_truncate(path->dentry, length, 0, NULL); put_write_and_out: - put_write_access(inode); + put_write_access(upperdentry->d_inode); mnt_drop_write_and_out: mnt_drop_write(path->mnt); out: @@ -256,6 +267,11 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) (mode & ~FALLOC_FL_INSERT_RANGE)) return -EINVAL; + /* Unshare range should only be used with allocate mode. */ + if ((mode & FALLOC_FL_UNSHARE_RANGE) && + (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE))) + return -EINVAL; + if (!(file->f_mode & FMODE_WRITE)) return -EBADF; @@ -289,7 +305,8 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) * Let individual file system decide if it supports preallocation * for directories or not. */ - if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && + !S_ISBLK(inode->i_mode)) return -ENODEV; /* Check for wrap through zero too */ @@ -726,7 +743,7 @@ static int do_dentry_open(struct file *f, if (error) goto cleanup_all; - error = break_lease(inode, f->f_flags); + error = break_lease(locks_inode(f), f->f_flags); if (error) goto cleanup_all; diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index c7a8699..c003a66 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -355,7 +355,7 @@ static struct inode *openprom_iget(struct super_block *sb, ino_t ino) if (!inode) return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); if (inode->i_ino == OPENPROM_ROOT_INO) { inode->i_op = &openprom_inode_operations; inode->i_fop = &openprom_operations; diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index 28f2195..7a37544 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -73,14 +73,11 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { - umode_t mode = inode->i_mode; - /* - * can we represent this with the traditional file - * mode permission bits? - */ - error = posix_acl_equiv_mode(acl, &mode); - if (error < 0) { - gossip_err("%s: posix_acl_equiv_mode err: %d\n", + umode_t mode; + + error = posix_acl_update_mode(inode, &mode, &acl); + if (error) { + gossip_err("%s: posix_acl_update_mode err: %d\n", __func__, error); return error; @@ -90,8 +87,6 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) SetModeFlag(orangefs_inode); inode->i_mode = mode; mark_inode_dirty_sync(inode); - if (error == 0) - acl = NULL; } break; case ACL_TYPE_DEFAULT: diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c index 00235bf..5355efb 100644 --- a/fs/orangefs/dcache.c +++ b/fs/orangefs/dcache.c @@ -73,7 +73,7 @@ static int orangefs_revalidate_lookup(struct dentry *dentry) } } - dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000; + orangefs_set_timeout(dentry); ret = 1; out_release_op: op_release(new_op); @@ -94,8 +94,9 @@ out_drop: static int orangefs_d_revalidate(struct dentry *dentry, unsigned int flags) { int ret; + unsigned long time = (unsigned long) dentry->d_fsdata; - if (time_before(jiffies, dentry->d_time)) + if (time_before(jiffies, time)) return 1; if (flags & LOOKUP_RCU) diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c index a287a66..516ffb4 100644 --- a/fs/orangefs/devorangefs-req.c +++ b/fs/orangefs/devorangefs-req.c @@ -11,14 +11,19 @@ #include "orangefs-kernel.h" #include "orangefs-dev-proto.h" #include "orangefs-bufmap.h" +#include "orangefs-debugfs.h" #include <linux/debugfs.h> #include <linux/slab.h> /* this file implements the /dev/pvfs2-req device node */ +uint32_t orangefs_userspace_version; + static int open_access_count; +static DEFINE_MUTEX(devreq_mutex); + #define DUMP_DEVICE_ERROR() \ do { \ gossip_err("*****************************************************\n");\ @@ -43,7 +48,7 @@ static void orangefs_devreq_add_op(struct orangefs_kernel_op_s *op) { int index = hash_func(op->tag, hash_table_size); - list_add_tail(&op->list, &htable_ops_in_progress[index]); + list_add_tail(&op->list, &orangefs_htable_ops_in_progress[index]); } /* @@ -57,20 +62,20 @@ static struct orangefs_kernel_op_s *orangefs_devreq_remove_op(__u64 tag) index = hash_func(tag, hash_table_size); - spin_lock(&htable_ops_in_progress_lock); + spin_lock(&orangefs_htable_ops_in_progress_lock); list_for_each_entry_safe(op, next, - &htable_ops_in_progress[index], + &orangefs_htable_ops_in_progress[index], list) { if (op->tag == tag && !op_state_purged(op) && !op_state_given_up(op)) { list_del_init(&op->list); - spin_unlock(&htable_ops_in_progress_lock); + spin_unlock(&orangefs_htable_ops_in_progress_lock); return op; } } - spin_unlock(&htable_ops_in_progress_lock); + spin_unlock(&orangefs_htable_ops_in_progress_lock); return NULL; } @@ -276,11 +281,11 @@ restart: if (ret != 0) goto error; - spin_lock(&htable_ops_in_progress_lock); + spin_lock(&orangefs_htable_ops_in_progress_lock); spin_lock(&cur_op->lock); if (unlikely(op_state_given_up(cur_op))) { spin_unlock(&cur_op->lock); - spin_unlock(&htable_ops_in_progress_lock); + spin_unlock(&orangefs_htable_ops_in_progress_lock); complete(&cur_op->waitq); goto restart; } @@ -298,7 +303,7 @@ restart: current->comm); orangefs_devreq_add_op(cur_op); spin_unlock(&cur_op->lock); - spin_unlock(&htable_ops_in_progress_lock); + spin_unlock(&orangefs_htable_ops_in_progress_lock); /* The client only asks to read one size buffer. */ return MAX_DEV_REQ_UPSIZE; @@ -387,6 +392,13 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, return -EPROTO; } + if (!orangefs_userspace_version) { + orangefs_userspace_version = head.version; + } else if (orangefs_userspace_version != head.version) { + gossip_err("Error: userspace version changes\n"); + return -EPROTO; + } + /* remove the op from the in progress hash table */ op = orangefs_devreq_remove_op(head.tag); if (!op) { @@ -527,6 +539,7 @@ static int orangefs_devreq_release(struct inode *inode, struct file *file) gossip_debug(GOSSIP_DEV_DEBUG, "pvfs2-client-core: device close complete\n"); open_access_count = 0; + orangefs_userspace_version = 0; mutex_unlock(&devreq_mutex); return 0; } @@ -576,8 +589,6 @@ static long dispatch_ioctl_command(unsigned int command, unsigned long arg) static __s32 max_down_size = MAX_DEV_REQ_DOWNSIZE; struct ORANGEFS_dev_map_desc user_desc; int ret = 0; - struct dev_mask_info_s mask_info = { 0 }; - struct dev_mask2_info_s mask2_info = { 0, 0 }; int upstream_kmod = 1; struct orangefs_sb_info_s *orangefs_sb; @@ -619,7 +630,7 @@ static long dispatch_ioctl_command(unsigned int command, unsigned long arg) * all of the remounts are serviced (to avoid ops between * mounts to fail) */ - ret = mutex_lock_interruptible(&request_mutex); + ret = mutex_lock_interruptible(&orangefs_request_mutex); if (ret < 0) return ret; gossip_debug(GOSSIP_DEV_DEBUG, @@ -654,7 +665,7 @@ static long dispatch_ioctl_command(unsigned int command, unsigned long arg) gossip_debug(GOSSIP_DEV_DEBUG, "%s: priority remount complete\n", __func__); - mutex_unlock(&request_mutex); + mutex_unlock(&orangefs_request_mutex); return ret; case ORANGEFS_DEV_UPSTREAM: @@ -668,134 +679,11 @@ static long dispatch_ioctl_command(unsigned int command, unsigned long arg) return ret; case ORANGEFS_DEV_CLIENT_MASK: - ret = copy_from_user(&mask2_info, - (void __user *)arg, - sizeof(struct dev_mask2_info_s)); - - if (ret != 0) - return -EIO; - - client_debug_mask.mask1 = mask2_info.mask1_value; - client_debug_mask.mask2 = mask2_info.mask2_value; - - pr_info("%s: client debug mask has been been received " - ":%llx: :%llx:\n", - __func__, - (unsigned long long)client_debug_mask.mask1, - (unsigned long long)client_debug_mask.mask2); - - return ret; - + return orangefs_debugfs_new_client_mask((void __user *)arg); case ORANGEFS_DEV_CLIENT_STRING: - ret = copy_from_user(&client_debug_array_string, - (void __user *)arg, - ORANGEFS_MAX_DEBUG_STRING_LEN); - /* - * The real client-core makes an effort to ensure - * that actual strings that aren't too long to fit in - * this buffer is what we get here. We're going to use - * string functions on the stuff we got, so we'll make - * this extra effort to try and keep from - * flowing out of this buffer when we use the string - * functions, even if somehow the stuff we end up - * with here is garbage. - */ - client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN - 1] = - '\0'; - - if (ret != 0) { - pr_info("%s: CLIENT_STRING: copy_from_user failed\n", - __func__); - return -EIO; - } - - pr_info("%s: client debug array string has been received.\n", - __func__); - - if (!help_string_initialized) { - - /* Free the "we don't know yet" default string... */ - kfree(debug_help_string); - - /* build a proper debug help string */ - if (orangefs_prepare_debugfs_help_string(0)) { - gossip_err("%s: no debug help string \n", - __func__); - return -EIO; - } - - /* Replace the boilerplate boot-time debug-help file. */ - debugfs_remove(help_file_dentry); - - help_file_dentry = - debugfs_create_file( - ORANGEFS_KMOD_DEBUG_HELP_FILE, - 0444, - debug_dir, - debug_help_string, - &debug_help_fops); - - if (!help_file_dentry) { - gossip_err("%s: debugfs_create_file failed for" - " :%s:!\n", - __func__, - ORANGEFS_KMOD_DEBUG_HELP_FILE); - return -EIO; - } - } - - debug_mask_to_string(&client_debug_mask, 1); - - debugfs_remove(client_debug_dentry); - - orangefs_client_debug_init(); - - help_string_initialized++; - - return ret; - + return orangefs_debugfs_new_client_string((void __user *)arg); case ORANGEFS_DEV_DEBUG: - ret = copy_from_user(&mask_info, - (void __user *)arg, - sizeof(mask_info)); - - if (ret != 0) - return -EIO; - - if (mask_info.mask_type == KERNEL_MASK) { - if ((mask_info.mask_value == 0) - && (kernel_mask_set_mod_init)) { - /* - * the kernel debug mask was set when the - * kernel module was loaded; don't override - * it if the client-core was started without - * a value for ORANGEFS_KMODMASK. - */ - return 0; - } - debug_mask_to_string(&mask_info.mask_value, - mask_info.mask_type); - gossip_debug_mask = mask_info.mask_value; - pr_info("%s: kernel debug mask has been modified to " - ":%s: :%llx:\n", - __func__, - kernel_debug_string, - (unsigned long long)gossip_debug_mask); - } else if (mask_info.mask_type == CLIENT_MASK) { - debug_mask_to_string(&mask_info.mask_value, - mask_info.mask_type); - pr_info("%s: client debug mask has been modified to" - ":%s: :%llx:\n", - __func__, - client_debug_string, - llu(mask_info.mask_value)); - } else { - gossip_lerr("Invalid mask type....\n"); - return -EINVAL; - } - - return ret; - + return orangefs_debugfs_new_debug((void __user *)arg); default: return -ENOIOCTLCMD; } diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c index 324f0af..284373a 100644 --- a/fs/orangefs/dir.c +++ b/fs/orangefs/dir.c @@ -177,8 +177,8 @@ static int orangefs_readdir(struct file *file, struct dir_context *ctx) } gossip_debug(GOSSIP_DIR_DEBUG, - "orangefs_readdir called on %s (pos=%llu)\n", - dentry->d_name.name, llu(pos)); + "orangefs_readdir called on %pd (pos=%llu)\n", + dentry, llu(pos)); memset(&readdir_response, 0, sizeof(readdir_response)); diff --git a/fs/orangefs/downcall.h b/fs/orangefs/downcall.h index 66b9921..3b8923f 100644 --- a/fs/orangefs/downcall.h +++ b/fs/orangefs/downcall.h @@ -83,7 +83,10 @@ struct orangefs_listxattr_response { }; struct orangefs_param_response { - __s64 value; + union { + __s64 value64; + __s32 value32[2]; + } u; }; #define PERF_COUNT_BUF_SIZE 4096 @@ -98,6 +101,11 @@ struct orangefs_fs_key_response { char fs_key[FS_KEY_BUF_SIZE]; }; +/* 2.9.6 */ +struct orangefs_features_response { + __u64 features; +}; + struct orangefs_downcall_s { __s32 type; __s32 status; @@ -119,6 +127,7 @@ struct orangefs_downcall_s { struct orangefs_param_response param; struct orangefs_perf_count_response perf_count; struct orangefs_fs_key_response fs_key; + struct orangefs_features_response features; } resp; }; diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 526040e..02cc613 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -14,6 +14,32 @@ #include <linux/fs.h> #include <linux/pagemap.h> +static int flush_racache(struct inode *inode) +{ + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + struct orangefs_kernel_op_s *new_op; + int ret; + + gossip_debug(GOSSIP_UTILS_DEBUG, + "%s: %pU: Handle is %pU | fs_id %d\n", __func__, + get_khandle_from_ino(inode), &orangefs_inode->refn.khandle, + orangefs_inode->refn.fs_id); + + new_op = op_alloc(ORANGEFS_VFS_OP_RA_FLUSH); + if (!new_op) + return -ENOMEM; + new_op->upcall.req.ra_cache_flush.refn = orangefs_inode->refn; + + ret = service_operation(new_op, "orangefs_flush_racache", + get_interruptible_flag(inode)); + + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: got return value of %d\n", + __func__, ret); + + op_release(new_op); + return ret; +} + /* * Copy to client-core's address space from the buffers specified * by the iovec upto total_size bytes. @@ -358,7 +384,7 @@ out: file_accessed(file); } else { SetMtimeFlag(orangefs_inode); - inode->i_mtime = CURRENT_TIME; + inode->i_mtime = current_time(inode); mark_inode_dirty_sync(inode); } } @@ -386,7 +412,7 @@ ssize_t orangefs_inode_read(struct inode *inode, size_t bufmap_size; ssize_t ret = -EINVAL; - g_orangefs_stats.reads++; + orangefs_stats.reads++; bufmap_size = orangefs_bufmap_size_query(); if (count > bufmap_size) { @@ -427,7 +453,7 @@ static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_read_iter\n"); - g_orangefs_stats.reads++; + orangefs_stats.reads++; rc = do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter); iocb->ki_pos = pos; @@ -488,7 +514,7 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite } iocb->ki_pos = pos; - g_orangefs_stats.writes++; + orangefs_stats.writes++; out: @@ -585,21 +611,30 @@ static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma) static int orangefs_file_release(struct inode *inode, struct file *file) { gossip_debug(GOSSIP_FILE_DEBUG, - "orangefs_file_release: called on %s\n", - file->f_path.dentry->d_name.name); + "orangefs_file_release: called on %pD\n", + file); orangefs_flush_inode(inode); /* - * remove all associated inode pages from the page cache and mmap + * remove all associated inode pages from the page cache and * readahead cache (if any); this forces an expensive refresh of * data for the next caller of mmap (or 'get_block' accesses) */ - if (file->f_path.dentry->d_inode && - file->f_path.dentry->d_inode->i_mapping && - mapping_nrpages(&file->f_path.dentry->d_inode->i_data)) - truncate_inode_pages(file->f_path.dentry->d_inode->i_mapping, + if (file_inode(file) && + file_inode(file)->i_mapping && + mapping_nrpages(&file_inode(file)->i_data)) { + if (orangefs_features & ORANGEFS_FEATURE_READAHEAD) { + gossip_debug(GOSSIP_INODE_DEBUG, + "calling flush_racache on %pU\n", + get_khandle_from_ino(inode)); + flush_racache(inode); + gossip_debug(GOSSIP_INODE_DEBUG, + "flush_racache finished\n"); + } + truncate_inode_pages(file_inode(file)->i_mapping, 0); + } return 0; } @@ -613,7 +648,7 @@ static int orangefs_fsync(struct file *file, { int ret = -EINVAL; struct orangefs_inode_s *orangefs_inode = - ORANGEFS_I(file->f_path.dentry->d_inode); + ORANGEFS_I(file_inode(file)); struct orangefs_kernel_op_s *new_op = NULL; /* required call */ @@ -626,7 +661,7 @@ static int orangefs_fsync(struct file *file, ret = service_operation(new_op, "orangefs_fsync", - get_interruptible_flag(file->f_path.dentry->d_inode)); + get_interruptible_flag(file_inode(file))); gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_fsync got return value of %d\n", @@ -634,7 +669,7 @@ static int orangefs_fsync(struct file *file, op_release(new_op); - orangefs_flush_inode(file->f_path.dentry->d_inode); + orangefs_flush_inode(file_inode(file)); return ret; } diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 28a0557..ef3b4eb 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -129,8 +129,8 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { gossip_debug(GOSSIP_INODE_DEBUG, - "orangefs_direct_IO: %s\n", - iocb->ki_filp->f_path.dentry->d_name.name); + "orangefs_direct_IO: %pD\n", + iocb->ki_filp); return -EINVAL; } @@ -216,10 +216,10 @@ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr) struct inode *inode = dentry->d_inode; gossip_debug(GOSSIP_INODE_DEBUG, - "orangefs_setattr: called on %s\n", - dentry->d_name.name); + "orangefs_setattr: called on %pd\n", + dentry); - ret = inode_change_ok(inode, iattr); + ret = setattr_prepare(dentry, iattr); if (ret) goto out; @@ -259,8 +259,8 @@ int orangefs_getattr(struct vfsmount *mnt, struct orangefs_inode_s *orangefs_inode = NULL; gossip_debug(GOSSIP_INODE_DEBUG, - "orangefs_getattr: called on %s\n", - dentry->d_name.name); + "orangefs_getattr: called on %pd\n", + dentry); ret = orangefs_inode_getattr(inode, 0, 0); if (ret == 0) { @@ -296,10 +296,7 @@ const struct inode_operations orangefs_file_inode_operations = { .set_acl = orangefs_set_acl, .setattr = orangefs_setattr, .getattr = orangefs_getattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = orangefs_listxattr, - .removexattr = generic_removexattr, .permission = orangefs_permission, }; @@ -438,7 +435,7 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inode->i_size = PAGE_SIZE; inode->i_rdev = dev; diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 62c5259..a290ff6 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -24,9 +24,9 @@ static int orangefs_create(struct inode *dir, struct inode *inode; int ret; - gossip_debug(GOSSIP_NAME_DEBUG, "%s: %s\n", + gossip_debug(GOSSIP_NAME_DEBUG, "%s: %pd\n", __func__, - dentry->d_name.name); + dentry); new_op = op_alloc(ORANGEFS_VFS_OP_CREATE); if (!new_op) @@ -43,9 +43,9 @@ static int orangefs_create(struct inode *dir, ret = service_operation(new_op, __func__, get_interruptible_flag(dir)); gossip_debug(GOSSIP_NAME_DEBUG, - "%s: %s: handle:%pU: fsid:%d: new_op:%p: ret:%d:\n", + "%s: %pd: handle:%pU: fsid:%d: new_op:%p: ret:%d:\n", __func__, - dentry->d_name.name, + dentry, &new_op->downcall.resp.create.refn.khandle, new_op->downcall.resp.create.refn.fs_id, new_op, @@ -57,39 +57,39 @@ static int orangefs_create(struct inode *dir, inode = orangefs_new_inode(dir->i_sb, dir, S_IFREG | mode, 0, &new_op->downcall.resp.create.refn); if (IS_ERR(inode)) { - gossip_err("%s: Failed to allocate inode for file :%s:\n", + gossip_err("%s: Failed to allocate inode for file :%pd:\n", __func__, - dentry->d_name.name); + dentry); ret = PTR_ERR(inode); goto out; } gossip_debug(GOSSIP_NAME_DEBUG, - "%s: Assigned inode :%pU: for file :%s:\n", + "%s: Assigned inode :%pU: for file :%pd:\n", __func__, get_khandle_from_ino(inode), - dentry->d_name.name); + dentry); d_instantiate(dentry, inode); unlock_new_inode(inode); - dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000; + orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; gossip_debug(GOSSIP_NAME_DEBUG, - "%s: dentry instantiated for %s\n", + "%s: dentry instantiated for %pd\n", __func__, - dentry->d_name.name); + dentry); SetMtimeFlag(parent); - dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb); + dir->i_mtime = dir->i_ctime = current_time(dir); mark_inode_dirty_sync(dir); ret = 0; out: op_release(new_op); gossip_debug(GOSSIP_NAME_DEBUG, - "%s: %s: returning %d\n", + "%s: %pd: returning %d\n", __func__, - dentry->d_name.name, + dentry, ret); return ret; } @@ -115,8 +115,8 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry, * -EEXIST on O_EXCL opens, which is broken if we skip this lookup * in the create path) */ - gossip_debug(GOSSIP_NAME_DEBUG, "%s called on %s\n", - __func__, dentry->d_name.name); + gossip_debug(GOSSIP_NAME_DEBUG, "%s called on %pd\n", + __func__, dentry); if (dentry->d_name.len > (ORANGEFS_NAME_MAX - 1)) return ERR_PTR(-ENAMETOOLONG); @@ -169,9 +169,9 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry, gossip_debug(GOSSIP_NAME_DEBUG, "orangefs_lookup: Adding *negative* dentry " - "%p for %s\n", + "%p for %pd\n", dentry, - dentry->d_name.name); + dentry); d_add(dentry, NULL); res = NULL; @@ -183,7 +183,7 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry, goto out; } - dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000; + orangefs_set_timeout(dentry); inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn); if (IS_ERR(inode)) { @@ -224,10 +224,10 @@ static int orangefs_unlink(struct inode *dir, struct dentry *dentry) int ret; gossip_debug(GOSSIP_NAME_DEBUG, - "%s: called on %s\n" + "%s: called on %pd\n" " (inode %pU): Parent is %pU | fs_id %d\n", __func__, - dentry->d_name.name, + dentry, get_khandle_from_ino(inode), &parent->refn.khandle, parent->refn.fs_id); @@ -254,7 +254,7 @@ static int orangefs_unlink(struct inode *dir, struct dentry *dentry) drop_nlink(inode); SetMtimeFlag(parent); - dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb); + dir->i_mtime = dir->i_ctime = current_time(dir); mark_inode_dirty_sync(dir); } return ret; @@ -322,16 +322,16 @@ static int orangefs_symlink(struct inode *dir, d_instantiate(dentry, inode); unlock_new_inode(inode); - dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000; + orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; gossip_debug(GOSSIP_NAME_DEBUG, - "Inode (Symlink) %pU -> %s\n", + "Inode (Symlink) %pU -> %pd\n", get_khandle_from_ino(inode), - dentry->d_name.name); + dentry); SetMtimeFlag(parent); - dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb); + dir->i_mtime = dir->i_ctime = current_time(dir); mark_inode_dirty_sync(dir); ret = 0; out: @@ -386,20 +386,20 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode d_instantiate(dentry, inode); unlock_new_inode(inode); - dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000; + orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; gossip_debug(GOSSIP_NAME_DEBUG, - "Inode (Directory) %pU -> %s\n", + "Inode (Directory) %pU -> %pd\n", get_khandle_from_ino(inode), - dentry->d_name.name); + dentry); /* * NOTE: we have no good way to keep nlink consistent for directories * across clients; keep constant at 1. */ SetMtimeFlag(parent); - dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb); + dir->i_mtime = dir->i_ctime = current_time(dir); mark_inode_dirty_sync(dir); out: op_release(new_op); @@ -409,11 +409,15 @@ out: static int orangefs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, - struct dentry *new_dentry) + struct dentry *new_dentry, + unsigned int flags) { struct orangefs_kernel_op_s *new_op; int ret; + if (flags) + return -EINVAL; + gossip_debug(GOSSIP_NAME_DEBUG, "orangefs_rename: called (%pd2 => %pd2) ct=%d\n", old_dentry, new_dentry, d_count(new_dentry)); @@ -443,7 +447,7 @@ static int orangefs_rename(struct inode *old_dir, ret); if (new_dentry->d_inode) - new_dentry->d_inode->i_ctime = CURRENT_TIME; + new_dentry->d_inode->i_ctime = current_time(new_dentry->d_inode); op_release(new_op); return ret; @@ -462,9 +466,6 @@ const struct inode_operations orangefs_dir_inode_operations = { .rename = orangefs_rename, .setattr = orangefs_setattr, .getattr = orangefs_getattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .removexattr = generic_removexattr, .listxattr = orangefs_listxattr, .permission = orangefs_permission, }; diff --git a/fs/orangefs/orangefs-cache.c b/fs/orangefs/orangefs-cache.c index b6edbe9..aa3830b 100644 --- a/fs/orangefs/orangefs-cache.c +++ b/fs/orangefs/orangefs-cache.c @@ -73,8 +73,8 @@ char *get_opname_string(struct orangefs_kernel_op_s *new_op) return "OP_STATFS"; else if (type == ORANGEFS_VFS_OP_TRUNCATE) return "OP_TRUNCATE"; - else if (type == ORANGEFS_VFS_OP_MMAP_RA_FLUSH) - return "OP_MMAP_RA_FLUSH"; + else if (type == ORANGEFS_VFS_OP_RA_FLUSH) + return "OP_RA_FLUSH"; else if (type == ORANGEFS_VFS_OP_FS_MOUNT) return "OP_FS_MOUNT"; else if (type == ORANGEFS_VFS_OP_FS_UMOUNT) @@ -97,6 +97,8 @@ char *get_opname_string(struct orangefs_kernel_op_s *new_op) return "OP_FSYNC"; else if (type == ORANGEFS_VFS_OP_FSKEY) return "OP_FSKEY"; + else if (type == ORANGEFS_VFS_OP_FEATURES) + return "OP_FEATURES"; } return "OP_UNKNOWN?"; } diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index 1714a73..eb09aa0 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -43,36 +43,35 @@ #include "protocol.h" #include "orangefs-kernel.h" -static int orangefs_debug_disabled = 1; - -static int orangefs_debug_help_open(struct inode *, struct file *); +#define DEBUG_HELP_STRING_SIZE 4096 +#define HELP_STRING_UNINITIALIZED \ + "Client Debug Keywords are unknown until the first time\n" \ + "the client is started after boot.\n" +#define ORANGEFS_KMOD_DEBUG_HELP_FILE "debug-help" +#define ORANGEFS_KMOD_DEBUG_FILE "kernel-debug" +#define ORANGEFS_CLIENT_DEBUG_FILE "client-debug" +#define ORANGEFS_VERBOSE "verbose" +#define ORANGEFS_ALL "all" -const struct file_operations debug_help_fops = { - .open = orangefs_debug_help_open, - .read = seq_read, - .release = seq_release, - .llseek = seq_lseek, +/* + * An array of client_debug_mask will be built to hold debug keyword/mask + * values fetched from userspace. + */ +struct client_debug_mask { + char *keyword; + __u64 mask1; + __u64 mask2; }; +static int orangefs_kernel_debug_init(void); + +static int orangefs_debug_help_open(struct inode *, struct file *); static void *help_start(struct seq_file *, loff_t *); static void *help_next(struct seq_file *, void *, loff_t *); static void help_stop(struct seq_file *, void *); static int help_show(struct seq_file *, void *); -static const struct seq_operations help_debug_ops = { - .start = help_start, - .next = help_next, - .stop = help_stop, - .show = help_show, -}; - -/* - * Used to protect data in ORANGEFS_KMOD_DEBUG_FILE and - * ORANGEFS_KMOD_DEBUG_FILE. - */ -static DEFINE_MUTEX(orangefs_debug_lock); - -int orangefs_debug_open(struct inode *, struct file *); +static int orangefs_debug_open(struct inode *, struct file *); static ssize_t orangefs_debug_read(struct file *, char __user *, @@ -84,6 +83,43 @@ static ssize_t orangefs_debug_write(struct file *, size_t, loff_t *); +static int orangefs_prepare_cdm_array(char *); +static void debug_mask_to_string(void *, int); +static void do_k_string(void *, int); +static void do_c_string(void *, int); +static int keyword_is_amalgam(char *); +static int check_amalgam_keyword(void *, int); +static void debug_string_to_mask(char *, void *, int); +static void do_c_mask(int, char *, struct client_debug_mask **); +static void do_k_mask(int, char *, __u64 **); + +static char kernel_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN] = "none"; +static char *debug_help_string; +static char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN]; +static char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN]; + +static struct dentry *help_file_dentry; +static struct dentry *client_debug_dentry; +static struct dentry *debug_dir; + +static unsigned int kernel_mask_set_mod_init; +static int orangefs_debug_disabled = 1; +static int help_string_initialized; + +static const struct seq_operations help_debug_ops = { + .start = help_start, + .next = help_next, + .stop = help_stop, + .show = help_show, +}; + +const struct file_operations debug_help_fops = { + .open = orangefs_debug_help_open, + .read = seq_read, + .release = seq_release, + .llseek = seq_lseek, +}; + static const struct file_operations kernel_debug_fops = { .open = orangefs_debug_open, .read = orangefs_debug_read, @@ -91,15 +127,55 @@ static const struct file_operations kernel_debug_fops = { .llseek = generic_file_llseek, }; +static int client_all_index; +static int client_verbose_index; + +static struct client_debug_mask *cdm_array; +static int cdm_element_count; + +static struct client_debug_mask client_debug_mask; + +/* + * Used to protect data in ORANGEFS_KMOD_DEBUG_FILE and + * ORANGEFS_KMOD_DEBUG_FILE. + */ +static DEFINE_MUTEX(orangefs_debug_lock); + /* * initialize kmod debug operations, create orangefs debugfs dir and * ORANGEFS_KMOD_DEBUG_HELP_FILE. */ -int orangefs_debugfs_init(void) +int orangefs_debugfs_init(int debug_mask) { - int rc = -ENOMEM; + /* convert input debug mask to a 64-bit unsigned integer */ + orangefs_gossip_debug_mask = (unsigned long long)debug_mask; + + /* + * set the kernel's gossip debug string; invalid mask values will + * be ignored. + */ + debug_mask_to_string(&orangefs_gossip_debug_mask, 0); + + /* remove any invalid values from the mask */ + debug_string_to_mask(kernel_debug_string, &orangefs_gossip_debug_mask, + 0); + + /* + * if the mask has a non-zero value, then indicate that the mask + * was set when the kernel module was loaded. The orangefs dev ioctl + * command will look at this boolean to determine if the kernel's + * debug mask should be overwritten when the client-core is started. + */ + if (orangefs_gossip_debug_mask != 0) + kernel_mask_set_mod_init = true; + + pr_info("%s: called with debug mask: :%s: :%llx:\n", + __func__, + kernel_debug_string, + (unsigned long long)orangefs_gossip_debug_mask); + debug_dir = debugfs_create_dir("orangefs", NULL); if (!debug_dir) { pr_info("%s: debugfs_create_dir failed.\n", __func__); @@ -117,13 +193,58 @@ int orangefs_debugfs_init(void) } orangefs_debug_disabled = 0; + + rc = orangefs_kernel_debug_init(); + +out: + + return rc; +} + +/* + * initialize the kernel-debug file. + */ +static int orangefs_kernel_debug_init(void) +{ + int rc = -ENOMEM; + struct dentry *ret; + char *k_buffer = NULL; + + gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__); + + k_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL); + if (!k_buffer) + goto out; + + if (strlen(kernel_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) { + strcpy(k_buffer, kernel_debug_string); + strcat(k_buffer, "\n"); + } else { + strcpy(k_buffer, "none\n"); + pr_info("%s: overflow 1!\n", __func__); + } + + ret = debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE, + 0444, + debug_dir, + k_buffer, + &kernel_debug_fops); + if (!ret) { + pr_info("%s: failed to create %s.\n", + __func__, + ORANGEFS_KMOD_DEBUG_FILE); + goto out; + } + rc = 0; out: + gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc); return rc; } + void orangefs_debugfs_cleanup(void) { debugfs_remove_recursive(debug_dir); @@ -196,49 +317,6 @@ static int help_show(struct seq_file *m, void *v) } /* - * initialize the kernel-debug file. - */ -int orangefs_kernel_debug_init(void) -{ - int rc = -ENOMEM; - struct dentry *ret; - char *k_buffer = NULL; - - gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__); - - k_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL); - if (!k_buffer) - goto out; - - if (strlen(kernel_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) { - strcpy(k_buffer, kernel_debug_string); - strcat(k_buffer, "\n"); - } else { - strcpy(k_buffer, "none\n"); - pr_info("%s: overflow 1!\n", __func__); - } - - ret = debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE, - 0444, - debug_dir, - k_buffer, - &kernel_debug_fops); - if (!ret) { - pr_info("%s: failed to create %s.\n", - __func__, - ORANGEFS_KMOD_DEBUG_FILE); - goto out; - } - - rc = 0; - -out: - - gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc); - return rc; -} - -/* * initialize the client-debug file. */ int orangefs_client_debug_init(void) @@ -282,7 +360,7 @@ out: } /* open ORANGEFS_KMOD_DEBUG_FILE or ORANGEFS_CLIENT_DEBUG_FILE.*/ -int orangefs_debug_open(struct inode *inode, struct file *file) +static int orangefs_debug_open(struct inode *inode, struct file *file) { int rc = -ENODEV; @@ -350,8 +428,8 @@ static ssize_t orangefs_debug_write(struct file *file, struct client_debug_mask c_mask = { NULL, 0, 0 }; gossip_debug(GOSSIP_DEBUGFS_DEBUG, - "orangefs_debug_write: %s\n", - file->f_path.dentry->d_name.name); + "orangefs_debug_write: %pD\n", + file); /* * Thwart users who try to jamb a ridiculous number @@ -384,8 +462,8 @@ static ssize_t orangefs_debug_write(struct file *file, */ if (!strcmp(file->f_path.dentry->d_name.name, ORANGEFS_KMOD_DEBUG_FILE)) { - debug_string_to_mask(buf, &gossip_debug_mask, 0); - debug_mask_to_string(&gossip_debug_mask, 0); + debug_string_to_mask(buf, &orangefs_gossip_debug_mask, 0); + debug_mask_to_string(&orangefs_gossip_debug_mask, 0); debug_string = kernel_debug_string; gossip_debug(GOSSIP_DEBUGFS_DEBUG, "New kernel debug string is %s\n", @@ -452,3 +530,546 @@ out: kfree(buf); return rc; } + +/* + * After obtaining a string representation of the client's debug + * keywords and their associated masks, this function is called to build an + * array of these values. + */ +static int orangefs_prepare_cdm_array(char *debug_array_string) +{ + int i; + int rc = -EINVAL; + char *cds_head = NULL; + char *cds_delimiter = NULL; + int keyword_len = 0; + + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__); + + /* + * figure out how many elements the cdm_array needs. + */ + for (i = 0; i < strlen(debug_array_string); i++) + if (debug_array_string[i] == '\n') + cdm_element_count++; + + if (!cdm_element_count) { + pr_info("No elements in client debug array string!\n"); + goto out; + } + + cdm_array = + kzalloc(cdm_element_count * sizeof(struct client_debug_mask), + GFP_KERNEL); + if (!cdm_array) { + pr_info("malloc failed for cdm_array!\n"); + rc = -ENOMEM; + goto out; + } + + cds_head = debug_array_string; + + for (i = 0; i < cdm_element_count; i++) { + cds_delimiter = strchr(cds_head, '\n'); + *cds_delimiter = '\0'; + + keyword_len = strcspn(cds_head, " "); + + cdm_array[i].keyword = kzalloc(keyword_len + 1, GFP_KERNEL); + if (!cdm_array[i].keyword) { + rc = -ENOMEM; + goto out; + } + + sscanf(cds_head, + "%s %llx %llx", + cdm_array[i].keyword, + (unsigned long long *)&(cdm_array[i].mask1), + (unsigned long long *)&(cdm_array[i].mask2)); + + if (!strcmp(cdm_array[i].keyword, ORANGEFS_VERBOSE)) + client_verbose_index = i; + + if (!strcmp(cdm_array[i].keyword, ORANGEFS_ALL)) + client_all_index = i; + + cds_head = cds_delimiter + 1; + } + + rc = cdm_element_count; + + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: rc:%d:\n", __func__, rc); + +out: + + return rc; + +} + +/* + * /sys/kernel/debug/orangefs/debug-help can be catted to + * see all the available kernel and client debug keywords. + * + * When the kernel boots, we have no idea what keywords the + * client supports, nor their associated masks. + * + * We pass through this function once at boot and stamp a + * boilerplate "we don't know" message for the client in the + * debug-help file. We pass through here again when the client + * starts and then we can fill out the debug-help file fully. + * + * The client might be restarted any number of times between + * reboots, we only build the debug-help file the first time. + */ +int orangefs_prepare_debugfs_help_string(int at_boot) +{ + int rc = -EINVAL; + int i; + int byte_count = 0; + char *client_title = "Client Debug Keywords:\n"; + char *kernel_title = "Kernel Debug Keywords:\n"; + + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__); + + if (at_boot) { + byte_count += strlen(HELP_STRING_UNINITIALIZED); + client_title = HELP_STRING_UNINITIALIZED; + } else { + /* + * fill the client keyword/mask array and remember + * how many elements there were. + */ + cdm_element_count = + orangefs_prepare_cdm_array(client_debug_array_string); + if (cdm_element_count <= 0) + goto out; + + /* Count the bytes destined for debug_help_string. */ + byte_count += strlen(client_title); + + for (i = 0; i < cdm_element_count; i++) { + byte_count += strlen(cdm_array[i].keyword + 2); + if (byte_count >= DEBUG_HELP_STRING_SIZE) { + pr_info("%s: overflow 1!\n", __func__); + goto out; + } + } + + gossip_debug(GOSSIP_UTILS_DEBUG, + "%s: cdm_element_count:%d:\n", + __func__, + cdm_element_count); + } + + byte_count += strlen(kernel_title); + for (i = 0; i < num_kmod_keyword_mask_map; i++) { + byte_count += + strlen(s_kmod_keyword_mask_map[i].keyword + 2); + if (byte_count >= DEBUG_HELP_STRING_SIZE) { + pr_info("%s: overflow 2!\n", __func__); + goto out; + } + } + + /* build debug_help_string. */ + debug_help_string = kzalloc(DEBUG_HELP_STRING_SIZE, GFP_KERNEL); + if (!debug_help_string) { + rc = -ENOMEM; + goto out; + } + + strcat(debug_help_string, client_title); + + if (!at_boot) { + for (i = 0; i < cdm_element_count; i++) { + strcat(debug_help_string, "\t"); + strcat(debug_help_string, cdm_array[i].keyword); + strcat(debug_help_string, "\n"); + } + } + + strcat(debug_help_string, "\n"); + strcat(debug_help_string, kernel_title); + + for (i = 0; i < num_kmod_keyword_mask_map; i++) { + strcat(debug_help_string, "\t"); + strcat(debug_help_string, s_kmod_keyword_mask_map[i].keyword); + strcat(debug_help_string, "\n"); + } + + rc = 0; + +out: + + return rc; + +} + +/* + * kernel = type 0 + * client = type 1 + */ +static void debug_mask_to_string(void *mask, int type) +{ + int i; + int len = 0; + char *debug_string; + int element_count = 0; + + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__); + + if (type) { + debug_string = client_debug_string; + element_count = cdm_element_count; + } else { + debug_string = kernel_debug_string; + element_count = num_kmod_keyword_mask_map; + } + + memset(debug_string, 0, ORANGEFS_MAX_DEBUG_STRING_LEN); + + /* + * Some keywords, like "all" or "verbose", are amalgams of + * numerous other keywords. Make a special check for those + * before grinding through the whole mask only to find out + * later... + */ + if (check_amalgam_keyword(mask, type)) + goto out; + + /* Build the debug string. */ + for (i = 0; i < element_count; i++) + if (type) + do_c_string(mask, i); + else + do_k_string(mask, i); + + len = strlen(debug_string); + + if ((len) && (type)) + client_debug_string[len - 1] = '\0'; + else if (len) + kernel_debug_string[len - 1] = '\0'; + else if (type) + strcpy(client_debug_string, "none"); + else + strcpy(kernel_debug_string, "none"); + +out: +gossip_debug(GOSSIP_UTILS_DEBUG, "%s: string:%s:\n", __func__, debug_string); + + return; + +} + +static void do_k_string(void *k_mask, int index) +{ + __u64 *mask = (__u64 *) k_mask; + + if (keyword_is_amalgam((char *) s_kmod_keyword_mask_map[index].keyword)) + goto out; + + if (*mask & s_kmod_keyword_mask_map[index].mask_val) { + if ((strlen(kernel_debug_string) + + strlen(s_kmod_keyword_mask_map[index].keyword)) + < ORANGEFS_MAX_DEBUG_STRING_LEN - 1) { + strcat(kernel_debug_string, + s_kmod_keyword_mask_map[index].keyword); + strcat(kernel_debug_string, ","); + } else { + gossip_err("%s: overflow!\n", __func__); + strcpy(kernel_debug_string, ORANGEFS_ALL); + goto out; + } + } + +out: + + return; +} + +static void do_c_string(void *c_mask, int index) +{ + struct client_debug_mask *mask = (struct client_debug_mask *) c_mask; + + if (keyword_is_amalgam(cdm_array[index].keyword)) + goto out; + + if ((mask->mask1 & cdm_array[index].mask1) || + (mask->mask2 & cdm_array[index].mask2)) { + if ((strlen(client_debug_string) + + strlen(cdm_array[index].keyword) + 1) + < ORANGEFS_MAX_DEBUG_STRING_LEN - 2) { + strcat(client_debug_string, + cdm_array[index].keyword); + strcat(client_debug_string, ","); + } else { + gossip_err("%s: overflow!\n", __func__); + strcpy(client_debug_string, ORANGEFS_ALL); + goto out; + } + } +out: + return; +} + +static int keyword_is_amalgam(char *keyword) +{ + int rc = 0; + + if ((!strcmp(keyword, ORANGEFS_ALL)) || (!strcmp(keyword, ORANGEFS_VERBOSE))) + rc = 1; + + return rc; +} + +/* + * kernel = type 0 + * client = type 1 + * + * return 1 if we found an amalgam. + */ +static int check_amalgam_keyword(void *mask, int type) +{ + __u64 *k_mask; + struct client_debug_mask *c_mask; + int k_all_index = num_kmod_keyword_mask_map - 1; + int rc = 0; + + if (type) { + c_mask = (struct client_debug_mask *) mask; + + if ((c_mask->mask1 == cdm_array[client_all_index].mask1) && + (c_mask->mask2 == cdm_array[client_all_index].mask2)) { + strcpy(client_debug_string, ORANGEFS_ALL); + rc = 1; + goto out; + } + + if ((c_mask->mask1 == cdm_array[client_verbose_index].mask1) && + (c_mask->mask2 == cdm_array[client_verbose_index].mask2)) { + strcpy(client_debug_string, ORANGEFS_VERBOSE); + rc = 1; + goto out; + } + + } else { + k_mask = (__u64 *) mask; + + if (*k_mask >= s_kmod_keyword_mask_map[k_all_index].mask_val) { + strcpy(kernel_debug_string, ORANGEFS_ALL); + rc = 1; + goto out; + } + } + +out: + + return rc; +} + +/* + * kernel = type 0 + * client = type 1 + */ +static void debug_string_to_mask(char *debug_string, void *mask, int type) +{ + char *unchecked_keyword; + int i; + char *strsep_fodder = kstrdup(debug_string, GFP_KERNEL); + char *original_pointer; + int element_count = 0; + struct client_debug_mask *c_mask = NULL; + __u64 *k_mask = NULL; + + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__); + + if (type) { + c_mask = (struct client_debug_mask *)mask; + element_count = cdm_element_count; + } else { + k_mask = (__u64 *)mask; + *k_mask = 0; + element_count = num_kmod_keyword_mask_map; + } + + original_pointer = strsep_fodder; + while ((unchecked_keyword = strsep(&strsep_fodder, ","))) + if (strlen(unchecked_keyword)) { + for (i = 0; i < element_count; i++) + if (type) + do_c_mask(i, + unchecked_keyword, + &c_mask); + else + do_k_mask(i, + unchecked_keyword, + &k_mask); + } + + kfree(original_pointer); +} + +static void do_c_mask(int i, char *unchecked_keyword, + struct client_debug_mask **sane_mask) +{ + + if (!strcmp(cdm_array[i].keyword, unchecked_keyword)) { + (**sane_mask).mask1 = (**sane_mask).mask1 | cdm_array[i].mask1; + (**sane_mask).mask2 = (**sane_mask).mask2 | cdm_array[i].mask2; + } +} + +static void do_k_mask(int i, char *unchecked_keyword, __u64 **sane_mask) +{ + + if (!strcmp(s_kmod_keyword_mask_map[i].keyword, unchecked_keyword)) + **sane_mask = (**sane_mask) | + s_kmod_keyword_mask_map[i].mask_val; +} + +int orangefs_debugfs_new_client_mask(void __user *arg) +{ + struct dev_mask2_info_s mask2_info = {0}; + int ret; + + ret = copy_from_user(&mask2_info, + (void __user *)arg, + sizeof(struct dev_mask2_info_s)); + + if (ret != 0) + return -EIO; + + client_debug_mask.mask1 = mask2_info.mask1_value; + client_debug_mask.mask2 = mask2_info.mask2_value; + + pr_info("%s: client debug mask has been been received " + ":%llx: :%llx:\n", + __func__, + (unsigned long long)client_debug_mask.mask1, + (unsigned long long)client_debug_mask.mask2); + + return ret; +} + +int orangefs_debugfs_new_client_string(void __user *arg) +{ + int ret; + + ret = copy_from_user(&client_debug_array_string, + (void __user *)arg, + ORANGEFS_MAX_DEBUG_STRING_LEN); + if (ret != 0) + return -EIO; + + /* + * The real client-core makes an effort to ensure + * that actual strings that aren't too long to fit in + * this buffer is what we get here. We're going to use + * string functions on the stuff we got, so we'll make + * this extra effort to try and keep from + * flowing out of this buffer when we use the string + * functions, even if somehow the stuff we end up + * with here is garbage. + */ + client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN - 1] = + '\0'; + + if (ret != 0) { + pr_info("%s: CLIENT_STRING: copy_from_user failed\n", + __func__); + return -EIO; + } + + pr_info("%s: client debug array string has been received.\n", + __func__); + + if (!help_string_initialized) { + + /* Free the "we don't know yet" default string... */ + kfree(debug_help_string); + + /* build a proper debug help string */ + if (orangefs_prepare_debugfs_help_string(0)) { + gossip_err("%s: no debug help string \n", + __func__); + return -EIO; + } + + /* Replace the boilerplate boot-time debug-help file. */ + debugfs_remove(help_file_dentry); + + help_file_dentry = + debugfs_create_file( + ORANGEFS_KMOD_DEBUG_HELP_FILE, + 0444, + debug_dir, + debug_help_string, + &debug_help_fops); + + if (!help_file_dentry) { + gossip_err("%s: debugfs_create_file failed for" + " :%s:!\n", + __func__, + ORANGEFS_KMOD_DEBUG_HELP_FILE); + return -EIO; + } + } + + debug_mask_to_string(&client_debug_mask, 1); + + debugfs_remove(client_debug_dentry); + + orangefs_client_debug_init(); + + help_string_initialized++; + + return ret; +} + +int orangefs_debugfs_new_debug(void __user *arg) +{ + struct dev_mask_info_s mask_info = {0}; + int ret; + + ret = copy_from_user(&mask_info, + (void __user *)arg, + sizeof(mask_info)); + + if (ret != 0) + return -EIO; + + if (mask_info.mask_type == KERNEL_MASK) { + if ((mask_info.mask_value == 0) + && (kernel_mask_set_mod_init)) { + /* + * the kernel debug mask was set when the + * kernel module was loaded; don't override + * it if the client-core was started without + * a value for ORANGEFS_KMODMASK. + */ + return 0; + } + debug_mask_to_string(&mask_info.mask_value, + mask_info.mask_type); + orangefs_gossip_debug_mask = mask_info.mask_value; + pr_info("%s: kernel debug mask has been modified to " + ":%s: :%llx:\n", + __func__, + kernel_debug_string, + (unsigned long long)orangefs_gossip_debug_mask); + } else if (mask_info.mask_type == CLIENT_MASK) { + debug_mask_to_string(&mask_info.mask_value, + mask_info.mask_type); + pr_info("%s: client debug mask has been modified to" + ":%s: :%llx:\n", + __func__, + client_debug_string, + llu(mask_info.mask_value)); + } else { + gossip_lerr("Invalid mask type....\n"); + return -EINVAL; + } + + return ret; +} diff --git a/fs/orangefs/orangefs-debugfs.h b/fs/orangefs/orangefs-debugfs.h index e4828c0..8035172 100644 --- a/fs/orangefs/orangefs-debugfs.h +++ b/fs/orangefs/orangefs-debugfs.h @@ -1,3 +1,7 @@ -int orangefs_debugfs_init(void); -int orangefs_kernel_debug_init(void); +int orangefs_debugfs_init(int); void orangefs_debugfs_cleanup(void); +int orangefs_client_debug_init(void); +int orangefs_prepare_debugfs_help_string(int); +int orangefs_debugfs_new_client_mask(void __user *); +int orangefs_debugfs_new_client_string(void __user *); +int orangefs_debugfs_new_debug(void __user *); diff --git a/fs/orangefs/orangefs-dev-proto.h b/fs/orangefs/orangefs-dev-proto.h index 9eac9d9..a3d84ff 100644 --- a/fs/orangefs/orangefs-dev-proto.h +++ b/fs/orangefs/orangefs-dev-proto.h @@ -28,7 +28,7 @@ #define ORANGEFS_VFS_OP_RENAME 0xFF00000A #define ORANGEFS_VFS_OP_STATFS 0xFF00000B #define ORANGEFS_VFS_OP_TRUNCATE 0xFF00000C -#define ORANGEFS_VFS_OP_MMAP_RA_FLUSH 0xFF00000D +#define ORANGEFS_VFS_OP_RA_FLUSH 0xFF00000D #define ORANGEFS_VFS_OP_FS_MOUNT 0xFF00000E #define ORANGEFS_VFS_OP_FS_UMOUNT 0xFF00000F #define ORANGEFS_VFS_OP_GETXATTR 0xFF000010 @@ -41,6 +41,10 @@ #define ORANGEFS_VFS_OP_FSYNC 0xFF00EE01 #define ORANGEFS_VFS_OP_FSKEY 0xFF00EE02 #define ORANGEFS_VFS_OP_READDIRPLUS 0xFF00EE03 +#define ORANGEFS_VFS_OP_FEATURES 0xFF00EE05 /* 2.9.6 */ + +/* features is a 64-bit unsigned bitmask */ +#define ORANGEFS_FEATURE_READAHEAD 1 /* * Misc constants. Please retain them as multiples of 8! diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 633c07a..3bf803d 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -100,16 +100,6 @@ enum orangefs_vfs_op_states { }; /* - * An array of client_debug_mask will be built to hold debug keyword/mask - * values fetched from userspace. - */ -struct client_debug_mask { - char *keyword; - __u64 mask1; - __u64 mask2; -}; - -/* * orangefs kernel memory related flags */ @@ -119,29 +109,6 @@ struct client_debug_mask { #define ORANGEFS_CACHE_CREATE_FLAGS 0 #endif /* ((defined ORANGEFS_KERNEL_DEBUG) && (defined CONFIG_DEBUG_SLAB)) */ -/* these functions are defined in orangefs-utils.c */ -int orangefs_prepare_cdm_array(char *debug_array_string); -int orangefs_prepare_debugfs_help_string(int); - -/* defined in orangefs-debugfs.c */ -int orangefs_client_debug_init(void); - -void debug_string_to_mask(char *, void *, int); -void do_c_mask(int, char *, struct client_debug_mask **); -void do_k_mask(int, char *, __u64 **); - -void debug_mask_to_string(void *, int); -void do_k_string(void *, int); -void do_c_string(void *, int); -int check_amalgam_keyword(void *, int); -int keyword_is_amalgam(char *); - -/*these variables are defined in orangefs-mod.c */ -extern char kernel_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN]; -extern char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN]; -extern char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN]; -extern unsigned int kernel_mask_set_mod_init; - extern int orangefs_init_acl(struct inode *inode, struct inode *dir); extern const struct xattr_handler *orangefs_xattr_handlers[]; @@ -331,7 +298,7 @@ struct orangefs_stats { unsigned long writes; }; -extern struct orangefs_stats g_orangefs_stats; +extern struct orangefs_stats orangefs_stats; /* * NOTE: See Documentation/filesystems/porting for information @@ -447,6 +414,8 @@ void purge_waiting_ops(void); /* * defined in super.c */ +extern uint64_t orangefs_features; + struct dentry *orangefs_mount(struct file_system_type *fst, int flags, const char *devname, @@ -506,6 +475,8 @@ ssize_t orangefs_inode_read(struct inode *inode, /* * defined in devorangefs-req.c */ +extern uint32_t orangefs_userspace_version; + int orangefs_dev_init(void); void orangefs_dev_cleanup(void); int is_daemon_in_service(void); @@ -543,20 +514,18 @@ bool orangefs_cancel_op_in_progress(struct orangefs_kernel_op_s *op); int orangefs_normalize_to_errno(__s32 error_code); -extern struct mutex devreq_mutex; -extern struct mutex request_mutex; -extern int debug; +extern struct mutex orangefs_request_mutex; extern int op_timeout_secs; extern int slot_timeout_secs; -extern int dcache_timeout_msecs; -extern int getattr_timeout_msecs; +extern int orangefs_dcache_timeout_msecs; +extern int orangefs_getattr_timeout_msecs; extern struct list_head orangefs_superblocks; extern spinlock_t orangefs_superblocks_lock; extern struct list_head orangefs_request_list; extern spinlock_t orangefs_request_list_lock; extern wait_queue_head_t orangefs_request_list_waitq; -extern struct list_head *htable_ops_in_progress; -extern spinlock_t htable_ops_in_progress_lock; +extern struct list_head *orangefs_htable_ops_in_progress; +extern spinlock_t orangefs_htable_ops_in_progress_lock; extern int hash_table_size; extern const struct address_space_operations orangefs_address_operations; @@ -611,4 +580,11 @@ static inline void orangefs_i_size_write(struct inode *inode, loff_t i_size) #endif } +static inline void orangefs_set_timeout(struct dentry *dentry) +{ + unsigned long time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; + + dentry->d_fsdata = (void *) time; +} + #endif /* __ORANGEFSKERNEL_H */ diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c index e9fd575..2e5b030 100644 --- a/fs/orangefs/orangefs-mod.c +++ b/fs/orangefs/orangefs-mod.c @@ -21,34 +21,17 @@ * global variables declared here */ -/* array of client debug keyword/mask values */ -struct client_debug_mask *cdm_array; -int cdm_element_count; - -char kernel_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN] = "none"; -char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN]; -char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN]; - -char *debug_help_string; -int help_string_initialized; -struct dentry *help_file_dentry; -struct dentry *client_debug_dentry; -struct dentry *debug_dir; -int client_verbose_index; -int client_all_index; -struct orangefs_stats g_orangefs_stats; +struct orangefs_stats orangefs_stats; /* the size of the hash tables for ops in progress */ int hash_table_size = 509; static ulong module_parm_debug_mask; -__u64 gossip_debug_mask; -struct client_debug_mask client_debug_mask = { NULL, 0, 0 }; -unsigned int kernel_mask_set_mod_init; /* implicitly false */ +__u64 orangefs_gossip_debug_mask; int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS; int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS; -int dcache_timeout_msecs = 50; -int getattr_timeout_msecs = 50; +int orangefs_dcache_timeout_msecs = 50; +int orangefs_getattr_timeout_msecs = 50; MODULE_LICENSE("GPL"); MODULE_AUTHOR("ORANGEFS Development Team"); @@ -71,20 +54,17 @@ module_param(module_parm_debug_mask, ulong, 0644); module_param(op_timeout_secs, int, 0); module_param(slot_timeout_secs, int, 0); -/* synchronizes the request device file */ -DEFINE_MUTEX(devreq_mutex); - /* * Blocks non-priority requests from being queued for servicing. This * could be used for protecting the request list data structure, but * for now it's only being used to stall the op addition to the request * list */ -DEFINE_MUTEX(request_mutex); +DEFINE_MUTEX(orangefs_request_mutex); /* hash table for storing operations waiting for matching downcall */ -struct list_head *htable_ops_in_progress; -DEFINE_SPINLOCK(htable_ops_in_progress_lock); +struct list_head *orangefs_htable_ops_in_progress; +DEFINE_SPINLOCK(orangefs_htable_ops_in_progress_lock); /* list for queueing upcall operations */ LIST_HEAD(orangefs_request_list); @@ -100,32 +80,6 @@ static int __init orangefs_init(void) int ret = -1; __u32 i = 0; - /* convert input debug mask to a 64-bit unsigned integer */ - gossip_debug_mask = (unsigned long long) module_parm_debug_mask; - - /* - * set the kernel's gossip debug string; invalid mask values will - * be ignored. - */ - debug_mask_to_string(&gossip_debug_mask, 0); - - /* remove any invalid values from the mask */ - debug_string_to_mask(kernel_debug_string, &gossip_debug_mask, 0); - - /* - * if the mask has a non-zero value, then indicate that the mask - * was set when the kernel module was loaded. The orangefs dev ioctl - * command will look at this boolean to determine if the kernel's - * debug mask should be overwritten when the client-core is started. - */ - if (gossip_debug_mask != 0) - kernel_mask_set_mod_init = true; - - pr_info("%s: called with debug mask: :%s: :%llx:\n", - __func__, - kernel_debug_string, - (unsigned long long)gossip_debug_mask); - ret = bdi_init(&orangefs_backing_dev_info); if (ret) @@ -146,9 +100,9 @@ static int __init orangefs_init(void) if (ret < 0) goto cleanup_op; - htable_ops_in_progress = + orangefs_htable_ops_in_progress = kcalloc(hash_table_size, sizeof(struct list_head), GFP_KERNEL); - if (!htable_ops_in_progress) { + if (!orangefs_htable_ops_in_progress) { gossip_err("Failed to initialize op hashtable"); ret = -ENOMEM; goto cleanup_inode; @@ -156,7 +110,7 @@ static int __init orangefs_init(void) /* initialize a doubly linked at each hash table index */ for (i = 0; i < hash_table_size; i++) - INIT_LIST_HEAD(&htable_ops_in_progress[i]); + INIT_LIST_HEAD(&orangefs_htable_ops_in_progress[i]); ret = fsid_key_table_initialize(); if (ret < 0) @@ -179,14 +133,10 @@ static int __init orangefs_init(void) if (ret) goto cleanup_key_table; - ret = orangefs_debugfs_init(); + ret = orangefs_debugfs_init(module_parm_debug_mask); if (ret) goto debugfs_init_failed; - ret = orangefs_kernel_debug_init(); - if (ret) - goto kernel_debug_init_failed; - ret = orangefs_sysfs_init(); if (ret) goto sysfs_init_failed; @@ -214,8 +164,6 @@ cleanup_device: sysfs_init_failed: -kernel_debug_init_failed: - debugfs_init_failed: orangefs_debugfs_cleanup(); @@ -223,7 +171,7 @@ cleanup_key_table: fsid_key_table_finalize(); cleanup_progress_table: - kfree(htable_ops_in_progress); + kfree(orangefs_htable_ops_in_progress); cleanup_inode: orangefs_inode_cache_finalize(); @@ -250,12 +198,12 @@ static void __exit orangefs_exit(void) orangefs_dev_cleanup(); BUG_ON(!list_empty(&orangefs_request_list)); for (i = 0; i < hash_table_size; i++) - BUG_ON(!list_empty(&htable_ops_in_progress[i])); + BUG_ON(!list_empty(&orangefs_htable_ops_in_progress[i])); orangefs_inode_cache_finalize(); op_cache_finalize(); - kfree(htable_ops_in_progress); + kfree(orangefs_htable_ops_in_progress); bdi_destroy(&orangefs_backing_dev_info); @@ -274,10 +222,10 @@ void purge_inprogress_ops(void) struct orangefs_kernel_op_s *op; struct orangefs_kernel_op_s *next; - spin_lock(&htable_ops_in_progress_lock); + spin_lock(&orangefs_htable_ops_in_progress_lock); list_for_each_entry_safe(op, next, - &htable_ops_in_progress[i], + &orangefs_htable_ops_in_progress[i], list) { set_op_state_purged(op); gossip_debug(GOSSIP_DEV_DEBUG, @@ -287,7 +235,7 @@ void purge_inprogress_ops(void) op->op_state, current->comm); } - spin_unlock(&htable_ops_in_progress_lock); + spin_unlock(&orangefs_htable_ops_in_progress_lock); } } diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c index 375708c..a799546 100644 --- a/fs/orangefs/orangefs-sysfs.c +++ b/fs/orangefs/orangefs-sysfs.c @@ -73,6 +73,24 @@ * Description: * Time getattr is valid in milliseconds. * + * What: /sys/fs/orangefs/readahead_count + * Date: Aug 2016 + * Contact: Martin Brandenburg <martin@omnibond.com> + * Description: + * Readahead cache buffer count. + * + * What: /sys/fs/orangefs/readahead_size + * Date: Aug 2016 + * Contact: Martin Brandenburg <martin@omnibond.com> + * Description: + * Readahead cache buffer size. + * + * What: /sys/fs/orangefs/readahead_count_size + * Date: Aug 2016 + * Contact: Martin Brandenburg <martin@omnibond.com> + * Description: + * Readahead cache buffer count and size. + * * What: /sys/fs/orangefs/acache/... * Date: Jun 2015 * Contact: Martin Brandenburg <martin@omnibond.com> @@ -121,159 +139,34 @@ #define PC_KOBJ_ID "pc" #define STATS_KOBJ_ID "stats" -struct orangefs_obj { - struct kobject kobj; - int op_timeout_secs; - int perf_counter_reset; - int perf_history_size; - int perf_time_interval_secs; - int slot_timeout_secs; - int dcache_timeout_msecs; - int getattr_timeout_msecs; -}; - -struct acache_orangefs_obj { - struct kobject kobj; - int hard_limit; - int reclaim_percentage; - int soft_limit; - int timeout_msecs; -}; - -struct capcache_orangefs_obj { - struct kobject kobj; - int hard_limit; - int reclaim_percentage; - int soft_limit; - int timeout_secs; -}; - -struct ccache_orangefs_obj { - struct kobject kobj; - int hard_limit; - int reclaim_percentage; - int soft_limit; - int timeout_secs; -}; - -struct ncache_orangefs_obj { - struct kobject kobj; - int hard_limit; - int reclaim_percentage; - int soft_limit; - int timeout_msecs; -}; - -struct pc_orangefs_obj { - struct kobject kobj; - char *acache; - char *capcache; - char *ncache; -}; - -struct stats_orangefs_obj { - struct kobject kobj; - int reads; - int writes; -}; +/* + * Every item calls orangefs_attr_show and orangefs_attr_store through + * orangefs_sysfs_ops. They look at the orangefs_attributes further below to + * call one of sysfs_int_show, sysfs_int_store, sysfs_service_op_show, or + * sysfs_service_op_store. + */ struct orangefs_attribute { struct attribute attr; - ssize_t (*show)(struct orangefs_obj *orangefs_obj, + ssize_t (*show)(struct kobject *kobj, struct orangefs_attribute *attr, char *buf); - ssize_t (*store)(struct orangefs_obj *orangefs_obj, + ssize_t (*store)(struct kobject *kobj, struct orangefs_attribute *attr, const char *buf, size_t count); }; -struct acache_orangefs_attribute { - struct attribute attr; - ssize_t (*show)(struct acache_orangefs_obj *acache_orangefs_obj, - struct acache_orangefs_attribute *attr, - char *buf); - ssize_t (*store)(struct acache_orangefs_obj *acache_orangefs_obj, - struct acache_orangefs_attribute *attr, - const char *buf, - size_t count); -}; - -struct capcache_orangefs_attribute { - struct attribute attr; - ssize_t (*show)(struct capcache_orangefs_obj *capcache_orangefs_obj, - struct capcache_orangefs_attribute *attr, - char *buf); - ssize_t (*store)(struct capcache_orangefs_obj *capcache_orangefs_obj, - struct capcache_orangefs_attribute *attr, - const char *buf, - size_t count); -}; - -struct ccache_orangefs_attribute { - struct attribute attr; - ssize_t (*show)(struct ccache_orangefs_obj *ccache_orangefs_obj, - struct ccache_orangefs_attribute *attr, - char *buf); - ssize_t (*store)(struct ccache_orangefs_obj *ccache_orangefs_obj, - struct ccache_orangefs_attribute *attr, - const char *buf, - size_t count); -}; - -struct ncache_orangefs_attribute { - struct attribute attr; - ssize_t (*show)(struct ncache_orangefs_obj *ncache_orangefs_obj, - struct ncache_orangefs_attribute *attr, - char *buf); - ssize_t (*store)(struct ncache_orangefs_obj *ncache_orangefs_obj, - struct ncache_orangefs_attribute *attr, - const char *buf, - size_t count); -}; - -struct pc_orangefs_attribute { - struct attribute attr; - ssize_t (*show)(struct pc_orangefs_obj *pc_orangefs_obj, - struct pc_orangefs_attribute *attr, - char *buf); - ssize_t (*store)(struct pc_orangefs_obj *pc_orangefs_obj, - struct pc_orangefs_attribute *attr, - const char *buf, - size_t count); -}; - -struct stats_orangefs_attribute { - struct attribute attr; - ssize_t (*show)(struct stats_orangefs_obj *stats_orangefs_obj, - struct stats_orangefs_attribute *attr, - char *buf); - ssize_t (*store)(struct stats_orangefs_obj *stats_orangefs_obj, - struct stats_orangefs_attribute *attr, - const char *buf, - size_t count); -}; - static ssize_t orangefs_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct orangefs_attribute *attribute; - struct orangefs_obj *orangefs_obj; - int rc; attribute = container_of(attr, struct orangefs_attribute, attr); - orangefs_obj = container_of(kobj, struct orangefs_obj, kobj); - - if (!attribute->show) { - rc = -EIO; - goto out; - } - - rc = attribute->show(orangefs_obj, attribute, buf); - -out: - return rc; + if (!attribute->show) + return -EIO; + return attribute->show(kobj, attribute, buf); } static ssize_t orangefs_attr_store(struct kobject *kobj, @@ -282,24 +175,15 @@ static ssize_t orangefs_attr_store(struct kobject *kobj, size_t len) { struct orangefs_attribute *attribute; - struct orangefs_obj *orangefs_obj; - int rc; - gossip_debug(GOSSIP_SYSFS_DEBUG, - "orangefs_attr_store: start\n"); + if (!strcmp(kobj->name, PC_KOBJ_ID) || + !strcmp(kobj->name, STATS_KOBJ_ID)) + return -EPERM; attribute = container_of(attr, struct orangefs_attribute, attr); - orangefs_obj = container_of(kobj, struct orangefs_obj, kobj); - - if (!attribute->store) { - rc = -EIO; - goto out; - } - - rc = attribute->store(orangefs_obj, attribute, buf, len); - -out: - return rc; + if (!attribute->store) + return -EIO; + return attribute->store(kobj, attribute, buf, len); } static const struct sysfs_ops orangefs_sysfs_ops = { @@ -307,402 +191,58 @@ static const struct sysfs_ops orangefs_sysfs_ops = { .store = orangefs_attr_store, }; -static ssize_t acache_orangefs_attr_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct acache_orangefs_attribute *attribute; - struct acache_orangefs_obj *acache_orangefs_obj; - int rc; - - attribute = container_of(attr, struct acache_orangefs_attribute, attr); - acache_orangefs_obj = - container_of(kobj, struct acache_orangefs_obj, kobj); - - if (!attribute->show) { - rc = -EIO; - goto out; - } - - rc = attribute->show(acache_orangefs_obj, attribute, buf); - -out: - return rc; -} - -static ssize_t acache_orangefs_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t len) -{ - struct acache_orangefs_attribute *attribute; - struct acache_orangefs_obj *acache_orangefs_obj; - int rc; - - gossip_debug(GOSSIP_SYSFS_DEBUG, - "acache_orangefs_attr_store: start\n"); - - attribute = container_of(attr, struct acache_orangefs_attribute, attr); - acache_orangefs_obj = - container_of(kobj, struct acache_orangefs_obj, kobj); - - if (!attribute->store) { - rc = -EIO; - goto out; - } - - rc = attribute->store(acache_orangefs_obj, attribute, buf, len); - -out: - return rc; -} - -static const struct sysfs_ops acache_orangefs_sysfs_ops = { - .show = acache_orangefs_attr_show, - .store = acache_orangefs_attr_store, -}; - -static ssize_t capcache_orangefs_attr_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct capcache_orangefs_attribute *attribute; - struct capcache_orangefs_obj *capcache_orangefs_obj; - int rc; - - attribute = - container_of(attr, struct capcache_orangefs_attribute, attr); - capcache_orangefs_obj = - container_of(kobj, struct capcache_orangefs_obj, kobj); - - if (!attribute->show) { - rc = -EIO; - goto out; - } - - rc = attribute->show(capcache_orangefs_obj, attribute, buf); - -out: - return rc; -} - -static ssize_t capcache_orangefs_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t len) -{ - struct capcache_orangefs_attribute *attribute; - struct capcache_orangefs_obj *capcache_orangefs_obj; - int rc; - - gossip_debug(GOSSIP_SYSFS_DEBUG, - "capcache_orangefs_attr_store: start\n"); - - attribute = - container_of(attr, struct capcache_orangefs_attribute, attr); - capcache_orangefs_obj = - container_of(kobj, struct capcache_orangefs_obj, kobj); - - if (!attribute->store) { - rc = -EIO; - goto out; - } - - rc = attribute->store(capcache_orangefs_obj, attribute, buf, len); - -out: - return rc; -} - -static const struct sysfs_ops capcache_orangefs_sysfs_ops = { - .show = capcache_orangefs_attr_show, - .store = capcache_orangefs_attr_store, -}; - -static ssize_t ccache_orangefs_attr_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ccache_orangefs_attribute *attribute; - struct ccache_orangefs_obj *ccache_orangefs_obj; - int rc; - - attribute = - container_of(attr, struct ccache_orangefs_attribute, attr); - ccache_orangefs_obj = - container_of(kobj, struct ccache_orangefs_obj, kobj); - - if (!attribute->show) { - rc = -EIO; - goto out; - } - - rc = attribute->show(ccache_orangefs_obj, attribute, buf); - -out: - return rc; -} - -static ssize_t ccache_orangefs_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t len) -{ - struct ccache_orangefs_attribute *attribute; - struct ccache_orangefs_obj *ccache_orangefs_obj; - int rc; - - gossip_debug(GOSSIP_SYSFS_DEBUG, - "ccache_orangefs_attr_store: start\n"); - - attribute = - container_of(attr, struct ccache_orangefs_attribute, attr); - ccache_orangefs_obj = - container_of(kobj, struct ccache_orangefs_obj, kobj); - - if (!attribute->store) { - rc = -EIO; - goto out; - } - - rc = attribute->store(ccache_orangefs_obj, attribute, buf, len); - -out: - return rc; -} - -static const struct sysfs_ops ccache_orangefs_sysfs_ops = { - .show = ccache_orangefs_attr_show, - .store = ccache_orangefs_attr_store, -}; - -static ssize_t ncache_orangefs_attr_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ncache_orangefs_attribute *attribute; - struct ncache_orangefs_obj *ncache_orangefs_obj; - int rc; - - attribute = container_of(attr, struct ncache_orangefs_attribute, attr); - ncache_orangefs_obj = - container_of(kobj, struct ncache_orangefs_obj, kobj); - - if (!attribute->show) { - rc = -EIO; - goto out; - } - - rc = attribute->show(ncache_orangefs_obj, attribute, buf); - -out: - return rc; -} - -static ssize_t ncache_orangefs_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t len) -{ - struct ncache_orangefs_attribute *attribute; - struct ncache_orangefs_obj *ncache_orangefs_obj; - int rc; - - gossip_debug(GOSSIP_SYSFS_DEBUG, - "ncache_orangefs_attr_store: start\n"); - - attribute = container_of(attr, struct ncache_orangefs_attribute, attr); - ncache_orangefs_obj = - container_of(kobj, struct ncache_orangefs_obj, kobj); - - if (!attribute->store) { - rc = -EIO; - goto out; - } - - rc = attribute->store(ncache_orangefs_obj, attribute, buf, len); - -out: - return rc; -} - -static const struct sysfs_ops ncache_orangefs_sysfs_ops = { - .show = ncache_orangefs_attr_show, - .store = ncache_orangefs_attr_store, -}; - -static ssize_t pc_orangefs_attr_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct pc_orangefs_attribute *attribute; - struct pc_orangefs_obj *pc_orangefs_obj; - int rc; - - attribute = container_of(attr, struct pc_orangefs_attribute, attr); - pc_orangefs_obj = - container_of(kobj, struct pc_orangefs_obj, kobj); - - if (!attribute->show) { - rc = -EIO; - goto out; - } - - rc = attribute->show(pc_orangefs_obj, attribute, buf); - -out: - return rc; -} - -static const struct sysfs_ops pc_orangefs_sysfs_ops = { - .show = pc_orangefs_attr_show, -}; - -static ssize_t stats_orangefs_attr_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct stats_orangefs_attribute *attribute; - struct stats_orangefs_obj *stats_orangefs_obj; - int rc; - - attribute = container_of(attr, struct stats_orangefs_attribute, attr); - stats_orangefs_obj = - container_of(kobj, struct stats_orangefs_obj, kobj); - - if (!attribute->show) { - rc = -EIO; - goto out; - } - - rc = attribute->show(stats_orangefs_obj, attribute, buf); - -out: - return rc; -} - -static const struct sysfs_ops stats_orangefs_sysfs_ops = { - .show = stats_orangefs_attr_show, -}; - -static void orangefs_release(struct kobject *kobj) -{ - struct orangefs_obj *orangefs_obj; - - orangefs_obj = container_of(kobj, struct orangefs_obj, kobj); - kfree(orangefs_obj); -} - -static void acache_orangefs_release(struct kobject *kobj) -{ - struct acache_orangefs_obj *acache_orangefs_obj; - - acache_orangefs_obj = - container_of(kobj, struct acache_orangefs_obj, kobj); - kfree(acache_orangefs_obj); -} - -static void capcache_orangefs_release(struct kobject *kobj) -{ - struct capcache_orangefs_obj *capcache_orangefs_obj; - - capcache_orangefs_obj = - container_of(kobj, struct capcache_orangefs_obj, kobj); - kfree(capcache_orangefs_obj); -} - -static void ccache_orangefs_release(struct kobject *kobj) -{ - struct ccache_orangefs_obj *ccache_orangefs_obj; - - ccache_orangefs_obj = - container_of(kobj, struct ccache_orangefs_obj, kobj); - kfree(ccache_orangefs_obj); -} - -static void ncache_orangefs_release(struct kobject *kobj) -{ - struct ncache_orangefs_obj *ncache_orangefs_obj; - - ncache_orangefs_obj = - container_of(kobj, struct ncache_orangefs_obj, kobj); - kfree(ncache_orangefs_obj); -} - -static void pc_orangefs_release(struct kobject *kobj) -{ - struct pc_orangefs_obj *pc_orangefs_obj; - - pc_orangefs_obj = - container_of(kobj, struct pc_orangefs_obj, kobj); - kfree(pc_orangefs_obj); -} - -static void stats_orangefs_release(struct kobject *kobj) -{ - struct stats_orangefs_obj *stats_orangefs_obj; - - stats_orangefs_obj = - container_of(kobj, struct stats_orangefs_obj, kobj); - kfree(stats_orangefs_obj); -} - -static ssize_t sysfs_int_show(char *kobj_id, char *buf, void *attr) +static ssize_t sysfs_int_show(struct kobject *kobj, + struct orangefs_attribute *attr, char *buf) { int rc = -EIO; - struct orangefs_attribute *orangefs_attr; - struct stats_orangefs_attribute *stats_orangefs_attr; - gossip_debug(GOSSIP_SYSFS_DEBUG, "sysfs_int_show: id:%s:\n", kobj_id); + gossip_debug(GOSSIP_SYSFS_DEBUG, "sysfs_int_show: id:%s:\n", + kobj->name); - if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) { - orangefs_attr = (struct orangefs_attribute *)attr; - - if (!strcmp(orangefs_attr->attr.name, "op_timeout_secs")) { + if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "op_timeout_secs")) { rc = scnprintf(buf, PAGE_SIZE, "%d\n", op_timeout_secs); goto out; - } else if (!strcmp(orangefs_attr->attr.name, + } else if (!strcmp(attr->attr.name, "slot_timeout_secs")) { rc = scnprintf(buf, PAGE_SIZE, "%d\n", slot_timeout_secs); goto out; - } else if (!strcmp(orangefs_attr->attr.name, + } else if (!strcmp(attr->attr.name, "dcache_timeout_msecs")) { rc = scnprintf(buf, PAGE_SIZE, "%d\n", - dcache_timeout_msecs); + orangefs_dcache_timeout_msecs); goto out; - } else if (!strcmp(orangefs_attr->attr.name, + } else if (!strcmp(attr->attr.name, "getattr_timeout_msecs")) { rc = scnprintf(buf, PAGE_SIZE, "%d\n", - getattr_timeout_msecs); + orangefs_getattr_timeout_msecs); goto out; } else { goto out; } - } else if (!strcmp(kobj_id, STATS_KOBJ_ID)) { - stats_orangefs_attr = (struct stats_orangefs_attribute *)attr; - - if (!strcmp(stats_orangefs_attr->attr.name, "reads")) { + } else if (!strcmp(kobj->name, STATS_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "reads")) { rc = scnprintf(buf, PAGE_SIZE, "%lu\n", - g_orangefs_stats.reads); + orangefs_stats.reads); goto out; - } else if (!strcmp(stats_orangefs_attr->attr.name, "writes")) { + } else if (!strcmp(attr->attr.name, "writes")) { rc = scnprintf(buf, PAGE_SIZE, "%lu\n", - g_orangefs_stats.writes); + orangefs_stats.writes); goto out; } else { goto out; @@ -714,45 +254,13 @@ out: return rc; } -static ssize_t int_orangefs_show(struct orangefs_obj *orangefs_obj, - struct orangefs_attribute *attr, - char *buf) -{ - int rc; - - gossip_debug(GOSSIP_SYSFS_DEBUG, - "int_orangefs_show:start attr->attr.name:%s:\n", - attr->attr.name); - - rc = sysfs_int_show(ORANGEFS_KOBJ_ID, buf, (void *) attr); - - return rc; -} - -static ssize_t int_stats_show(struct stats_orangefs_obj *stats_orangefs_obj, - struct stats_orangefs_attribute *attr, - char *buf) -{ - int rc; - - gossip_debug(GOSSIP_SYSFS_DEBUG, - "int_stats_show:start attr->attr.name:%s:\n", - attr->attr.name); - - rc = sysfs_int_show(STATS_KOBJ_ID, buf, (void *) attr); - - return rc; -} - -static ssize_t int_store(struct orangefs_obj *orangefs_obj, - struct orangefs_attribute *attr, - const char *buf, - size_t count) +static ssize_t sysfs_int_store(struct kobject *kobj, + struct orangefs_attribute *attr, const char *buf, size_t count) { int rc = 0; gossip_debug(GOSSIP_SYSFS_DEBUG, - "int_store: start attr->attr.name:%s: buf:%s:\n", + "sysfs_int_store: start attr->attr.name:%s: buf:%s:\n", attr->attr.name, buf); if (!strcmp(attr->attr.name, "op_timeout_secs")) { @@ -762,10 +270,10 @@ static ssize_t int_store(struct orangefs_obj *orangefs_obj, rc = kstrtoint(buf, 0, &slot_timeout_secs); goto out; } else if (!strcmp(attr->attr.name, "dcache_timeout_msecs")) { - rc = kstrtoint(buf, 0, &dcache_timeout_msecs); + rc = kstrtoint(buf, 0, &orangefs_dcache_timeout_msecs); goto out; } else if (!strcmp(attr->attr.name, "getattr_timeout_msecs")) { - rc = kstrtoint(buf, 0, &getattr_timeout_msecs); + rc = kstrtoint(buf, 0, &orangefs_getattr_timeout_msecs); goto out; } else { goto out; @@ -783,24 +291,19 @@ out: /* * obtain attribute values from userspace with a service operation. */ -static int sysfs_service_op_show(char *kobj_id, char *buf, void *attr) +static ssize_t sysfs_service_op_show(struct kobject *kobj, + struct orangefs_attribute *attr, char *buf) { struct orangefs_kernel_op_s *new_op = NULL; int rc = 0; char *ser_op_type = NULL; - struct orangefs_attribute *orangefs_attr; - struct acache_orangefs_attribute *acache_attr; - struct capcache_orangefs_attribute *capcache_attr; - struct ccache_orangefs_attribute *ccache_attr; - struct ncache_orangefs_attribute *ncache_attr; - struct pc_orangefs_attribute *pc_attr; __u32 op_alloc_type; gossip_debug(GOSSIP_SYSFS_DEBUG, "sysfs_service_op_show: id:%s:\n", - kobj_id); + kobj->name); - if (strcmp(kobj_id, PC_KOBJ_ID)) + if (strcmp(kobj->name, PC_KOBJ_ID)) op_alloc_type = ORANGEFS_VFS_OP_PARAM; else op_alloc_type = ORANGEFS_VFS_OP_PERF_COUNT; @@ -818,124 +321,135 @@ static int sysfs_service_op_show(char *kobj_id, char *buf, void *attr) goto out; } - if (strcmp(kobj_id, PC_KOBJ_ID)) + if (strcmp(kobj->name, PC_KOBJ_ID)) new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_GET; - if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) { - orangefs_attr = (struct orangefs_attribute *)attr; + if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) { + /* Drop unsupported requests first. */ + if (!(orangefs_features & ORANGEFS_FEATURE_READAHEAD) && + (!strcmp(attr->attr.name, "readahead_count") || + !strcmp(attr->attr.name, "readahead_size") || + !strcmp(attr->attr.name, "readahead_count_size"))) { + rc = -EINVAL; + goto out; + } - if (!strcmp(orangefs_attr->attr.name, "perf_history_size")) + if (!strcmp(attr->attr.name, "perf_history_size")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE; - else if (!strcmp(orangefs_attr->attr.name, + else if (!strcmp(attr->attr.name, "perf_time_interval_secs")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_PERF_TIME_INTERVAL_SECS; - else if (!strcmp(orangefs_attr->attr.name, + else if (!strcmp(attr->attr.name, "perf_counter_reset")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_PERF_RESET; - } else if (!strcmp(kobj_id, ACACHE_KOBJ_ID)) { - acache_attr = (struct acache_orangefs_attribute *)attr; + else if (!strcmp(attr->attr.name, + "readahead_count")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT; + + else if (!strcmp(attr->attr.name, + "readahead_size")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE; - if (!strcmp(acache_attr->attr.name, "timeout_msecs")) + else if (!strcmp(attr->attr.name, + "readahead_count_size")) + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE; + } else if (!strcmp(kobj->name, ACACHE_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "timeout_msecs")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS; - if (!strcmp(acache_attr->attr.name, "hard_limit")) + if (!strcmp(attr->attr.name, "hard_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT; - if (!strcmp(acache_attr->attr.name, "soft_limit")) + if (!strcmp(attr->attr.name, "soft_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT; - if (!strcmp(acache_attr->attr.name, "reclaim_percentage")) + if (!strcmp(attr->attr.name, "reclaim_percentage")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_RECLAIM_PERCENTAGE; - } else if (!strcmp(kobj_id, CAPCACHE_KOBJ_ID)) { - capcache_attr = (struct capcache_orangefs_attribute *)attr; - - if (!strcmp(capcache_attr->attr.name, "timeout_secs")) + } else if (!strcmp(kobj->name, CAPCACHE_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "timeout_secs")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS; - if (!strcmp(capcache_attr->attr.name, "hard_limit")) + if (!strcmp(attr->attr.name, "hard_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT; - if (!strcmp(capcache_attr->attr.name, "soft_limit")) + if (!strcmp(attr->attr.name, "soft_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT; - if (!strcmp(capcache_attr->attr.name, "reclaim_percentage")) + if (!strcmp(attr->attr.name, "reclaim_percentage")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE; - } else if (!strcmp(kobj_id, CCACHE_KOBJ_ID)) { - ccache_attr = (struct ccache_orangefs_attribute *)attr; - - if (!strcmp(ccache_attr->attr.name, "timeout_secs")) + } else if (!strcmp(kobj->name, CCACHE_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "timeout_secs")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS; - if (!strcmp(ccache_attr->attr.name, "hard_limit")) + if (!strcmp(attr->attr.name, "hard_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT; - if (!strcmp(ccache_attr->attr.name, "soft_limit")) + if (!strcmp(attr->attr.name, "soft_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT; - if (!strcmp(ccache_attr->attr.name, "reclaim_percentage")) + if (!strcmp(attr->attr.name, "reclaim_percentage")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_RECLAIM_PERCENTAGE; - } else if (!strcmp(kobj_id, NCACHE_KOBJ_ID)) { - ncache_attr = (struct ncache_orangefs_attribute *)attr; - - if (!strcmp(ncache_attr->attr.name, "timeout_msecs")) + } else if (!strcmp(kobj->name, NCACHE_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "timeout_msecs")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS; - if (!strcmp(ncache_attr->attr.name, "hard_limit")) + if (!strcmp(attr->attr.name, "hard_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT; - if (!strcmp(ncache_attr->attr.name, "soft_limit")) + if (!strcmp(attr->attr.name, "soft_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT; - if (!strcmp(ncache_attr->attr.name, "reclaim_percentage")) + if (!strcmp(attr->attr.name, "reclaim_percentage")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_RECLAIM_PERCENTAGE; - } else if (!strcmp(kobj_id, PC_KOBJ_ID)) { - pc_attr = (struct pc_orangefs_attribute *)attr; - - if (!strcmp(pc_attr->attr.name, ACACHE_KOBJ_ID)) + } else if (!strcmp(kobj->name, PC_KOBJ_ID)) { + if (!strcmp(attr->attr.name, ACACHE_KOBJ_ID)) new_op->upcall.req.perf_count.type = ORANGEFS_PERF_COUNT_REQUEST_ACACHE; - if (!strcmp(pc_attr->attr.name, CAPCACHE_KOBJ_ID)) + if (!strcmp(attr->attr.name, CAPCACHE_KOBJ_ID)) new_op->upcall.req.perf_count.type = ORANGEFS_PERF_COUNT_REQUEST_CAPCACHE; - if (!strcmp(pc_attr->attr.name, NCACHE_KOBJ_ID)) + if (!strcmp(attr->attr.name, NCACHE_KOBJ_ID)) new_op->upcall.req.perf_count.type = ORANGEFS_PERF_COUNT_REQUEST_NCACHE; } else { gossip_err("sysfs_service_op_show: unknown kobj_id:%s:\n", - kobj_id); + kobj->name); rc = -EINVAL; goto out; } - if (strcmp(kobj_id, PC_KOBJ_ID)) + if (strcmp(kobj->name, PC_KOBJ_ID)) ser_op_type = "orangefs_param"; else ser_op_type = "orangefs_perf_count"; @@ -948,11 +462,18 @@ static int sysfs_service_op_show(char *kobj_id, char *buf, void *attr) out: if (!rc) { - if (strcmp(kobj_id, PC_KOBJ_ID)) { - rc = scnprintf(buf, - PAGE_SIZE, - "%d\n", - (int)new_op->downcall.resp.param.value); + if (strcmp(kobj->name, PC_KOBJ_ID)) { + if (new_op->upcall.req.param.op == + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE) { + rc = scnprintf(buf, PAGE_SIZE, "%d %d\n", + (int)new_op->downcall.resp.param.u. + value32[0], + (int)new_op->downcall.resp.param.u. + value32[1]); + } else { + rc = scnprintf(buf, PAGE_SIZE, "%d\n", + (int)new_op->downcall.resp.param.u.value64); + } } else { rc = scnprintf( buf, @@ -968,77 +489,6 @@ out: } -static ssize_t service_orangefs_show(struct orangefs_obj *orangefs_obj, - struct orangefs_attribute *attr, - char *buf) -{ - int rc = 0; - - rc = sysfs_service_op_show(ORANGEFS_KOBJ_ID, buf, (void *)attr); - - return rc; -} - -static ssize_t - service_acache_show(struct acache_orangefs_obj *acache_orangefs_obj, - struct acache_orangefs_attribute *attr, - char *buf) -{ - int rc = 0; - - rc = sysfs_service_op_show(ACACHE_KOBJ_ID, buf, (void *)attr); - - return rc; -} - -static ssize_t service_capcache_show(struct capcache_orangefs_obj - *capcache_orangefs_obj, - struct capcache_orangefs_attribute *attr, - char *buf) -{ - int rc = 0; - - rc = sysfs_service_op_show(CAPCACHE_KOBJ_ID, buf, (void *)attr); - - return rc; -} - -static ssize_t service_ccache_show(struct ccache_orangefs_obj - *ccache_orangefs_obj, - struct ccache_orangefs_attribute *attr, - char *buf) -{ - int rc = 0; - - rc = sysfs_service_op_show(CCACHE_KOBJ_ID, buf, (void *)attr); - - return rc; -} - -static ssize_t - service_ncache_show(struct ncache_orangefs_obj *ncache_orangefs_obj, - struct ncache_orangefs_attribute *attr, - char *buf) -{ - int rc = 0; - - rc = sysfs_service_op_show(NCACHE_KOBJ_ID, buf, (void *)attr); - - return rc; -} - -static ssize_t - service_pc_show(struct pc_orangefs_obj *pc_orangefs_obj, - struct pc_orangefs_attribute *attr, - char *buf) -{ - int rc = 0; - - rc = sysfs_service_op_show(PC_KOBJ_ID, buf, (void *)attr); - - return rc; -} - /* * pass attribute values back to userspace with a service operation. * @@ -1050,20 +500,16 @@ static ssize_t * We want to return 1 if we think everything went OK, and * EINVAL if not. */ -static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) +static ssize_t sysfs_service_op_store(struct kobject *kobj, + struct orangefs_attribute *attr, const char *buf, size_t count) { struct orangefs_kernel_op_s *new_op = NULL; int val = 0; int rc = 0; - struct orangefs_attribute *orangefs_attr; - struct acache_orangefs_attribute *acache_attr; - struct capcache_orangefs_attribute *capcache_attr; - struct ccache_orangefs_attribute *ccache_attr; - struct ncache_orangefs_attribute *ncache_attr; gossip_debug(GOSSIP_SYSFS_DEBUG, "sysfs_service_op_store: id:%s:\n", - kobj_id); + kobj->name); new_op = op_alloc(ORANGEFS_VFS_OP_PARAM); if (!new_op) @@ -1079,16 +525,29 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) } /* - * The value we want to send back to userspace is in buf. + * The value we want to send back to userspace is in buf, unless this + * there are two parameters, which is specially handled below. */ - rc = kstrtoint(buf, 0, &val); - if (rc) - goto out; + if (strcmp(kobj->name, ORANGEFS_KOBJ_ID) || + strcmp(attr->attr.name, "readahead_count_size")) { + rc = kstrtoint(buf, 0, &val); + if (rc) + goto out; + } - if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) { - orangefs_attr = (struct orangefs_attribute *)attr; + new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_SET; + + if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) { + /* Drop unsupported requests first. */ + if (!(orangefs_features & ORANGEFS_FEATURE_READAHEAD) && + (!strcmp(attr->attr.name, "readahead_count") || + !strcmp(attr->attr.name, "readahead_size") || + !strcmp(attr->attr.name, "readahead_count_size"))) { + rc = -EINVAL; + goto out; + } - if (!strcmp(orangefs_attr->attr.name, "perf_history_size")) { + if (!strcmp(attr->attr.name, "perf_history_size")) { if (val > 0) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE; @@ -1096,7 +555,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(orangefs_attr->attr.name, + } else if (!strcmp(attr->attr.name, "perf_time_interval_secs")) { if (val > 0) { new_op->upcall.req.param.op = @@ -1105,7 +564,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(orangefs_attr->attr.name, + } else if (!strcmp(attr->attr.name, "perf_counter_reset")) { if ((val == 0) || (val == 1)) { new_op->upcall.req.param.op = @@ -1114,12 +573,55 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } + } else if (!strcmp(attr->attr.name, + "readahead_count")) { + if ((val >= 0)) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT; + } else { + rc = 0; + goto out; + } + } else if (!strcmp(attr->attr.name, + "readahead_size")) { + if ((val >= 0)) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE; + } else { + rc = 0; + goto out; + } + } else if (!strcmp(attr->attr.name, + "readahead_count_size")) { + int val1, val2; + rc = sscanf(buf, "%d %d", &val1, &val2); + if (rc < 2) { + rc = 0; + goto out; + } + if ((val1 >= 0) && (val2 >= 0)) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE; + } else { + rc = 0; + goto out; + } + new_op->upcall.req.param.u.value32[0] = val1; + new_op->upcall.req.param.u.value32[1] = val2; + goto value_set; + } else if (!strcmp(attr->attr.name, + "perf_counter_reset")) { + if ((val > 0)) { + new_op->upcall.req.param.op = + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE; + } else { + rc = 0; + goto out; + } } - } else if (!strcmp(kobj_id, ACACHE_KOBJ_ID)) { - acache_attr = (struct acache_orangefs_attribute *)attr; - - if (!strcmp(acache_attr->attr.name, "hard_limit")) { + } else if (!strcmp(kobj->name, ACACHE_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "hard_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT; @@ -1127,7 +629,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(acache_attr->attr.name, "soft_limit")) { + } else if (!strcmp(attr->attr.name, "soft_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT; @@ -1135,7 +637,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(acache_attr->attr.name, + } else if (!strcmp(attr->attr.name, "reclaim_percentage")) { if ((val > -1) && (val < 101)) { new_op->upcall.req.param.op = @@ -1144,7 +646,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(acache_attr->attr.name, "timeout_msecs")) { + } else if (!strcmp(attr->attr.name, "timeout_msecs")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS; @@ -1154,10 +656,8 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) } } - } else if (!strcmp(kobj_id, CAPCACHE_KOBJ_ID)) { - capcache_attr = (struct capcache_orangefs_attribute *)attr; - - if (!strcmp(capcache_attr->attr.name, "hard_limit")) { + } else if (!strcmp(kobj->name, CAPCACHE_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "hard_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT; @@ -1165,7 +665,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(capcache_attr->attr.name, "soft_limit")) { + } else if (!strcmp(attr->attr.name, "soft_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT; @@ -1173,7 +673,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(capcache_attr->attr.name, + } else if (!strcmp(attr->attr.name, "reclaim_percentage")) { if ((val > -1) && (val < 101)) { new_op->upcall.req.param.op = @@ -1182,7 +682,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(capcache_attr->attr.name, "timeout_secs")) { + } else if (!strcmp(attr->attr.name, "timeout_secs")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS; @@ -1192,10 +692,8 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) } } - } else if (!strcmp(kobj_id, CCACHE_KOBJ_ID)) { - ccache_attr = (struct ccache_orangefs_attribute *)attr; - - if (!strcmp(ccache_attr->attr.name, "hard_limit")) { + } else if (!strcmp(kobj->name, CCACHE_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "hard_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT; @@ -1203,7 +701,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(ccache_attr->attr.name, "soft_limit")) { + } else if (!strcmp(attr->attr.name, "soft_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT; @@ -1211,7 +709,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(ccache_attr->attr.name, + } else if (!strcmp(attr->attr.name, "reclaim_percentage")) { if ((val > -1) && (val < 101)) { new_op->upcall.req.param.op = @@ -1220,7 +718,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(ccache_attr->attr.name, "timeout_secs")) { + } else if (!strcmp(attr->attr.name, "timeout_secs")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS; @@ -1230,10 +728,8 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) } } - } else if (!strcmp(kobj_id, NCACHE_KOBJ_ID)) { - ncache_attr = (struct ncache_orangefs_attribute *)attr; - - if (!strcmp(ncache_attr->attr.name, "hard_limit")) { + } else if (!strcmp(kobj->name, NCACHE_KOBJ_ID)) { + if (!strcmp(attr->attr.name, "hard_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT; @@ -1241,7 +737,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(ncache_attr->attr.name, "soft_limit")) { + } else if (!strcmp(attr->attr.name, "soft_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT; @@ -1249,7 +745,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(ncache_attr->attr.name, + } else if (!strcmp(attr->attr.name, "reclaim_percentage")) { if ((val > -1) && (val < 101)) { new_op->upcall.req.param.op = @@ -1258,7 +754,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc = 0; goto out; } - } else if (!strcmp(ncache_attr->attr.name, "timeout_msecs")) { + } else if (!strcmp(attr->attr.name, "timeout_msecs")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS; @@ -1270,14 +766,13 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) } else { gossip_err("sysfs_service_op_store: unknown kobj_id:%s:\n", - kobj_id); + kobj->name); rc = -EINVAL; goto out; } - new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_SET; - - new_op->upcall.req.param.value = val; + new_op->upcall.req.param.u.value64 = val; +value_set: /* * The service_operation will return a errno return code on @@ -1290,7 +785,7 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr) rc); rc = 0; } else { - rc = 1; + rc = count; } out: @@ -1302,127 +797,56 @@ out: return rc; } -static ssize_t - service_orangefs_store(struct orangefs_obj *orangefs_obj, - struct orangefs_attribute *attr, - const char *buf, - size_t count) -{ - int rc = 0; - - rc = sysfs_service_op_store(ORANGEFS_KOBJ_ID, buf, (void *) attr); - - /* rc should have an errno value if the service_op went bad. */ - if (rc == 1) - rc = count; - - return rc; -} - -static ssize_t - service_acache_store(struct acache_orangefs_obj *acache_orangefs_obj, - struct acache_orangefs_attribute *attr, - const char *buf, - size_t count) -{ - int rc = 0; - - rc = sysfs_service_op_store(ACACHE_KOBJ_ID, buf, (void *) attr); - - /* rc should have an errno value if the service_op went bad. */ - if (rc == 1) - rc = count; - - return rc; -} - -static ssize_t - service_capcache_store(struct capcache_orangefs_obj - *capcache_orangefs_obj, - struct capcache_orangefs_attribute *attr, - const char *buf, - size_t count) -{ - int rc = 0; - - rc = sysfs_service_op_store(CAPCACHE_KOBJ_ID, buf, (void *) attr); - - /* rc should have an errno value if the service_op went bad. */ - if (rc == 1) - rc = count; - - return rc; -} - -static ssize_t service_ccache_store(struct ccache_orangefs_obj - *ccache_orangefs_obj, - struct ccache_orangefs_attribute *attr, - const char *buf, - size_t count) -{ - int rc = 0; - - rc = sysfs_service_op_store(CCACHE_KOBJ_ID, buf, (void *) attr); - - /* rc should have an errno value if the service_op went bad. */ - if (rc == 1) - rc = count; - - return rc; -} - -static ssize_t - service_ncache_store(struct ncache_orangefs_obj *ncache_orangefs_obj, - struct ncache_orangefs_attribute *attr, - const char *buf, - size_t count) -{ - int rc = 0; - - rc = sysfs_service_op_store(NCACHE_KOBJ_ID, buf, (void *) attr); - - /* rc should have an errno value if the service_op went bad. */ - if (rc == 1) - rc = count; - - return rc; -} - static struct orangefs_attribute op_timeout_secs_attribute = - __ATTR(op_timeout_secs, 0664, int_orangefs_show, int_store); + __ATTR(op_timeout_secs, 0664, sysfs_int_show, sysfs_int_store); static struct orangefs_attribute slot_timeout_secs_attribute = - __ATTR(slot_timeout_secs, 0664, int_orangefs_show, int_store); + __ATTR(slot_timeout_secs, 0664, sysfs_int_show, sysfs_int_store); static struct orangefs_attribute dcache_timeout_msecs_attribute = - __ATTR(dcache_timeout_msecs, 0664, int_orangefs_show, int_store); + __ATTR(dcache_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store); static struct orangefs_attribute getattr_timeout_msecs_attribute = - __ATTR(getattr_timeout_msecs, 0664, int_orangefs_show, int_store); + __ATTR(getattr_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store); + +static struct orangefs_attribute readahead_count_attribute = + __ATTR(readahead_count, 0664, sysfs_service_op_show, + sysfs_service_op_store); + +static struct orangefs_attribute readahead_size_attribute = + __ATTR(readahead_size, 0664, sysfs_service_op_show, + sysfs_service_op_store); + +static struct orangefs_attribute readahead_count_size_attribute = + __ATTR(readahead_count_size, 0664, sysfs_service_op_show, + sysfs_service_op_store); static struct orangefs_attribute perf_counter_reset_attribute = __ATTR(perf_counter_reset, 0664, - service_orangefs_show, - service_orangefs_store); + sysfs_service_op_show, + sysfs_service_op_store); static struct orangefs_attribute perf_history_size_attribute = __ATTR(perf_history_size, 0664, - service_orangefs_show, - service_orangefs_store); + sysfs_service_op_show, + sysfs_service_op_store); static struct orangefs_attribute perf_time_interval_secs_attribute = __ATTR(perf_time_interval_secs, 0664, - service_orangefs_show, - service_orangefs_store); + sysfs_service_op_show, + sysfs_service_op_store); static struct attribute *orangefs_default_attrs[] = { &op_timeout_secs_attribute.attr, &slot_timeout_secs_attribute.attr, &dcache_timeout_msecs_attribute.attr, &getattr_timeout_msecs_attribute.attr, + &readahead_count_attribute.attr, + &readahead_size_attribute.attr, + &readahead_count_size_attribute.attr, &perf_counter_reset_attribute.attr, &perf_history_size_attribute.attr, &perf_time_interval_secs_attribute.attr, @@ -1431,33 +855,32 @@ static struct attribute *orangefs_default_attrs[] = { static struct kobj_type orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, - .release = orangefs_release, .default_attrs = orangefs_default_attrs, }; -static struct acache_orangefs_attribute acache_hard_limit_attribute = +static struct orangefs_attribute acache_hard_limit_attribute = __ATTR(hard_limit, 0664, - service_acache_show, - service_acache_store); + sysfs_service_op_show, + sysfs_service_op_store); -static struct acache_orangefs_attribute acache_reclaim_percent_attribute = +static struct orangefs_attribute acache_reclaim_percent_attribute = __ATTR(reclaim_percentage, 0664, - service_acache_show, - service_acache_store); + sysfs_service_op_show, + sysfs_service_op_store); -static struct acache_orangefs_attribute acache_soft_limit_attribute = +static struct orangefs_attribute acache_soft_limit_attribute = __ATTR(soft_limit, 0664, - service_acache_show, - service_acache_store); + sysfs_service_op_show, + sysfs_service_op_store); -static struct acache_orangefs_attribute acache_timeout_msecs_attribute = +static struct orangefs_attribute acache_timeout_msecs_attribute = __ATTR(timeout_msecs, 0664, - service_acache_show, - service_acache_store); + sysfs_service_op_show, + sysfs_service_op_store); static struct attribute *acache_orangefs_default_attrs[] = { &acache_hard_limit_attribute.attr, @@ -1468,34 +891,33 @@ static struct attribute *acache_orangefs_default_attrs[] = { }; static struct kobj_type acache_orangefs_ktype = { - .sysfs_ops = &acache_orangefs_sysfs_ops, - .release = acache_orangefs_release, + .sysfs_ops = &orangefs_sysfs_ops, .default_attrs = acache_orangefs_default_attrs, }; -static struct capcache_orangefs_attribute capcache_hard_limit_attribute = +static struct orangefs_attribute capcache_hard_limit_attribute = __ATTR(hard_limit, 0664, - service_capcache_show, - service_capcache_store); + sysfs_service_op_show, + sysfs_service_op_store); -static struct capcache_orangefs_attribute capcache_reclaim_percent_attribute = +static struct orangefs_attribute capcache_reclaim_percent_attribute = __ATTR(reclaim_percentage, 0664, - service_capcache_show, - service_capcache_store); + sysfs_service_op_show, + sysfs_service_op_store); -static struct capcache_orangefs_attribute capcache_soft_limit_attribute = +static struct orangefs_attribute capcache_soft_limit_attribute = __ATTR(soft_limit, 0664, - service_capcache_show, - service_capcache_store); + sysfs_service_op_show, + sysfs_service_op_store); -static struct capcache_orangefs_attribute capcache_timeout_secs_attribute = +static struct orangefs_attribute capcache_timeout_secs_attribute = __ATTR(timeout_secs, 0664, - service_capcache_show, - service_capcache_store); + sysfs_service_op_show, + sysfs_service_op_store); static struct attribute *capcache_orangefs_default_attrs[] = { &capcache_hard_limit_attribute.attr, @@ -1506,34 +928,33 @@ static struct attribute *capcache_orangefs_default_attrs[] = { }; static struct kobj_type capcache_orangefs_ktype = { - .sysfs_ops = &capcache_orangefs_sysfs_ops, - .release = capcache_orangefs_release, + .sysfs_ops = &orangefs_sysfs_ops, .default_attrs = capcache_orangefs_default_attrs, }; -static struct ccache_orangefs_attribute ccache_hard_limit_attribute = +static struct orangefs_attribute ccache_hard_limit_attribute = __ATTR(hard_limit, 0664, - service_ccache_show, - service_ccache_store); + sysfs_service_op_show, + sysfs_service_op_store); -static struct ccache_orangefs_attribute ccache_reclaim_percent_attribute = +static struct orangefs_attribute ccache_reclaim_percent_attribute = __ATTR(reclaim_percentage, 0664, - service_ccache_show, - service_ccache_store); + sysfs_service_op_show, + sysfs_service_op_store); -static struct ccache_orangefs_attribute ccache_soft_limit_attribute = +static struct orangefs_attribute ccache_soft_limit_attribute = __ATTR(soft_limit, 0664, - service_ccache_show, - service_ccache_store); + sysfs_service_op_show, + sysfs_service_op_store); -static struct ccache_orangefs_attribute ccache_timeout_secs_attribute = +static struct orangefs_attribute ccache_timeout_secs_attribute = __ATTR(timeout_secs, 0664, - service_ccache_show, - service_ccache_store); + sysfs_service_op_show, + sysfs_service_op_store); static struct attribute *ccache_orangefs_default_attrs[] = { &ccache_hard_limit_attribute.attr, @@ -1544,34 +965,33 @@ static struct attribute *ccache_orangefs_default_attrs[] = { }; static struct kobj_type ccache_orangefs_ktype = { - .sysfs_ops = &ccache_orangefs_sysfs_ops, - .release = ccache_orangefs_release, + .sysfs_ops = &orangefs_sysfs_ops, .default_attrs = ccache_orangefs_default_attrs, }; -static struct ncache_orangefs_attribute ncache_hard_limit_attribute = +static struct orangefs_attribute ncache_hard_limit_attribute = __ATTR(hard_limit, 0664, - service_ncache_show, - service_ncache_store); + sysfs_service_op_show, + sysfs_service_op_store); -static struct ncache_orangefs_attribute ncache_reclaim_percent_attribute = +static struct orangefs_attribute ncache_reclaim_percent_attribute = __ATTR(reclaim_percentage, 0664, - service_ncache_show, - service_ncache_store); + sysfs_service_op_show, + sysfs_service_op_store); -static struct ncache_orangefs_attribute ncache_soft_limit_attribute = +static struct orangefs_attribute ncache_soft_limit_attribute = __ATTR(soft_limit, 0664, - service_ncache_show, - service_ncache_store); + sysfs_service_op_show, + sysfs_service_op_store); -static struct ncache_orangefs_attribute ncache_timeout_msecs_attribute = +static struct orangefs_attribute ncache_timeout_msecs_attribute = __ATTR(timeout_msecs, 0664, - service_ncache_show, - service_ncache_store); + sysfs_service_op_show, + sysfs_service_op_store); static struct attribute *ncache_orangefs_default_attrs[] = { &ncache_hard_limit_attribute.attr, @@ -1582,27 +1002,26 @@ static struct attribute *ncache_orangefs_default_attrs[] = { }; static struct kobj_type ncache_orangefs_ktype = { - .sysfs_ops = &ncache_orangefs_sysfs_ops, - .release = ncache_orangefs_release, + .sysfs_ops = &orangefs_sysfs_ops, .default_attrs = ncache_orangefs_default_attrs, }; -static struct pc_orangefs_attribute pc_acache_attribute = +static struct orangefs_attribute pc_acache_attribute = __ATTR(acache, 0664, - service_pc_show, + sysfs_service_op_show, NULL); -static struct pc_orangefs_attribute pc_capcache_attribute = +static struct orangefs_attribute pc_capcache_attribute = __ATTR(capcache, 0664, - service_pc_show, + sysfs_service_op_show, NULL); -static struct pc_orangefs_attribute pc_ncache_attribute = +static struct orangefs_attribute pc_ncache_attribute = __ATTR(ncache, 0664, - service_pc_show, + sysfs_service_op_show, NULL); static struct attribute *pc_orangefs_default_attrs[] = { @@ -1613,21 +1032,20 @@ static struct attribute *pc_orangefs_default_attrs[] = { }; static struct kobj_type pc_orangefs_ktype = { - .sysfs_ops = &pc_orangefs_sysfs_ops, - .release = pc_orangefs_release, + .sysfs_ops = &orangefs_sysfs_ops, .default_attrs = pc_orangefs_default_attrs, }; -static struct stats_orangefs_attribute stats_reads_attribute = +static struct orangefs_attribute stats_reads_attribute = __ATTR(reads, 0664, - int_stats_show, + sysfs_int_show, NULL); -static struct stats_orangefs_attribute stats_writes_attribute = +static struct orangefs_attribute stats_writes_attribute = __ATTR(writes, 0664, - int_stats_show, + sysfs_int_show, NULL); static struct attribute *stats_orangefs_default_attrs[] = { @@ -1637,18 +1055,17 @@ static struct attribute *stats_orangefs_default_attrs[] = { }; static struct kobj_type stats_orangefs_ktype = { - .sysfs_ops = &stats_orangefs_sysfs_ops, - .release = stats_orangefs_release, + .sysfs_ops = &orangefs_sysfs_ops, .default_attrs = stats_orangefs_default_attrs, }; -static struct orangefs_obj *orangefs_obj; -static struct acache_orangefs_obj *acache_orangefs_obj; -static struct capcache_orangefs_obj *capcache_orangefs_obj; -static struct ccache_orangefs_obj *ccache_orangefs_obj; -static struct ncache_orangefs_obj *ncache_orangefs_obj; -static struct pc_orangefs_obj *pc_orangefs_obj; -static struct stats_orangefs_obj *stats_orangefs_obj; +static struct kobject *orangefs_obj; +static struct kobject *acache_orangefs_obj; +static struct kobject *capcache_orangefs_obj; +static struct kobject *ccache_orangefs_obj; +static struct kobject *ncache_orangefs_obj; +static struct kobject *pc_orangefs_obj; +static struct kobject *stats_orangefs_obj; int orangefs_sysfs_init(void) { @@ -1661,7 +1078,7 @@ int orangefs_sysfs_init(void) if (!orangefs_obj) goto out; - rc = kobject_init_and_add(&orangefs_obj->kobj, + rc = kobject_init_and_add(orangefs_obj, &orangefs_ktype, fs_kobj, ORANGEFS_KOBJ_ID); @@ -1669,7 +1086,7 @@ int orangefs_sysfs_init(void) if (rc) goto ofs_obj_bail; - kobject_uevent(&orangefs_obj->kobj, KOBJ_ADD); + kobject_uevent(orangefs_obj, KOBJ_ADD); /* create /sys/fs/orangefs/acache. */ acache_orangefs_obj = kzalloc(sizeof(*acache_orangefs_obj), GFP_KERNEL); @@ -1678,15 +1095,15 @@ int orangefs_sysfs_init(void) goto ofs_obj_bail; } - rc = kobject_init_and_add(&acache_orangefs_obj->kobj, + rc = kobject_init_and_add(acache_orangefs_obj, &acache_orangefs_ktype, - &orangefs_obj->kobj, + orangefs_obj, ACACHE_KOBJ_ID); if (rc) goto acache_obj_bail; - kobject_uevent(&acache_orangefs_obj->kobj, KOBJ_ADD); + kobject_uevent(acache_orangefs_obj, KOBJ_ADD); /* create /sys/fs/orangefs/capcache. */ capcache_orangefs_obj = @@ -1696,14 +1113,14 @@ int orangefs_sysfs_init(void) goto acache_obj_bail; } - rc = kobject_init_and_add(&capcache_orangefs_obj->kobj, + rc = kobject_init_and_add(capcache_orangefs_obj, &capcache_orangefs_ktype, - &orangefs_obj->kobj, + orangefs_obj, CAPCACHE_KOBJ_ID); if (rc) goto capcache_obj_bail; - kobject_uevent(&capcache_orangefs_obj->kobj, KOBJ_ADD); + kobject_uevent(capcache_orangefs_obj, KOBJ_ADD); /* create /sys/fs/orangefs/ccache. */ ccache_orangefs_obj = @@ -1713,14 +1130,14 @@ int orangefs_sysfs_init(void) goto capcache_obj_bail; } - rc = kobject_init_and_add(&ccache_orangefs_obj->kobj, + rc = kobject_init_and_add(ccache_orangefs_obj, &ccache_orangefs_ktype, - &orangefs_obj->kobj, + orangefs_obj, CCACHE_KOBJ_ID); if (rc) goto ccache_obj_bail; - kobject_uevent(&ccache_orangefs_obj->kobj, KOBJ_ADD); + kobject_uevent(ccache_orangefs_obj, KOBJ_ADD); /* create /sys/fs/orangefs/ncache. */ ncache_orangefs_obj = kzalloc(sizeof(*ncache_orangefs_obj), GFP_KERNEL); @@ -1729,15 +1146,15 @@ int orangefs_sysfs_init(void) goto ccache_obj_bail; } - rc = kobject_init_and_add(&ncache_orangefs_obj->kobj, + rc = kobject_init_and_add(ncache_orangefs_obj, &ncache_orangefs_ktype, - &orangefs_obj->kobj, + orangefs_obj, NCACHE_KOBJ_ID); if (rc) goto ncache_obj_bail; - kobject_uevent(&ncache_orangefs_obj->kobj, KOBJ_ADD); + kobject_uevent(ncache_orangefs_obj, KOBJ_ADD); /* create /sys/fs/orangefs/perf_counters. */ pc_orangefs_obj = kzalloc(sizeof(*pc_orangefs_obj), GFP_KERNEL); @@ -1746,15 +1163,15 @@ int orangefs_sysfs_init(void) goto ncache_obj_bail; } - rc = kobject_init_and_add(&pc_orangefs_obj->kobj, + rc = kobject_init_and_add(pc_orangefs_obj, &pc_orangefs_ktype, - &orangefs_obj->kobj, + orangefs_obj, "perf_counters"); if (rc) goto pc_obj_bail; - kobject_uevent(&pc_orangefs_obj->kobj, KOBJ_ADD); + kobject_uevent(pc_orangefs_obj, KOBJ_ADD); /* create /sys/fs/orangefs/stats. */ stats_orangefs_obj = kzalloc(sizeof(*stats_orangefs_obj), GFP_KERNEL); @@ -1763,37 +1180,31 @@ int orangefs_sysfs_init(void) goto pc_obj_bail; } - rc = kobject_init_and_add(&stats_orangefs_obj->kobj, + rc = kobject_init_and_add(stats_orangefs_obj, &stats_orangefs_ktype, - &orangefs_obj->kobj, + orangefs_obj, STATS_KOBJ_ID); if (rc) goto stats_obj_bail; - kobject_uevent(&stats_orangefs_obj->kobj, KOBJ_ADD); + kobject_uevent(stats_orangefs_obj, KOBJ_ADD); goto out; stats_obj_bail: - kobject_put(&stats_orangefs_obj->kobj); - + kobject_put(stats_orangefs_obj); pc_obj_bail: - kobject_put(&pc_orangefs_obj->kobj); - + kobject_put(pc_orangefs_obj); ncache_obj_bail: - kobject_put(&ncache_orangefs_obj->kobj); - + kobject_put(ncache_orangefs_obj); ccache_obj_bail: - kobject_put(&ccache_orangefs_obj->kobj); - + kobject_put(ccache_orangefs_obj); capcache_obj_bail: - kobject_put(&capcache_orangefs_obj->kobj); - + kobject_put(capcache_orangefs_obj); acache_obj_bail: - kobject_put(&acache_orangefs_obj->kobj); - + kobject_put(acache_orangefs_obj); ofs_obj_bail: - kobject_put(&orangefs_obj->kobj); + kobject_put(orangefs_obj); out: return rc; } @@ -1801,13 +1212,11 @@ out: void orangefs_sysfs_exit(void) { gossip_debug(GOSSIP_SYSFS_DEBUG, "orangefs_sysfs_exit: start\n"); - - kobject_put(&acache_orangefs_obj->kobj); - kobject_put(&capcache_orangefs_obj->kobj); - kobject_put(&ccache_orangefs_obj->kobj); - kobject_put(&ncache_orangefs_obj->kobj); - kobject_put(&pc_orangefs_obj->kobj); - kobject_put(&stats_orangefs_obj->kobj); - - kobject_put(&orangefs_obj->kobj); + kobject_put(acache_orangefs_obj); + kobject_put(capcache_orangefs_obj); + kobject_put(ccache_orangefs_obj); + kobject_put(ncache_orangefs_obj); + kobject_put(pc_orangefs_obj); + kobject_put(stats_orangefs_obj); + kobject_put(orangefs_obj); } diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index d13c729..06af81f 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -50,7 +50,7 @@ __s32 fsid_of_op(struct orangefs_kernel_op_s *op) case ORANGEFS_VFS_OP_TRUNCATE: fsid = op->upcall.req.truncate.refn.fs_id; break; - case ORANGEFS_VFS_OP_MMAP_RA_FLUSH: + case ORANGEFS_VFS_OP_RA_FLUSH: fsid = op->upcall.req.ra_cache_flush.refn.fs_id; break; case ORANGEFS_VFS_OP_FS_UMOUNT: @@ -347,7 +347,8 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass) inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) | orangefs_inode_perms(&new_op->downcall.resp.getattr.attributes); - orangefs_inode->getattr_time = jiffies + getattr_timeout_msecs*HZ/1000; + orangefs_inode->getattr_time = jiffies + + orangefs_getattr_timeout_msecs*HZ/1000; ret = 0; out: op_release(new_op); @@ -656,401 +657,3 @@ __s32 ORANGEFS_util_translate_mode(int mode) return ret; } #undef NUM_MODES - -/* - * After obtaining a string representation of the client's debug - * keywords and their associated masks, this function is called to build an - * array of these values. - */ -int orangefs_prepare_cdm_array(char *debug_array_string) -{ - int i; - int rc = -EINVAL; - char *cds_head = NULL; - char *cds_delimiter = NULL; - int keyword_len = 0; - - gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__); - - /* - * figure out how many elements the cdm_array needs. - */ - for (i = 0; i < strlen(debug_array_string); i++) - if (debug_array_string[i] == '\n') - cdm_element_count++; - - if (!cdm_element_count) { - pr_info("No elements in client debug array string!\n"); - goto out; - } - - cdm_array = - kzalloc(cdm_element_count * sizeof(struct client_debug_mask), - GFP_KERNEL); - if (!cdm_array) { - pr_info("malloc failed for cdm_array!\n"); - rc = -ENOMEM; - goto out; - } - - cds_head = debug_array_string; - - for (i = 0; i < cdm_element_count; i++) { - cds_delimiter = strchr(cds_head, '\n'); - *cds_delimiter = '\0'; - - keyword_len = strcspn(cds_head, " "); - - cdm_array[i].keyword = kzalloc(keyword_len + 1, GFP_KERNEL); - if (!cdm_array[i].keyword) { - rc = -ENOMEM; - goto out; - } - - sscanf(cds_head, - "%s %llx %llx", - cdm_array[i].keyword, - (unsigned long long *)&(cdm_array[i].mask1), - (unsigned long long *)&(cdm_array[i].mask2)); - - if (!strcmp(cdm_array[i].keyword, ORANGEFS_VERBOSE)) - client_verbose_index = i; - - if (!strcmp(cdm_array[i].keyword, ORANGEFS_ALL)) - client_all_index = i; - - cds_head = cds_delimiter + 1; - } - - rc = cdm_element_count; - - gossip_debug(GOSSIP_UTILS_DEBUG, "%s: rc:%d:\n", __func__, rc); - -out: - - return rc; - -} - -/* - * /sys/kernel/debug/orangefs/debug-help can be catted to - * see all the available kernel and client debug keywords. - * - * When the kernel boots, we have no idea what keywords the - * client supports, nor their associated masks. - * - * We pass through this function once at boot and stamp a - * boilerplate "we don't know" message for the client in the - * debug-help file. We pass through here again when the client - * starts and then we can fill out the debug-help file fully. - * - * The client might be restarted any number of times between - * reboots, we only build the debug-help file the first time. - */ -int orangefs_prepare_debugfs_help_string(int at_boot) -{ - int rc = -EINVAL; - int i; - int byte_count = 0; - char *client_title = "Client Debug Keywords:\n"; - char *kernel_title = "Kernel Debug Keywords:\n"; - - gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__); - - if (at_boot) { - byte_count += strlen(HELP_STRING_UNINITIALIZED); - client_title = HELP_STRING_UNINITIALIZED; - } else { - /* - * fill the client keyword/mask array and remember - * how many elements there were. - */ - cdm_element_count = - orangefs_prepare_cdm_array(client_debug_array_string); - if (cdm_element_count <= 0) - goto out; - - /* Count the bytes destined for debug_help_string. */ - byte_count += strlen(client_title); - - for (i = 0; i < cdm_element_count; i++) { - byte_count += strlen(cdm_array[i].keyword + 2); - if (byte_count >= DEBUG_HELP_STRING_SIZE) { - pr_info("%s: overflow 1!\n", __func__); - goto out; - } - } - - gossip_debug(GOSSIP_UTILS_DEBUG, - "%s: cdm_element_count:%d:\n", - __func__, - cdm_element_count); - } - - byte_count += strlen(kernel_title); - for (i = 0; i < num_kmod_keyword_mask_map; i++) { - byte_count += - strlen(s_kmod_keyword_mask_map[i].keyword + 2); - if (byte_count >= DEBUG_HELP_STRING_SIZE) { - pr_info("%s: overflow 2!\n", __func__); - goto out; - } - } - - /* build debug_help_string. */ - debug_help_string = kzalloc(DEBUG_HELP_STRING_SIZE, GFP_KERNEL); - if (!debug_help_string) { - rc = -ENOMEM; - goto out; - } - - strcat(debug_help_string, client_title); - - if (!at_boot) { - for (i = 0; i < cdm_element_count; i++) { - strcat(debug_help_string, "\t"); - strcat(debug_help_string, cdm_array[i].keyword); - strcat(debug_help_string, "\n"); - } - } - - strcat(debug_help_string, "\n"); - strcat(debug_help_string, kernel_title); - - for (i = 0; i < num_kmod_keyword_mask_map; i++) { - strcat(debug_help_string, "\t"); - strcat(debug_help_string, s_kmod_keyword_mask_map[i].keyword); - strcat(debug_help_string, "\n"); - } - - rc = 0; - -out: - - return rc; - -} - -/* - * kernel = type 0 - * client = type 1 - */ -void debug_mask_to_string(void *mask, int type) -{ - int i; - int len = 0; - char *debug_string; - int element_count = 0; - - gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__); - - if (type) { - debug_string = client_debug_string; - element_count = cdm_element_count; - } else { - debug_string = kernel_debug_string; - element_count = num_kmod_keyword_mask_map; - } - - memset(debug_string, 0, ORANGEFS_MAX_DEBUG_STRING_LEN); - - /* - * Some keywords, like "all" or "verbose", are amalgams of - * numerous other keywords. Make a special check for those - * before grinding through the whole mask only to find out - * later... - */ - if (check_amalgam_keyword(mask, type)) - goto out; - - /* Build the debug string. */ - for (i = 0; i < element_count; i++) - if (type) - do_c_string(mask, i); - else - do_k_string(mask, i); - - len = strlen(debug_string); - - if ((len) && (type)) - client_debug_string[len - 1] = '\0'; - else if (len) - kernel_debug_string[len - 1] = '\0'; - else if (type) - strcpy(client_debug_string, "none"); - else - strcpy(kernel_debug_string, "none"); - -out: -gossip_debug(GOSSIP_UTILS_DEBUG, "%s: string:%s:\n", __func__, debug_string); - - return; - -} - -void do_k_string(void *k_mask, int index) -{ - __u64 *mask = (__u64 *) k_mask; - - if (keyword_is_amalgam((char *) s_kmod_keyword_mask_map[index].keyword)) - goto out; - - if (*mask & s_kmod_keyword_mask_map[index].mask_val) { - if ((strlen(kernel_debug_string) + - strlen(s_kmod_keyword_mask_map[index].keyword)) - < ORANGEFS_MAX_DEBUG_STRING_LEN - 1) { - strcat(kernel_debug_string, - s_kmod_keyword_mask_map[index].keyword); - strcat(kernel_debug_string, ","); - } else { - gossip_err("%s: overflow!\n", __func__); - strcpy(kernel_debug_string, ORANGEFS_ALL); - goto out; - } - } - -out: - - return; -} - -void do_c_string(void *c_mask, int index) -{ - struct client_debug_mask *mask = (struct client_debug_mask *) c_mask; - - if (keyword_is_amalgam(cdm_array[index].keyword)) - goto out; - - if ((mask->mask1 & cdm_array[index].mask1) || - (mask->mask2 & cdm_array[index].mask2)) { - if ((strlen(client_debug_string) + - strlen(cdm_array[index].keyword) + 1) - < ORANGEFS_MAX_DEBUG_STRING_LEN - 2) { - strcat(client_debug_string, - cdm_array[index].keyword); - strcat(client_debug_string, ","); - } else { - gossip_err("%s: overflow!\n", __func__); - strcpy(client_debug_string, ORANGEFS_ALL); - goto out; - } - } -out: - return; -} - -int keyword_is_amalgam(char *keyword) -{ - int rc = 0; - - if ((!strcmp(keyword, ORANGEFS_ALL)) || (!strcmp(keyword, ORANGEFS_VERBOSE))) - rc = 1; - - return rc; -} - -/* - * kernel = type 0 - * client = type 1 - * - * return 1 if we found an amalgam. - */ -int check_amalgam_keyword(void *mask, int type) -{ - __u64 *k_mask; - struct client_debug_mask *c_mask; - int k_all_index = num_kmod_keyword_mask_map - 1; - int rc = 0; - - if (type) { - c_mask = (struct client_debug_mask *) mask; - - if ((c_mask->mask1 == cdm_array[client_all_index].mask1) && - (c_mask->mask2 == cdm_array[client_all_index].mask2)) { - strcpy(client_debug_string, ORANGEFS_ALL); - rc = 1; - goto out; - } - - if ((c_mask->mask1 == cdm_array[client_verbose_index].mask1) && - (c_mask->mask2 == cdm_array[client_verbose_index].mask2)) { - strcpy(client_debug_string, ORANGEFS_VERBOSE); - rc = 1; - goto out; - } - - } else { - k_mask = (__u64 *) mask; - - if (*k_mask >= s_kmod_keyword_mask_map[k_all_index].mask_val) { - strcpy(kernel_debug_string, ORANGEFS_ALL); - rc = 1; - goto out; - } - } - -out: - - return rc; -} - -/* - * kernel = type 0 - * client = type 1 - */ -void debug_string_to_mask(char *debug_string, void *mask, int type) -{ - char *unchecked_keyword; - int i; - char *strsep_fodder = kstrdup(debug_string, GFP_KERNEL); - char *original_pointer; - int element_count = 0; - struct client_debug_mask *c_mask; - __u64 *k_mask; - - gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__); - - if (type) { - c_mask = (struct client_debug_mask *)mask; - element_count = cdm_element_count; - } else { - k_mask = (__u64 *)mask; - *k_mask = 0; - element_count = num_kmod_keyword_mask_map; - } - - original_pointer = strsep_fodder; - while ((unchecked_keyword = strsep(&strsep_fodder, ","))) - if (strlen(unchecked_keyword)) { - for (i = 0; i < element_count; i++) - if (type) - do_c_mask(i, - unchecked_keyword, - &c_mask); - else - do_k_mask(i, - unchecked_keyword, - &k_mask); - } - - kfree(original_pointer); -} - -void do_c_mask(int i, - char *unchecked_keyword, - struct client_debug_mask **sane_mask) -{ - - if (!strcmp(cdm_array[i].keyword, unchecked_keyword)) { - (**sane_mask).mask1 = (**sane_mask).mask1 | cdm_array[i].mask1; - (**sane_mask).mask2 = (**sane_mask).mask2 | cdm_array[i].mask2; - } -} - -void do_k_mask(int i, char *unchecked_keyword, __u64 **sane_mask) -{ - - if (!strcmp(s_kmod_keyword_mask_map[i].keyword, unchecked_keyword)) - **sane_mask = (**sane_mask) | - s_kmod_keyword_mask_map[i].mask_val; -} diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h index 3d7418c..971307a 100644 --- a/fs/orangefs/protocol.h +++ b/fs/orangefs/protocol.h @@ -4,26 +4,6 @@ #include <linux/slab.h> #include <linux/ioctl.h> -extern struct client_debug_mask *cdm_array; -extern char *debug_help_string; -extern int help_string_initialized; -extern struct dentry *debug_dir; -extern struct dentry *help_file_dentry; -extern struct dentry *client_debug_dentry; -extern const struct file_operations debug_help_fops; -extern int client_all_index; -extern int client_verbose_index; -extern int cdm_element_count; -#define DEBUG_HELP_STRING_SIZE 4096 -#define HELP_STRING_UNINITIALIZED \ - "Client Debug Keywords are unknown until the first time\n" \ - "the client is started after boot.\n" -#define ORANGEFS_KMOD_DEBUG_HELP_FILE "debug-help" -#define ORANGEFS_KMOD_DEBUG_FILE "kernel-debug" -#define ORANGEFS_CLIENT_DEBUG_FILE "client-debug" -#define ORANGEFS_VERBOSE "verbose" -#define ORANGEFS_ALL "all" - /* pvfs2-config.h ***********************************************************/ #define ORANGEFS_VERSION_MAJOR 2 #define ORANGEFS_VERSION_MINOR 9 @@ -426,13 +406,12 @@ do { \ printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ } while (0) #else -extern __u64 gossip_debug_mask; -extern struct client_debug_mask client_debug_mask; +extern __u64 orangefs_gossip_debug_mask; /* try to avoid function call overhead by checking masks in macro */ #define gossip_debug(mask, fmt, ...) \ do { \ - if (gossip_debug_mask & (mask)) \ + if (orangefs_gossip_debug_mask & (mask)) \ printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ } while (0) #endif /* GOSSIP_DISABLE_DEBUG */ diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index b9da9a0..c48859f 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -33,6 +33,7 @@ static const match_table_t tokens = { { Opt_err, NULL } }; +uint64_t orangefs_features; static int parse_mount_options(struct super_block *sb, char *options, int silent) @@ -249,6 +250,19 @@ int orangefs_remount(struct orangefs_sb_info_s *orangefs_sb) } op_release(new_op); + + if (orangefs_userspace_version >= 20906) { + new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES); + if (!new_op) + return -ENOMEM; + new_op->upcall.req.features.features = 0; + ret = service_operation(new_op, "orangefs_features", 0); + orangefs_features = new_op->downcall.resp.features.features; + op_release(new_op); + } else { + orangefs_features = 0; + } + return ret; } @@ -492,6 +506,19 @@ struct dentry *orangefs_mount(struct file_system_type *fst, list_add_tail(&ORANGEFS_SB(sb)->list, &orangefs_superblocks); spin_unlock(&orangefs_superblocks_lock); op_release(new_op); + + if (orangefs_userspace_version >= 20906) { + new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES); + if (!new_op) + return ERR_PTR(-ENOMEM); + new_op->upcall.req.features.features = 0; + ret = service_operation(new_op, "orangefs_features", 0); + orangefs_features = new_op->downcall.resp.features.features; + op_release(new_op); + } else { + orangefs_features = 0; + } + return dget(sb->s_root); free_op: @@ -530,8 +557,8 @@ void orangefs_kill_sb(struct super_block *sb) * make sure that ORANGEFS_DEV_REMOUNT_ALL loop that might've seen us * gets completed before we free the dang thing. */ - mutex_lock(&request_mutex); - mutex_unlock(&request_mutex); + mutex_lock(&orangefs_request_mutex); + mutex_unlock(&orangefs_request_mutex); /* free the orangefs superblock private data */ kfree(ORANGEFS_SB(sb)); diff --git a/fs/orangefs/symlink.c b/fs/orangefs/symlink.c index 8fecf82..10b0b06 100644 --- a/fs/orangefs/symlink.c +++ b/fs/orangefs/symlink.c @@ -14,6 +14,5 @@ const struct inode_operations orangefs_symlink_inode_operations = { .setattr = orangefs_setattr, .getattr = orangefs_getattr, .listxattr = orangefs_listxattr, - .setxattr = generic_setxattr, .permission = orangefs_permission, }; diff --git a/fs/orangefs/upcall.h b/fs/orangefs/upcall.h index 001b202..af0b0e3 100644 --- a/fs/orangefs/upcall.h +++ b/fs/orangefs/upcall.h @@ -98,7 +98,7 @@ struct orangefs_truncate_request_s { __s64 size; }; -struct orangefs_mmap_ra_cache_flush_request_s { +struct orangefs_ra_cache_flush_request_s { struct orangefs_object_kref refn; }; @@ -179,12 +179,18 @@ enum orangefs_param_request_op { ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT = 23, ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE = 24, ORANGEFS_PARAM_REQUEST_OP_TWO_MASK_VALUES = 25, + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE = 26, + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT = 27, + ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE = 28, }; struct orangefs_param_request_s { enum orangefs_param_request_type type; enum orangefs_param_request_op op; - __s64 value; + union { + __s64 value64; + __s32 value32[2]; + } u; char s_value[ORANGEFS_MAX_DEBUG_STRING_LEN]; }; @@ -204,6 +210,11 @@ struct orangefs_fs_key_request_s { __s32 __pad1; }; +/* 2.9.6 */ +struct orangefs_features_request_s { + __u64 features; +}; + struct orangefs_upcall_s { __s32 type; __u32 uid; @@ -228,7 +239,7 @@ struct orangefs_upcall_s { struct orangefs_rename_request_s rename; struct orangefs_statfs_request_s statfs; struct orangefs_truncate_request_s truncate; - struct orangefs_mmap_ra_cache_flush_request_s ra_cache_flush; + struct orangefs_ra_cache_flush_request_s ra_cache_flush; struct orangefs_fs_mount_request_s fs_mount; struct orangefs_fs_umount_request_s fs_umount; struct orangefs_getxattr_request_s getxattr; @@ -240,6 +251,7 @@ struct orangefs_upcall_s { struct orangefs_param_request_s param; struct orangefs_perf_count_request_s perf_count; struct orangefs_fs_key_request_s fs_key; + struct orangefs_features_request_s features; } req; }; diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c index 31635bc..abcfa3f 100644 --- a/fs/orangefs/waitqueue.c +++ b/fs/orangefs/waitqueue.c @@ -87,9 +87,9 @@ retry_servicing: */ if (!(flags & ORANGEFS_OP_NO_MUTEX)) { if (flags & ORANGEFS_OP_INTERRUPTIBLE) - ret = mutex_lock_interruptible(&request_mutex); + ret = mutex_lock_interruptible(&orangefs_request_mutex); else - ret = mutex_lock_killable(&request_mutex); + ret = mutex_lock_killable(&orangefs_request_mutex); /* * check to see if we were interrupted while waiting for * mutex @@ -129,7 +129,7 @@ retry_servicing: spin_unlock(&orangefs_request_list_lock); if (!(flags & ORANGEFS_OP_NO_MUTEX)) - mutex_unlock(&request_mutex); + mutex_unlock(&orangefs_request_mutex); ret = wait_for_matching_downcall(op, timeout, flags & ORANGEFS_OP_INTERRUPTIBLE); @@ -272,9 +272,9 @@ static void } else if (op_state_in_progress(op)) { /* op must be removed from the in progress htable */ spin_unlock(&op->lock); - spin_lock(&htable_ops_in_progress_lock); + spin_lock(&orangefs_htable_ops_in_progress_lock); list_del_init(&op->list); - spin_unlock(&htable_ops_in_progress_lock); + spin_unlock(&orangefs_htable_ops_in_progress_lock); gossip_debug(GOSSIP_WAIT_DEBUG, "Interrupted: Removed op %p" " from htable_ops_in_progress\n", diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index 2a9f07f..74a81b1 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -73,6 +73,9 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name, "%s: name %s, buffer_size %zd\n", __func__, name, size); + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) { gossip_err("Invalid key length (%d)\n", (int)strlen(name)); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 43fdc27..aeb60f7 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -57,9 +57,10 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new) ssize_t list_size, size, value_size = 0; char *buf, *name, *value = NULL; int uninitialized_var(error); + size_t slen; - if (!old->d_inode->i_op->getxattr || - !new->d_inode->i_op->getxattr) + if (!(old->d_inode->i_opflags & IOP_XATTR) || + !(new->d_inode->i_opflags & IOP_XATTR)) return 0; list_size = vfs_listxattr(old, NULL, 0); @@ -79,7 +80,16 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new) goto out; } - for (name = buf; name < (buf + list_size); name += strlen(name) + 1) { + for (name = buf; list_size; name += slen) { + slen = strnlen(name, list_size) + 1; + + /* underlying fs providing us with an broken xattr list? */ + if (WARN_ON(slen > list_size)) { + error = -EIO; + break; + } + list_size -= slen; + if (ovl_is_private_xattr(name)) continue; retry: @@ -105,6 +115,13 @@ retry: goto retry; } + error = security_inode_copy_up_xattr(name); + if (error < 0 && error != -EOPNOTSUPP) + break; + if (error == 1) { + error = 0; + continue; /* Discard */ + } error = vfs_setxattr(new, name, value, size, 0); if (error) break; @@ -167,40 +184,6 @@ out_fput: return error; } -static char *ovl_read_symlink(struct dentry *realdentry) -{ - int res; - char *buf; - struct inode *inode = realdentry->d_inode; - mm_segment_t old_fs; - - res = -EINVAL; - if (!inode->i_op->readlink) - goto err; - - res = -ENOMEM; - buf = (char *) __get_free_page(GFP_KERNEL); - if (!buf) - goto err; - - old_fs = get_fs(); - set_fs(get_ds()); - /* The cast to a user pointer is valid due to the set_fs() */ - res = inode->i_op->readlink(realdentry, - (char __user *)buf, PAGE_SIZE - 1); - set_fs(old_fs); - if (res < 0) { - free_page((unsigned long) buf); - goto err; - } - buf[res] = '\0'; - - return buf; - -err: - return ERR_PTR(res); -} - static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat) { struct iattr attr = { @@ -248,6 +231,8 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, struct dentry *upper = NULL; umode_t mode = stat->mode; int err; + const struct cred *old_creds = NULL; + struct cred *new_creds = NULL; newdentry = ovl_lookup_temp(workdir, dentry); err = PTR_ERR(newdentry); @@ -260,10 +245,23 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, if (IS_ERR(upper)) goto out1; + err = security_inode_copy_up(dentry, &new_creds); + if (err < 0) + goto out2; + + if (new_creds) + old_creds = override_creds(new_creds); + /* Can't properly set mode on creation because of the umask */ stat->mode &= S_IFMT; err = ovl_create_real(wdir, newdentry, stat, link, NULL, true); stat->mode = mode; + + if (new_creds) { + revert_creds(old_creds); + put_cred(new_creds); + } + if (err) goto out2; @@ -332,19 +330,20 @@ out_cleanup: int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, struct path *lowerpath, struct kstat *stat) { + DEFINE_DELAYED_CALL(done); struct dentry *workdir = ovl_workdir(dentry); int err; struct kstat pstat; struct path parentpath; + struct dentry *lowerdentry = lowerpath->dentry; struct dentry *upperdir; struct dentry *upperdentry; - const struct cred *old_cred; - char *link = NULL; + const char *link = NULL; if (WARN_ON(!workdir)) return -EROFS; - ovl_do_check_copy_up(lowerpath->dentry); + ovl_do_check_copy_up(lowerdentry); ovl_path_upper(parent, &parentpath); upperdir = parentpath.dentry; @@ -354,13 +353,11 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, return err; if (S_ISLNK(stat->mode)) { - link = ovl_read_symlink(lowerpath->dentry); + link = vfs_get_link(lowerdentry, &done); if (IS_ERR(link)) return PTR_ERR(link); } - old_cred = ovl_override_creds(dentry->d_sb); - err = -EIO; if (lock_rename(workdir, upperdir) != NULL) { pr_err("overlayfs: failed to lock workdir+upperdir\n"); @@ -381,19 +378,16 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, } out_unlock: unlock_rename(workdir, upperdir); - revert_creds(old_cred); - - if (link) - free_page((unsigned long) link); + do_delayed_call(&done); return err; } int ovl_copy_up(struct dentry *dentry) { - int err; + int err = 0; + const struct cred *old_cred = ovl_override_creds(dentry->d_sb); - err = 0; while (!err) { struct dentry *next; struct dentry *parent; @@ -425,6 +419,7 @@ int ovl_copy_up(struct dentry *dentry) dput(parent); dput(next); } + revert_creds(old_cred); return err; } diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 1560fdc..306b6c1 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -14,6 +14,7 @@ #include <linux/cred.h> #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> +#include <linux/atomic.h> #include "overlayfs.h" void ovl_cleanup(struct inode *wdir, struct dentry *wdentry) @@ -37,8 +38,10 @@ struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry) { struct dentry *temp; char name[20]; + static atomic_t temp_id = ATOMIC_INIT(0); - snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry); + /* counter is allowed to wrap, since temp dentries are ephemeral */ + snprintf(name, sizeof(name), "#%x", atomic_inc_return(&temp_id)); temp = lookup_one_len(name, workdir, strlen(name)); if (!IS_ERR(temp) && temp->d_inode) { @@ -489,6 +492,15 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, if (override_cred) { override_cred->fsuid = inode->i_uid; override_cred->fsgid = inode->i_gid; + if (!hardlink) { + err = security_dentry_create_files_as(dentry, + stat->mode, &dentry->d_name, old_cred, + override_cred); + if (err) { + put_cred(override_cred); + goto out_revert_creds; + } + } put_cred(override_creds(override_cred)); put_cred(override_cred); @@ -499,6 +511,7 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, err = ovl_create_over_whiteout(dentry, inode, stat, link, hardlink); } +out_revert_creds: revert_creds(old_cred); if (!err) { struct inode *realinode = d_inode(ovl_dentry_upper(dentry)); @@ -996,17 +1009,14 @@ const struct inode_operations ovl_dir_inode_operations = { .symlink = ovl_symlink, .unlink = ovl_unlink, .rmdir = ovl_rmdir, - .rename2 = ovl_rename2, + .rename = ovl_rename2, .link = ovl_link, .setattr = ovl_setattr, .create = ovl_create, .mknod = ovl_mknod, .permission = ovl_permission, .getattr = ovl_dir_getattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = ovl_listxattr, - .removexattr = generic_removexattr, .get_acl = ovl_get_acl, .update_time = ovl_update_time, }; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index c75625c..c58f01b 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -19,6 +19,7 @@ static int ovl_copy_up_truncate(struct dentry *dentry) struct dentry *parent; struct kstat stat; struct path lowerpath; + const struct cred *old_cred; parent = dget_parent(dentry); err = ovl_copy_up(parent); @@ -26,12 +27,14 @@ static int ovl_copy_up_truncate(struct dentry *dentry) goto out_dput_parent; ovl_path_lower(dentry, &lowerpath); - err = vfs_getattr(&lowerpath, &stat); - if (err) - goto out_dput_parent; - stat.size = 0; - err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat); + old_cred = ovl_override_creds(dentry->d_sb); + err = vfs_getattr(&lowerpath, &stat); + if (!err) { + stat.size = 0; + err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat); + } + revert_creds(old_cred); out_dput_parent: dput(parent); @@ -53,7 +56,7 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) * inode_newsize_ok() will always check against MAX_LFS_FILESIZE and not * check for a swapfile (which this won't be anyway). */ - err = inode_change_ok(dentry->d_inode, attr); + err = setattr_prepare(dentry, attr); if (err) return err; @@ -153,45 +156,18 @@ static const char *ovl_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct dentry *realdentry; - struct inode *realinode; const struct cred *old_cred; const char *p; if (!dentry) return ERR_PTR(-ECHILD); - realdentry = ovl_dentry_real(dentry); - realinode = realdentry->d_inode; - - if (WARN_ON(!realinode->i_op->get_link)) - return ERR_PTR(-EPERM); - old_cred = ovl_override_creds(dentry->d_sb); - p = realinode->i_op->get_link(realdentry, realinode, done); + p = vfs_get_link(ovl_dentry_real(dentry), done); revert_creds(old_cred); return p; } -static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz) -{ - struct path realpath; - struct inode *realinode; - const struct cred *old_cred; - int err; - - ovl_path_real(dentry, &realpath); - realinode = realpath.dentry->d_inode; - - if (!realinode->i_op->readlink) - return -EINVAL; - - old_cred = ovl_override_creds(dentry->d_sb); - err = realinode->i_op->readlink(realpath.dentry, buf, bufsiz); - revert_creds(old_cred); - return err; -} - bool ovl_is_private_xattr(const char *name) { return strncmp(name, OVL_XATTR_PREFIX, @@ -367,10 +343,7 @@ static const struct inode_operations ovl_file_inode_operations = { .setattr = ovl_setattr, .permission = ovl_permission, .getattr = ovl_getattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = ovl_listxattr, - .removexattr = generic_removexattr, .get_acl = ovl_get_acl, .update_time = ovl_update_time, }; @@ -378,12 +351,9 @@ static const struct inode_operations ovl_file_inode_operations = { static const struct inode_operations ovl_symlink_inode_operations = { .setattr = ovl_setattr, .get_link = ovl_get_link, - .readlink = ovl_readlink, + .readlink = generic_readlink, .getattr = ovl_getattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = ovl_listxattr, - .removexattr = generic_removexattr, .update_time = ovl_update_time, }; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 5813ccf..e218e74 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -114,13 +114,13 @@ static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry, { int err; - pr_debug("rename2(%pd2, %pd2, 0x%x)\n", + pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags); err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags); if (err) { - pr_debug("...rename2(%pd2, %pd2, ...) = %i\n", + pr_debug("...rename(%pd2, %pd2, ...) = %i\n", olddentry, newdentry, err); } return err; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index e2a94a2..bcf3965 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -273,12 +273,11 @@ static bool ovl_is_opaquedir(struct dentry *dentry) { int res; char val; - struct inode *inode = dentry->d_inode; - if (!S_ISDIR(inode->i_mode) || !inode->i_op->getxattr) + if (!d_is_dir(dentry)) return false; - res = inode->i_op->getxattr(dentry, inode, OVL_XATTR_OPAQUE, &val, 1); + res = vfs_getxattr(dentry, OVL_XATTR_OPAQUE, &val, 1); if (res == 1 && val == 'y') return true; @@ -419,16 +418,12 @@ static bool ovl_dentry_weird(struct dentry *dentry) DCACHE_OP_COMPARE); } -static inline struct dentry *ovl_lookup_real(struct super_block *ovl_sb, - struct dentry *dir, +static inline struct dentry *ovl_lookup_real(struct dentry *dir, const struct qstr *name) { - const struct cred *old_cred; struct dentry *dentry; - old_cred = ovl_override_creds(ovl_sb); dentry = lookup_one_len_unlocked(name->name, dir, name->len); - revert_creds(old_cred); if (IS_ERR(dentry)) { if (PTR_ERR(dentry) == -ENOENT) @@ -469,6 +464,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct ovl_entry *oe; + const struct cred *old_cred; struct ovl_entry *poe = dentry->d_parent->d_fsdata; struct path *stack = NULL; struct dentry *upperdir, *upperdentry = NULL; @@ -479,9 +475,10 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int i; int err; + old_cred = ovl_override_creds(dentry->d_sb); upperdir = ovl_upperdentry_dereference(poe); if (upperdir) { - this = ovl_lookup_real(dentry->d_sb, upperdir, &dentry->d_name); + this = ovl_lookup_real(upperdir, &dentry->d_name); err = PTR_ERR(this); if (IS_ERR(this)) goto out; @@ -514,8 +511,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, bool opaque = false; struct path lowerpath = poe->lowerstack[i]; - this = ovl_lookup_real(dentry->d_sb, - lowerpath.dentry, &dentry->d_name); + this = ovl_lookup_real(lowerpath.dentry, &dentry->d_name); err = PTR_ERR(this); if (IS_ERR(this)) { /* @@ -588,6 +584,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, ovl_copyattr(realdentry->d_inode, inode); } + revert_creds(old_cred); oe->opaque = upperopaque; oe->__upperdentry = upperdentry; memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr); @@ -606,6 +603,7 @@ out_put: out_put_upper: dput(upperdentry); out: + revert_creds(old_cred); return ERR_PTR(err); } @@ -834,6 +832,19 @@ retry: if (err) goto out_dput; + /* + * Try to remove POSIX ACL xattrs from workdir. We are good if: + * + * a) success (there was a POSIX ACL xattr and was removed) + * b) -ENODATA (there was no POSIX ACL xattr) + * c) -EOPNOTSUPP (POSIX ACL xattrs are not supported) + * + * There are various other error values that could effectively + * mean that the xattr doesn't exist (e.g. -ERANGE is returned + * if the xattr name is too long), but the set of filesystems + * allowed as upper are limited to "normal" ones, where checking + * for the above two errors is sufficient. + */ err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_DEFAULT); if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; @@ -1292,6 +1303,12 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (!oe) goto out_put_cred; + sb->s_magic = OVERLAYFS_SUPER_MAGIC; + sb->s_op = &ovl_super_operations; + sb->s_xattr = ovl_xattr_handlers; + sb->s_fs_info = ufs; + sb->s_flags |= MS_POSIXACL | MS_NOREMOTELOCK; + root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR)); if (!root_dentry) goto out_free_oe; @@ -1315,12 +1332,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) ovl_inode_init(d_inode(root_dentry), realinode, !!upperpath.dentry); ovl_copyattr(realinode, d_inode(root_dentry)); - sb->s_magic = OVERLAYFS_SUPER_MAGIC; - sb->s_op = &ovl_super_operations; - sb->s_xattr = ovl_xattr_handlers; sb->s_root = root_dentry; - sb->s_fs_info = ufs; - sb->s_flags |= MS_POSIXACL; return 0; @@ -267,7 +267,6 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (bufs) { int curbuf = pipe->curbuf; struct pipe_buffer *buf = pipe->bufs + curbuf; - const struct pipe_buf_operations *ops = buf->ops; size_t chars = buf->len; size_t written; int error; @@ -275,7 +274,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (chars > total_len) chars = total_len; - error = ops->confirm(pipe, buf); + error = pipe_buf_confirm(pipe, buf); if (error) { if (!ret) ret = error; @@ -299,8 +298,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) } if (!buf->len) { - buf->ops = NULL; - ops->release(pipe, buf); + pipe_buf_release(pipe, buf); curbuf = (curbuf + 1) & (pipe->buffers - 1); pipe->curbuf = curbuf; pipe->nrbufs = --bufs; @@ -383,11 +381,10 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & (pipe->buffers - 1); struct pipe_buffer *buf = pipe->bufs + lastbuf; - const struct pipe_buf_operations *ops = buf->ops; int offset = buf->offset + buf->len; - if (ops->can_merge && offset + chars <= PAGE_SIZE) { - ret = ops->confirm(pipe, buf); + if (buf->ops->can_merge && offset + chars <= PAGE_SIZE) { + ret = pipe_buf_confirm(pipe, buf); if (ret) goto out; @@ -604,54 +601,63 @@ pipe_fasync(int fd, struct file *filp, int on) return retval; } -static void account_pipe_buffers(struct pipe_inode_info *pipe, +static unsigned long account_pipe_buffers(struct user_struct *user, unsigned long old, unsigned long new) { - atomic_long_add(new - old, &pipe->user->pipe_bufs); + return atomic_long_add_return(new - old, &user->pipe_bufs); } -static bool too_many_pipe_buffers_soft(struct user_struct *user) +static bool too_many_pipe_buffers_soft(unsigned long user_bufs) { - return pipe_user_pages_soft && - atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_soft; + return pipe_user_pages_soft && user_bufs >= pipe_user_pages_soft; } -static bool too_many_pipe_buffers_hard(struct user_struct *user) +static bool too_many_pipe_buffers_hard(unsigned long user_bufs) { - return pipe_user_pages_hard && - atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_hard; + return pipe_user_pages_hard && user_bufs >= pipe_user_pages_hard; } struct pipe_inode_info *alloc_pipe_info(void) { struct pipe_inode_info *pipe; + unsigned long pipe_bufs = PIPE_DEF_BUFFERS; + struct user_struct *user = get_current_user(); + unsigned long user_bufs; pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT); - if (pipe) { - unsigned long pipe_bufs = PIPE_DEF_BUFFERS; - struct user_struct *user = get_current_user(); - - if (!too_many_pipe_buffers_hard(user)) { - if (too_many_pipe_buffers_soft(user)) - pipe_bufs = 1; - pipe->bufs = kcalloc(pipe_bufs, - sizeof(struct pipe_buffer), - GFP_KERNEL_ACCOUNT); - } + if (pipe == NULL) + goto out_free_uid; - if (pipe->bufs) { - init_waitqueue_head(&pipe->wait); - pipe->r_counter = pipe->w_counter = 1; - pipe->buffers = pipe_bufs; - pipe->user = user; - account_pipe_buffers(pipe, 0, pipe_bufs); - mutex_init(&pipe->mutex); - return pipe; - } - free_uid(user); - kfree(pipe); + if (pipe_bufs * PAGE_SIZE > pipe_max_size && !capable(CAP_SYS_RESOURCE)) + pipe_bufs = pipe_max_size >> PAGE_SHIFT; + + user_bufs = account_pipe_buffers(user, 0, pipe_bufs); + + if (too_many_pipe_buffers_soft(user_bufs)) { + user_bufs = account_pipe_buffers(user, pipe_bufs, 1); + pipe_bufs = 1; + } + + if (too_many_pipe_buffers_hard(user_bufs)) + goto out_revert_acct; + + pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer), + GFP_KERNEL_ACCOUNT); + + if (pipe->bufs) { + init_waitqueue_head(&pipe->wait); + pipe->r_counter = pipe->w_counter = 1; + pipe->buffers = pipe_bufs; + pipe->user = user; + mutex_init(&pipe->mutex); + return pipe; } +out_revert_acct: + (void) account_pipe_buffers(user, pipe_bufs, 0); + kfree(pipe); +out_free_uid: + free_uid(user); return NULL; } @@ -659,12 +665,12 @@ void free_pipe_info(struct pipe_inode_info *pipe) { int i; - account_pipe_buffers(pipe, pipe->buffers, 0); + (void) account_pipe_buffers(pipe->user, pipe->buffers, 0); free_uid(pipe->user); for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) - buf->ops->release(pipe, buf); + pipe_buf_release(pipe, buf); } if (pipe->tmp_page) __free_page(pipe->tmp_page); @@ -716,7 +722,7 @@ static struct inode * get_pipe_inode(void) inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); return inode; @@ -1011,12 +1017,54 @@ const struct file_operations pipefifo_fops = { }; /* + * Currently we rely on the pipe array holding a power-of-2 number + * of pages. + */ +static inline unsigned int round_pipe_size(unsigned int size) +{ + unsigned long nr_pages; + + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; +} + +/* * Allocate a new array of pipe buffers and copy the info over. Returns the * pipe size if successful, or return -ERROR on error. */ -static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) +static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) { struct pipe_buffer *bufs; + unsigned int size, nr_pages; + unsigned long user_bufs; + long ret = 0; + + size = round_pipe_size(arg); + nr_pages = size >> PAGE_SHIFT; + + if (!nr_pages) + return -EINVAL; + + /* + * If trying to increase the pipe capacity, check that an + * unprivileged user is not trying to exceed various limits + * (soft limit check here, hard limit check just below). + * Decreasing the pipe capacity is always permitted, even + * if the user is currently over a limit. + */ + if (nr_pages > pipe->buffers && + size > pipe_max_size && !capable(CAP_SYS_RESOURCE)) + return -EPERM; + + user_bufs = account_pipe_buffers(pipe->user, pipe->buffers, nr_pages); + + if (nr_pages > pipe->buffers && + (too_many_pipe_buffers_hard(user_bufs) || + too_many_pipe_buffers_soft(user_bufs)) && + !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_revert_acct; + } /* * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't @@ -1024,13 +1072,17 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) * again like we would do for growing. If the pipe currently * contains more buffers than arg, then return busy. */ - if (nr_pages < pipe->nrbufs) - return -EBUSY; + if (nr_pages < pipe->nrbufs) { + ret = -EBUSY; + goto out_revert_acct; + } bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); - if (unlikely(!bufs)) - return -ENOMEM; + if (unlikely(!bufs)) { + ret = -ENOMEM; + goto out_revert_acct; + } /* * The pipe array wraps around, so just start the new one at zero @@ -1053,24 +1105,15 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer)); } - account_pipe_buffers(pipe, pipe->buffers, nr_pages); pipe->curbuf = 0; kfree(pipe->bufs); pipe->bufs = bufs; pipe->buffers = nr_pages; return nr_pages * PAGE_SIZE; -} - -/* - * Currently we rely on the pipe array holding a power-of-2 number - * of pages. - */ -static inline unsigned int round_pipe_size(unsigned int size) -{ - unsigned long nr_pages; - nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; +out_revert_acct: + (void) account_pipe_buffers(pipe->user, nr_pages, pipe->buffers); + return ret; } /* @@ -1112,28 +1155,9 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) __pipe_lock(pipe); switch (cmd) { - case F_SETPIPE_SZ: { - unsigned int size, nr_pages; - - size = round_pipe_size(arg); - nr_pages = size >> PAGE_SHIFT; - - ret = -EINVAL; - if (!nr_pages) - goto out; - - if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) { - ret = -EPERM; - goto out; - } else if ((too_many_pipe_buffers_hard(pipe->user) || - too_many_pipe_buffers_soft(pipe->user)) && - !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) { - ret = -EPERM; - goto out; - } - ret = pipe_set_size(pipe, nr_pages); + case F_SETPIPE_SZ: + ret = pipe_set_size(pipe, arg); break; - } case F_GETPIPE_SZ: ret = pipe->buffers * PAGE_SIZE; break; @@ -1142,7 +1166,6 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) break; } -out: __pipe_unlock(pipe); return ret; } @@ -259,7 +259,7 @@ static int propagate_one(struct mount *m) read_sequnlock_excl(&mount_lock); } hlist_add_head(&child->mnt_hash, list); - return 0; + return count_mounts(m->mnt_ns, child); } /* @@ -52,4 +52,5 @@ void mnt_set_mountpoint(struct mount *, struct mountpoint *, struct mount *copy_tree(struct mount *, struct dentry *, int); bool is_path_reachable(struct mount *, struct dentry *, const struct path *root); +int count_mounts(struct mnt_namespace *ns, struct mount *mnt); #endif /* _LINUX_PNODE_H */ diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 59d47ab0..5955220 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -598,13 +598,14 @@ posix_acl_create(struct inode *dir, umode_t *mode, if (IS_ERR(p)) return PTR_ERR(p); + ret = -ENOMEM; clone = posix_acl_clone(p, GFP_NOFS); if (!clone) - goto no_mem; + goto err_release; ret = posix_acl_create_masq(clone, mode); if (ret < 0) - goto no_mem_clone; + goto err_release_clone; if (ret == 0) posix_acl_release(clone); @@ -618,14 +619,45 @@ posix_acl_create(struct inode *dir, umode_t *mode, return 0; -no_mem_clone: +err_release_clone: posix_acl_release(clone); -no_mem: +err_release: posix_acl_release(p); - return -ENOMEM; + return ret; } EXPORT_SYMBOL_GPL(posix_acl_create); +/** + * posix_acl_update_mode - update mode in set_acl + * + * Update the file mode when setting an ACL: compute the new file permission + * bits based on the ACL. In addition, if the ACL is equivalent to the new + * file mode, set *acl to NULL to indicate that no ACL should be set. + * + * As with chmod, clear the setgit bit if the caller is not in the owning group + * or capable of CAP_FSETID (see inode_change_ok). + * + * Called from set_acl inode operations. + */ +int posix_acl_update_mode(struct inode *inode, umode_t *mode_p, + struct posix_acl **acl) +{ + umode_t mode = inode->i_mode; + int error; + + error = posix_acl_equiv_mode(*acl, &mode); + if (error < 0) + return error; + if (error == 0) + *acl = NULL; + if (!in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + mode &= ~S_ISGID; + *mode_p = mode; + return 0; +} +EXPORT_SYMBOL(posix_acl_update_mode); + /* * Fix up the uids and gids in posix acl extended attributes in place. */ @@ -633,15 +665,15 @@ static void posix_acl_fix_xattr_userns( struct user_namespace *to, struct user_namespace *from, void *value, size_t size) { - posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; - posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; + struct posix_acl_xattr_header *header = value; + struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; int count; kuid_t uid; kgid_t gid; if (!value) return; - if (size < sizeof(posix_acl_xattr_header)) + if (size < sizeof(struct posix_acl_xattr_header)) return; if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) return; @@ -691,15 +723,15 @@ struct posix_acl * posix_acl_from_xattr(struct user_namespace *user_ns, const void *value, size_t size) { - posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; - posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; + const struct posix_acl_xattr_header *header = value; + const struct posix_acl_xattr_entry *entry = (const void *)(header + 1), *end; int count; struct posix_acl *acl; struct posix_acl_entry *acl_e; if (!value) return NULL; - if (size < sizeof(posix_acl_xattr_header)) + if (size < sizeof(struct posix_acl_xattr_header)) return ERR_PTR(-EINVAL); if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) return ERR_PTR(-EOPNOTSUPP); @@ -760,8 +792,8 @@ int posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, void *buffer, size_t size) { - posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer; - posix_acl_xattr_entry *ext_entry; + struct posix_acl_xattr_header *ext_acl = buffer; + struct posix_acl_xattr_entry *ext_entry; int real_size, n; real_size = posix_acl_xattr_size(acl->a_count); @@ -770,7 +802,7 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, if (real_size > size) return -ERANGE; - ext_entry = ext_acl->a_entries; + ext_entry = (void *)(ext_acl + 1); ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); for (n=0; n < acl->a_count; n++, ext_entry++) { @@ -897,7 +929,7 @@ int simple_set_acl(struct inode *inode, struct posix_acl *acl, int type) acl = NULL; } - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); set_cached_acl(inode, type, acl); return 0; } diff --git a/fs/proc/array.c b/fs/proc/array.c index 88c7de1..81818ad 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -186,51 +186,45 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, task_unlock(p); rcu_read_unlock(); - seq_printf(m, - "State:\t%s\n" - "Tgid:\t%d\n" - "Ngid:\t%d\n" - "Pid:\t%d\n" - "PPid:\t%d\n" - "TracerPid:\t%d\n" - "Uid:\t%d\t%d\t%d\t%d\n" - "Gid:\t%d\t%d\t%d\t%d\n" - "FDSize:\t%d\nGroups:\t", - get_task_state(p), - tgid, ngid, pid_nr_ns(pid, ns), ppid, tpid, - from_kuid_munged(user_ns, cred->uid), - from_kuid_munged(user_ns, cred->euid), - from_kuid_munged(user_ns, cred->suid), - from_kuid_munged(user_ns, cred->fsuid), - from_kgid_munged(user_ns, cred->gid), - from_kgid_munged(user_ns, cred->egid), - from_kgid_munged(user_ns, cred->sgid), - from_kgid_munged(user_ns, cred->fsgid), - max_fds); - + seq_printf(m, "State:\t%s", get_task_state(p)); + + seq_put_decimal_ull(m, "\nTgid:\t", tgid); + seq_put_decimal_ull(m, "\nNgid:\t", ngid); + seq_put_decimal_ull(m, "\nPid:\t", pid_nr_ns(pid, ns)); + seq_put_decimal_ull(m, "\nPPid:\t", ppid); + seq_put_decimal_ull(m, "\nTracerPid:\t", tpid); + seq_put_decimal_ull(m, "\nUid:\t", from_kuid_munged(user_ns, cred->uid)); + seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->euid)); + seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->suid)); + seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->fsuid)); + seq_put_decimal_ull(m, "\nGid:\t", from_kgid_munged(user_ns, cred->gid)); + seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->egid)); + seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->sgid)); + seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->fsgid)); + seq_put_decimal_ull(m, "\nFDSize:\t", max_fds); + + seq_puts(m, "\nGroups:\t"); group_info = cred->group_info; for (g = 0; g < group_info->ngroups; g++) - seq_printf(m, "%d ", - from_kgid_munged(user_ns, GROUP_AT(group_info, g))); + seq_put_decimal_ull(m, g ? " " : "", + from_kgid_munged(user_ns, group_info->gid[g])); put_cred(cred); + /* Trailing space shouldn't have been added in the first place. */ + seq_putc(m, ' '); #ifdef CONFIG_PID_NS seq_puts(m, "\nNStgid:"); for (g = ns->level; g <= pid->level; g++) - seq_printf(m, "\t%d", - task_tgid_nr_ns(p, pid->numbers[g].ns)); + seq_put_decimal_ull(m, "\t", task_tgid_nr_ns(p, pid->numbers[g].ns)); seq_puts(m, "\nNSpid:"); for (g = ns->level; g <= pid->level; g++) - seq_printf(m, "\t%d", - task_pid_nr_ns(p, pid->numbers[g].ns)); + seq_put_decimal_ull(m, "\t", task_pid_nr_ns(p, pid->numbers[g].ns)); seq_puts(m, "\nNSpgid:"); for (g = ns->level; g <= pid->level; g++) - seq_printf(m, "\t%d", - task_pgrp_nr_ns(p, pid->numbers[g].ns)); + seq_put_decimal_ull(m, "\t", task_pgrp_nr_ns(p, pid->numbers[g].ns)); seq_puts(m, "\nNSsid:"); for (g = ns->level; g <= pid->level; g++) - seq_printf(m, "\t%d", - task_session_nr_ns(p, pid->numbers[g].ns)); + seq_put_decimal_ull(m, "\t", task_session_nr_ns(p, pid->numbers[g].ns)); #endif seq_putc(m, '\n'); } @@ -299,11 +293,12 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) unlock_task_sighand(p, &flags); } - seq_printf(m, "Threads:\t%d\n", num_threads); - seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim); + seq_put_decimal_ull(m, "Threads:\t", num_threads); + seq_put_decimal_ull(m, "\nSigQ:\t", qsize); + seq_put_decimal_ull(m, "/", qlim); /* render them all */ - render_sigset_t(m, "SigPnd:\t", &pending); + render_sigset_t(m, "\nSigPnd:\t", &pending); render_sigset_t(m, "ShdPnd:\t", &shpending); render_sigset_t(m, "SigBlk:\t", &blocked); render_sigset_t(m, "SigIgn:\t", &ignored); @@ -348,17 +343,17 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) static inline void task_seccomp(struct seq_file *m, struct task_struct *p) { #ifdef CONFIG_SECCOMP - seq_printf(m, "Seccomp:\t%d\n", p->seccomp.mode); + seq_put_decimal_ull(m, "Seccomp:\t", p->seccomp.mode); + seq_putc(m, '\n'); #endif } static inline void task_context_switch_counts(struct seq_file *m, struct task_struct *p) { - seq_printf(m, "voluntary_ctxt_switches:\t%lu\n" - "nonvoluntary_ctxt_switches:\t%lu\n", - p->nvcsw, - p->nivcsw); + seq_put_decimal_ull(m, "voluntary_ctxt_switches:\t", p->nvcsw); + seq_put_decimal_ull(m, "\nnonvoluntary_ctxt_switches:\t", p->nivcsw); + seq_putc(m, '\n'); } static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) @@ -417,10 +412,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, mm = get_task_mm(task); if (mm) { vsize = task_vsize(mm); - if (permitted) { - eip = KSTK_EIP(task); - esp = KSTK_ESP(task); - } + /* + * esp and eip are intentionally zeroed out. There is no + * non-racy way to read them without freezing the task. + * Programs that need reliable values can use ptrace(2). + */ } get_task_comm(tcomm, task); @@ -490,41 +486,41 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, start_time = nsec_to_clock_t(task->real_start_time); seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state); - seq_put_decimal_ll(m, ' ', ppid); - seq_put_decimal_ll(m, ' ', pgid); - seq_put_decimal_ll(m, ' ', sid); - seq_put_decimal_ll(m, ' ', tty_nr); - seq_put_decimal_ll(m, ' ', tty_pgrp); - seq_put_decimal_ull(m, ' ', task->flags); - seq_put_decimal_ull(m, ' ', min_flt); - seq_put_decimal_ull(m, ' ', cmin_flt); - seq_put_decimal_ull(m, ' ', maj_flt); - seq_put_decimal_ull(m, ' ', cmaj_flt); - seq_put_decimal_ull(m, ' ', cputime_to_clock_t(utime)); - seq_put_decimal_ull(m, ' ', cputime_to_clock_t(stime)); - seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cutime)); - seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cstime)); - seq_put_decimal_ll(m, ' ', priority); - seq_put_decimal_ll(m, ' ', nice); - seq_put_decimal_ll(m, ' ', num_threads); - seq_put_decimal_ull(m, ' ', 0); - seq_put_decimal_ull(m, ' ', start_time); - seq_put_decimal_ull(m, ' ', vsize); - seq_put_decimal_ull(m, ' ', mm ? get_mm_rss(mm) : 0); - seq_put_decimal_ull(m, ' ', rsslim); - seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->start_code : 1) : 0); - seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->end_code : 1) : 0); - seq_put_decimal_ull(m, ' ', (permitted && mm) ? mm->start_stack : 0); - seq_put_decimal_ull(m, ' ', esp); - seq_put_decimal_ull(m, ' ', eip); + seq_put_decimal_ll(m, " ", ppid); + seq_put_decimal_ll(m, " ", pgid); + seq_put_decimal_ll(m, " ", sid); + seq_put_decimal_ll(m, " ", tty_nr); + seq_put_decimal_ll(m, " ", tty_pgrp); + seq_put_decimal_ull(m, " ", task->flags); + seq_put_decimal_ull(m, " ", min_flt); + seq_put_decimal_ull(m, " ", cmin_flt); + seq_put_decimal_ull(m, " ", maj_flt); + seq_put_decimal_ull(m, " ", cmaj_flt); + seq_put_decimal_ull(m, " ", cputime_to_clock_t(utime)); + seq_put_decimal_ull(m, " ", cputime_to_clock_t(stime)); + seq_put_decimal_ll(m, " ", cputime_to_clock_t(cutime)); + seq_put_decimal_ll(m, " ", cputime_to_clock_t(cstime)); + seq_put_decimal_ll(m, " ", priority); + seq_put_decimal_ll(m, " ", nice); + seq_put_decimal_ll(m, " ", num_threads); + seq_put_decimal_ull(m, " ", 0); + seq_put_decimal_ull(m, " ", start_time); + seq_put_decimal_ull(m, " ", vsize); + seq_put_decimal_ull(m, " ", mm ? get_mm_rss(mm) : 0); + seq_put_decimal_ull(m, " ", rsslim); + seq_put_decimal_ull(m, " ", mm ? (permitted ? mm->start_code : 1) : 0); + seq_put_decimal_ull(m, " ", mm ? (permitted ? mm->end_code : 1) : 0); + seq_put_decimal_ull(m, " ", (permitted && mm) ? mm->start_stack : 0); + seq_put_decimal_ull(m, " ", esp); + seq_put_decimal_ull(m, " ", eip); /* The signal information here is obsolete. * It must be decimal for Linux 2.0 compatibility. * Use /proc/#/status for real-time signals. */ - seq_put_decimal_ull(m, ' ', task->pending.signal.sig[0] & 0x7fffffffUL); - seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL); - seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL); - seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, " ", task->pending.signal.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, " ", task->blocked.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, " ", sigign.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, " ", sigcatch.sig[0] & 0x7fffffffUL); /* * We used to output the absolute kernel address, but that's an @@ -538,31 +534,31 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, else seq_puts(m, " 0"); - seq_put_decimal_ull(m, ' ', 0); - seq_put_decimal_ull(m, ' ', 0); - seq_put_decimal_ll(m, ' ', task->exit_signal); - seq_put_decimal_ll(m, ' ', task_cpu(task)); - seq_put_decimal_ull(m, ' ', task->rt_priority); - seq_put_decimal_ull(m, ' ', task->policy); - seq_put_decimal_ull(m, ' ', delayacct_blkio_ticks(task)); - seq_put_decimal_ull(m, ' ', cputime_to_clock_t(gtime)); - seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cgtime)); + seq_put_decimal_ull(m, " ", 0); + seq_put_decimal_ull(m, " ", 0); + seq_put_decimal_ll(m, " ", task->exit_signal); + seq_put_decimal_ll(m, " ", task_cpu(task)); + seq_put_decimal_ull(m, " ", task->rt_priority); + seq_put_decimal_ull(m, " ", task->policy); + seq_put_decimal_ull(m, " ", delayacct_blkio_ticks(task)); + seq_put_decimal_ull(m, " ", cputime_to_clock_t(gtime)); + seq_put_decimal_ll(m, " ", cputime_to_clock_t(cgtime)); if (mm && permitted) { - seq_put_decimal_ull(m, ' ', mm->start_data); - seq_put_decimal_ull(m, ' ', mm->end_data); - seq_put_decimal_ull(m, ' ', mm->start_brk); - seq_put_decimal_ull(m, ' ', mm->arg_start); - seq_put_decimal_ull(m, ' ', mm->arg_end); - seq_put_decimal_ull(m, ' ', mm->env_start); - seq_put_decimal_ull(m, ' ', mm->env_end); + seq_put_decimal_ull(m, " ", mm->start_data); + seq_put_decimal_ull(m, " ", mm->end_data); + seq_put_decimal_ull(m, " ", mm->start_brk); + seq_put_decimal_ull(m, " ", mm->arg_start); + seq_put_decimal_ull(m, " ", mm->arg_end); + seq_put_decimal_ull(m, " ", mm->env_start); + seq_put_decimal_ull(m, " ", mm->env_end); } else - seq_printf(m, " 0 0 0 0 0 0 0"); + seq_puts(m, " 0 0 0 0 0 0 0"); if (permitted) - seq_put_decimal_ll(m, ' ', task->exit_code); + seq_put_decimal_ll(m, " ", task->exit_code); else - seq_put_decimal_ll(m, ' ', 0); + seq_puts(m, " 0"); seq_putc(m, '\n'); if (mm) @@ -598,13 +594,13 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", * size, resident, shared, text, data); */ - seq_put_decimal_ull(m, 0, size); - seq_put_decimal_ull(m, ' ', resident); - seq_put_decimal_ull(m, ' ', shared); - seq_put_decimal_ull(m, ' ', text); - seq_put_decimal_ull(m, ' ', 0); - seq_put_decimal_ull(m, ' ', data); - seq_put_decimal_ull(m, ' ', 0); + seq_put_decimal_ull(m, "", size); + seq_put_decimal_ull(m, " ", resident); + seq_put_decimal_ull(m, " ", shared); + seq_put_decimal_ull(m, " ", text); + seq_put_decimal_ull(m, " ", 0); + seq_put_decimal_ull(m, " ", data); + seq_put_decimal_ull(m, " ", 0); seq_putc(m, '\n'); return 0; diff --git a/fs/proc/base.c b/fs/proc/base.c index ac0df4d..ca651ac 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -400,23 +400,6 @@ static const struct file_operations proc_pid_cmdline_ops = { .llseek = generic_file_llseek, }; -static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *task) -{ - struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); - if (mm && !IS_ERR(mm)) { - unsigned int nwords = 0; - do { - nwords += 2; - } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ - seq_write(m, mm->saved_auxv, nwords * sizeof(mm->saved_auxv[0])); - mmput(mm); - return 0; - } else - return PTR_ERR(mm); -} - - #ifdef CONFIG_KALLSYMS /* * Provides a wchan file via kallsyms in a proper one-value-per-file format. @@ -483,7 +466,7 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, save_stack_trace_tsk(task, &trace); for (i = 0; i < trace.nr_entries; i++) { - seq_printf(m, "[<%pK>] %pS\n", + seq_printf(m, "[<%pK>] %pB\n", (void *)entries[i], (void *)entries[i]); } unlock_trace(task); @@ -709,7 +692,7 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_MODE) return -EPERM; - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); if (error) return error; @@ -849,6 +832,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf, unsigned long addr = *ppos; ssize_t copied; char *page; + unsigned int flags; if (!mm) return 0; @@ -861,6 +845,11 @@ static ssize_t mem_rw(struct file *file, char __user *buf, if (!atomic_inc_not_zero(&mm->mm_users)) goto free; + /* Maybe we should limit FOLL_FORCE to actual ptrace users? */ + flags = FOLL_FORCE; + if (write) + flags |= FOLL_WRITE; + while (count > 0) { int this_len = min_t(int, count, PAGE_SIZE); @@ -869,7 +858,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf, break; } - this_len = access_remote_vm(mm, addr, page, this_len, write); + this_len = access_remote_vm(mm, addr, page, this_len, flags); if (!this_len) { if (!copied) copied = -EIO; @@ -981,8 +970,7 @@ static ssize_t environ_read(struct file *file, char __user *buf, max_len = min_t(size_t, PAGE_SIZE, count); this_len = min(max_len, this_len); - retval = access_remote_vm(mm, (env_start + src), - page, this_len, 0); + retval = access_remote_vm(mm, (env_start + src), page, this_len, 0); if (retval <= 0) { ret = retval; @@ -1014,6 +1002,33 @@ static const struct file_operations proc_environ_operations = { .release = mem_release, }; +static int auxv_open(struct inode *inode, struct file *file) +{ + return __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS); +} + +static ssize_t auxv_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct mm_struct *mm = file->private_data; + unsigned int nwords = 0; + + if (!mm) + return 0; + do { + nwords += 2; + } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ + return simple_read_from_buffer(buf, count, ppos, mm->saved_auxv, + nwords * sizeof(mm->saved_auxv[0])); +} + +static const struct file_operations proc_auxv_operations = { + .open = auxv_open, + .read = auxv_read, + .llseek = generic_file_llseek, + .release = mem_release, +}; + static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -1664,7 +1679,7 @@ struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *t /* Common stuff */ ei = PROC_I(inode); inode->i_ino = get_next_ino(); - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_op = &proc_def_inode_operations; /* @@ -2280,16 +2295,27 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, if (!p) return -ESRCH; - if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) { - task_lock(p); - if (slack_ns == 0) - p->timer_slack_ns = p->default_timer_slack_ns; - else - p->timer_slack_ns = slack_ns; - task_unlock(p); - } else - count = -EPERM; + if (p != current) { + if (!capable(CAP_SYS_NICE)) { + count = -EPERM; + goto out; + } + + err = security_task_setscheduler(p); + if (err) { + count = err; + goto out; + } + } + + task_lock(p); + if (slack_ns == 0) + p->timer_slack_ns = p->default_timer_slack_ns; + else + p->timer_slack_ns = slack_ns; + task_unlock(p); +out: put_task_struct(p); return count; @@ -2299,19 +2325,28 @@ static int timerslack_ns_show(struct seq_file *m, void *v) { struct inode *inode = m->private; struct task_struct *p; - int err = 0; + int err = 0; p = get_proc_task(inode); if (!p) return -ESRCH; - if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) { - task_lock(p); - seq_printf(m, "%llu\n", p->timer_slack_ns); - task_unlock(p); - } else - err = -EPERM; + if (p != current) { + if (!capable(CAP_SYS_NICE)) { + err = -EPERM; + goto out; + } + err = security_task_getscheduler(p); + if (err) + goto out; + } + + task_lock(p); + seq_printf(m, "%llu\n", p->timer_slack_ns); + task_unlock(p); + +out: put_task_struct(p); return err; @@ -2822,7 +2857,7 @@ static const struct pid_entry tgid_base_stuff[] = { DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), #endif REG("environ", S_IRUSR, proc_environ_operations), - ONE("auxv", S_IRUSR, proc_pid_auxv), + REG("auxv", S_IRUSR, proc_auxv_operations), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), @@ -3210,7 +3245,7 @@ static const struct pid_entry tid_base_stuff[] = { DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), #endif REG("environ", S_IRUSR, proc_environ_operations), - ONE("auxv", S_IRUSR, proc_pid_auxv), + REG("auxv", S_IRUSR, proc_auxv_operations), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 01df23c..d21dafe 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -31,7 +31,7 @@ static int seq_show(struct seq_file *m, void *v) put_task_struct(task); if (files) { - int fd = proc_fd(m->private); + unsigned int fd = proc_fd(m->private); spin_lock(&files->file_lock); file = fcheck_files(files, fd); @@ -86,7 +86,7 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) struct task_struct *task; const struct cred *cred; struct inode *inode; - int fd; + unsigned int fd; if (flags & LOOKUP_RCU) return -ECHILD; @@ -158,7 +158,7 @@ static int proc_fd_link(struct dentry *dentry, struct path *path) } if (files) { - int fd = proc_fd(d_inode(dentry)); + unsigned int fd = proc_fd(d_inode(dentry)); struct file *fd_file; spin_lock(&files->file_lock); @@ -253,7 +253,7 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, continue; rcu_read_unlock(); - len = snprintf(name, sizeof(name), "%d", fd); + len = snprintf(name, sizeof(name), "%u", fd); if (!proc_fill_cache(file, ctx, name, len, instantiate, p, (void *)(unsigned long)fd)) diff --git a/fs/proc/fd.h b/fs/proc/fd.h index 7c047f2..46dafad 100644 --- a/fs/proc/fd.h +++ b/fs/proc/fd.h @@ -11,7 +11,7 @@ extern const struct inode_operations proc_fdinfo_inode_operations; extern int proc_fd_permission(struct inode *inode, int mask); -static inline int proc_fd(struct inode *inode) +static inline unsigned int proc_fd(struct inode *inode) { return PROC_I(inode)->fd; } diff --git a/fs/proc/generic.c b/fs/proc/generic.c index c633476..5f2dc20 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -105,7 +105,7 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) struct proc_dir_entry *de = PDE(inode); int error; - error = inode_change_ok(inode, iattr); + error = setattr_prepare(dentry, iattr); if (error) return error; @@ -390,6 +390,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, atomic_set(&ent->count, 1); spin_lock_init(&ent->pde_unload_lock); INIT_LIST_HEAD(&ent->pde_openers); + proc_set_user(ent, (*parent)->uid, (*parent)->gid); + out: return ent; } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index c1b7238..e69ebe6 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -68,7 +68,6 @@ static struct inode *proc_alloc_inode(struct super_block *sb) ei->sysctl_entry = NULL; ei->ns_ops = NULL; inode = &ei->vfs_inode; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; return inode; } @@ -421,7 +420,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) if (inode) { inode->i_ino = de->low_ino; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); PROC_I(inode)->pde = de; if (is_empty_pde(de)) { diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 7931c55..5378441 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -60,7 +60,7 @@ union proc_op { struct proc_inode { struct pid *pid; - int fd; + unsigned int fd; union proc_op op; struct proc_dir_entry *pde; struct ctl_table_header *sysctl; diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index b9a8c81..8a42849 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -23,6 +23,25 @@ void __attribute__((weak)) arch_report_meminfo(struct seq_file *m) { } +static void show_val_kb(struct seq_file *m, const char *s, unsigned long num) +{ + char v[32]; + static const char blanks[7] = {' ', ' ', ' ', ' ',' ', ' ', ' '}; + int len; + + len = num_to_str(v, sizeof(v), num << (PAGE_SHIFT - 10)); + + seq_write(m, s, 16); + + if (len > 0) { + if (len < 8) + seq_write(m, blanks, 8 - len); + + seq_write(m, v, len); + } + seq_write(m, " kB\n", 4); +} + static int meminfo_proc_show(struct seq_file *m, void *v) { struct sysinfo i; @@ -32,10 +51,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v) unsigned long pages[NR_LRU_LISTS]; int lru; -/* - * display in kilobytes. - */ -#define K(x) ((x) << (PAGE_SHIFT - 10)) si_meminfo(&i); si_swapinfo(&i); committed = percpu_counter_read_positive(&vm_committed_as); @@ -50,136 +65,100 @@ static int meminfo_proc_show(struct seq_file *m, void *v) available = si_mem_available(); - /* - * Tagged format, for easy grepping and expansion. - */ - seq_printf(m, - "MemTotal: %8lu kB\n" - "MemFree: %8lu kB\n" - "MemAvailable: %8lu kB\n" - "Buffers: %8lu kB\n" - "Cached: %8lu kB\n" - "SwapCached: %8lu kB\n" - "Active: %8lu kB\n" - "Inactive: %8lu kB\n" - "Active(anon): %8lu kB\n" - "Inactive(anon): %8lu kB\n" - "Active(file): %8lu kB\n" - "Inactive(file): %8lu kB\n" - "Unevictable: %8lu kB\n" - "Mlocked: %8lu kB\n" -#ifdef CONFIG_HIGHMEM - "HighTotal: %8lu kB\n" - "HighFree: %8lu kB\n" - "LowTotal: %8lu kB\n" - "LowFree: %8lu kB\n" -#endif -#ifndef CONFIG_MMU - "MmapCopy: %8lu kB\n" -#endif - "SwapTotal: %8lu kB\n" - "SwapFree: %8lu kB\n" - "Dirty: %8lu kB\n" - "Writeback: %8lu kB\n" - "AnonPages: %8lu kB\n" - "Mapped: %8lu kB\n" - "Shmem: %8lu kB\n" - "Slab: %8lu kB\n" - "SReclaimable: %8lu kB\n" - "SUnreclaim: %8lu kB\n" - "KernelStack: %8lu kB\n" - "PageTables: %8lu kB\n" -#ifdef CONFIG_QUICKLIST - "Quicklists: %8lu kB\n" -#endif - "NFS_Unstable: %8lu kB\n" - "Bounce: %8lu kB\n" - "WritebackTmp: %8lu kB\n" - "CommitLimit: %8lu kB\n" - "Committed_AS: %8lu kB\n" - "VmallocTotal: %8lu kB\n" - "VmallocUsed: %8lu kB\n" - "VmallocChunk: %8lu kB\n" -#ifdef CONFIG_MEMORY_FAILURE - "HardwareCorrupted: %5lu kB\n" -#endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - "AnonHugePages: %8lu kB\n" - "ShmemHugePages: %8lu kB\n" - "ShmemPmdMapped: %8lu kB\n" -#endif -#ifdef CONFIG_CMA - "CmaTotal: %8lu kB\n" - "CmaFree: %8lu kB\n" -#endif - , - K(i.totalram), - K(i.freeram), - K(available), - K(i.bufferram), - K(cached), - K(total_swapcache_pages()), - K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]), - K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]), - K(pages[LRU_ACTIVE_ANON]), - K(pages[LRU_INACTIVE_ANON]), - K(pages[LRU_ACTIVE_FILE]), - K(pages[LRU_INACTIVE_FILE]), - K(pages[LRU_UNEVICTABLE]), - K(global_page_state(NR_MLOCK)), + show_val_kb(m, "MemTotal: ", i.totalram); + show_val_kb(m, "MemFree: ", i.freeram); + show_val_kb(m, "MemAvailable: ", available); + show_val_kb(m, "Buffers: ", i.bufferram); + show_val_kb(m, "Cached: ", cached); + show_val_kb(m, "SwapCached: ", total_swapcache_pages()); + show_val_kb(m, "Active: ", pages[LRU_ACTIVE_ANON] + + pages[LRU_ACTIVE_FILE]); + show_val_kb(m, "Inactive: ", pages[LRU_INACTIVE_ANON] + + pages[LRU_INACTIVE_FILE]); + show_val_kb(m, "Active(anon): ", pages[LRU_ACTIVE_ANON]); + show_val_kb(m, "Inactive(anon): ", pages[LRU_INACTIVE_ANON]); + show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]); + show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]); + show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]); + show_val_kb(m, "Mlocked: ", global_page_state(NR_MLOCK)); + #ifdef CONFIG_HIGHMEM - K(i.totalhigh), - K(i.freehigh), - K(i.totalram-i.totalhigh), - K(i.freeram-i.freehigh), + show_val_kb(m, "HighTotal: ", i.totalhigh); + show_val_kb(m, "HighFree: ", i.freehigh); + show_val_kb(m, "LowTotal: ", i.totalram - i.totalhigh); + show_val_kb(m, "LowFree: ", i.freeram - i.freehigh); #endif + #ifndef CONFIG_MMU - K((unsigned long) atomic_long_read(&mmap_pages_allocated)), + show_val_kb(m, "MmapCopy: ", + (unsigned long)atomic_long_read(&mmap_pages_allocated)); #endif - K(i.totalswap), - K(i.freeswap), - K(global_node_page_state(NR_FILE_DIRTY)), - K(global_node_page_state(NR_WRITEBACK)), - K(global_node_page_state(NR_ANON_MAPPED)), - K(global_node_page_state(NR_FILE_MAPPED)), - K(i.sharedram), - K(global_page_state(NR_SLAB_RECLAIMABLE) + - global_page_state(NR_SLAB_UNRECLAIMABLE)), - K(global_page_state(NR_SLAB_RECLAIMABLE)), - K(global_page_state(NR_SLAB_UNRECLAIMABLE)), - global_page_state(NR_KERNEL_STACK_KB), - K(global_page_state(NR_PAGETABLE)), + + show_val_kb(m, "SwapTotal: ", i.totalswap); + show_val_kb(m, "SwapFree: ", i.freeswap); + show_val_kb(m, "Dirty: ", + global_node_page_state(NR_FILE_DIRTY)); + show_val_kb(m, "Writeback: ", + global_node_page_state(NR_WRITEBACK)); + show_val_kb(m, "AnonPages: ", + global_node_page_state(NR_ANON_MAPPED)); + show_val_kb(m, "Mapped: ", + global_node_page_state(NR_FILE_MAPPED)); + show_val_kb(m, "Shmem: ", i.sharedram); + show_val_kb(m, "Slab: ", + global_page_state(NR_SLAB_RECLAIMABLE) + + global_page_state(NR_SLAB_UNRECLAIMABLE)); + + show_val_kb(m, "SReclaimable: ", + global_page_state(NR_SLAB_RECLAIMABLE)); + show_val_kb(m, "SUnreclaim: ", + global_page_state(NR_SLAB_UNRECLAIMABLE)); + seq_printf(m, "KernelStack: %8lu kB\n", + global_page_state(NR_KERNEL_STACK_KB)); + show_val_kb(m, "PageTables: ", + global_page_state(NR_PAGETABLE)); #ifdef CONFIG_QUICKLIST - K(quicklist_total_size()), + show_val_kb(m, "Quicklists: ", quicklist_total_size()); #endif - K(global_node_page_state(NR_UNSTABLE_NFS)), - K(global_page_state(NR_BOUNCE)), - K(global_node_page_state(NR_WRITEBACK_TEMP)), - K(vm_commit_limit()), - K(committed), - (unsigned long)VMALLOC_TOTAL >> 10, - 0ul, // used to be vmalloc 'used' - 0ul // used to be vmalloc 'largest_chunk' + + show_val_kb(m, "NFS_Unstable: ", + global_node_page_state(NR_UNSTABLE_NFS)); + show_val_kb(m, "Bounce: ", + global_page_state(NR_BOUNCE)); + show_val_kb(m, "WritebackTmp: ", + global_node_page_state(NR_WRITEBACK_TEMP)); + show_val_kb(m, "CommitLimit: ", vm_commit_limit()); + show_val_kb(m, "Committed_AS: ", committed); + seq_printf(m, "VmallocTotal: %8lu kB\n", + (unsigned long)VMALLOC_TOTAL >> 10); + show_val_kb(m, "VmallocUsed: ", 0ul); + show_val_kb(m, "VmallocChunk: ", 0ul); + #ifdef CONFIG_MEMORY_FAILURE - , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10) + seq_printf(m, "HardwareCorrupted: %5lu kB\n", + atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)); #endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE - , K(global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR) - , K(global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR) - , K(global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR) + show_val_kb(m, "AnonHugePages: ", + global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR); + show_val_kb(m, "ShmemHugePages: ", + global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR); + show_val_kb(m, "ShmemPmdMapped: ", + global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR); #endif + #ifdef CONFIG_CMA - , K(totalcma_pages) - , K(global_page_state(NR_FREE_CMA_PAGES)) + show_val_kb(m, "CmaTotal: ", totalcma_pages); + show_val_kb(m, "CmaFree: ", + global_page_state(NR_FREE_CMA_PAGES)); #endif - ); hugetlb_report_meminfo(m); arch_report_meminfo(m); return 0; -#undef K } static int meminfo_proc_open(struct inode *inode, struct file *file) diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index c8bbc68..7ae6b1d 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -21,6 +21,7 @@ #include <linux/bitops.h> #include <linux/mount.h> #include <linux/nsproxy.h> +#include <linux/uidgid.h> #include <net/net_namespace.h> #include <linux/seq_file.h> @@ -185,6 +186,8 @@ const struct file_operations proc_net_operations = { static __net_init int proc_net_ns_init(struct net *net) { struct proc_dir_entry *netd, *net_statd; + kuid_t uid; + kgid_t gid; int err; err = -ENOMEM; @@ -199,6 +202,16 @@ static __net_init int proc_net_ns_init(struct net *net) netd->parent = &proc_root; memcpy(netd->name, "net", 4); + uid = make_kuid(net->user_ns, 0); + if (!uid_valid(uid)) + uid = netd->uid; + + gid = make_kgid(net->user_ns, 0); + if (!gid_valid(gid)) + gid = netd->gid; + + proc_set_user(netd, uid, gid); + err = -EEXIST; net_statd = proc_net_mkdir(net, "stat", netd); if (!net_statd) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 1b93650..55313d9 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -72,7 +72,7 @@ static DEFINE_SPINLOCK(sysctl_lock); static void drop_sysctl_table(struct ctl_table_header *header); static int sysctl_follow_link(struct ctl_table_header **phead, - struct ctl_table **pentry, struct nsproxy *namespaces); + struct ctl_table **pentry); static int insert_links(struct ctl_table_header *head); static void put_links(struct ctl_table_header *header); @@ -319,11 +319,11 @@ static void sysctl_head_finish(struct ctl_table_header *head) } static struct ctl_table_set * -lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces) +lookup_header_set(struct ctl_table_root *root) { struct ctl_table_set *set = &root->default_set; if (root->lookup) - set = root->lookup(root, namespaces); + set = root->lookup(root); return set; } @@ -430,6 +430,7 @@ static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, i static struct inode *proc_sys_make_inode(struct super_block *sb, struct ctl_table_header *head, struct ctl_table *table) { + struct ctl_table_root *root = head->root; struct inode *inode; struct proc_inode *ei; @@ -444,7 +445,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, ei->sysctl = head; ei->sysctl_entry = table; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_mode = table->mode; if (!S_ISDIR(table->mode)) { inode->i_mode |= S_IFREG; @@ -457,6 +458,10 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, if (is_empty_dir(head)) make_empty_dir_inode(inode); } + + if (root->set_ownership) + root->set_ownership(head, table, &inode->i_uid, &inode->i_gid); + out: return inode; } @@ -491,7 +496,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, goto out; if (S_ISLNK(p->mode)) { - ret = sysctl_follow_link(&h, &p, current->nsproxy); + ret = sysctl_follow_link(&h, &p); err = ERR_PTR(ret); if (ret) goto out; @@ -659,7 +664,7 @@ static bool proc_sys_link_fill_cache(struct file *file, if (S_ISLNK(table->mode)) { /* It is not an error if we can not follow the link ignore it */ - int err = sysctl_follow_link(&head, &table, current->nsproxy); + int err = sysctl_follow_link(&head, &table); if (err) goto out; } @@ -754,7 +759,7 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) return -EPERM; - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); if (error) return error; @@ -976,7 +981,7 @@ static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir) } static int sysctl_follow_link(struct ctl_table_header **phead, - struct ctl_table **pentry, struct nsproxy *namespaces) + struct ctl_table **pentry) { struct ctl_table_header *head; struct ctl_table_root *root; @@ -988,7 +993,7 @@ static int sysctl_follow_link(struct ctl_table_header **phead, ret = 0; spin_lock(&sysctl_lock); root = (*pentry)->data; - set = lookup_header_set(root, namespaces); + set = lookup_header_set(root); dir = xlate_dir(set, (*phead)->parent); if (IS_ERR(dir)) ret = PTR_ERR(dir); diff --git a/fs/proc/self.c b/fs/proc/self.c index b6a8d35..4024595 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -56,7 +56,7 @@ int proc_setup_self(struct super_block *s) struct inode *inode = new_inode_pseudo(s); if (inode) { inode->i_ino = self_inum; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 7907e45..d700c42 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -115,17 +115,16 @@ static int show_stat(struct seq_file *p, void *v) } sum += arch_irq_stat(); - seq_puts(p, "cpu "); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice)); + seq_put_decimal_ull(p, "cpu ", cputime64_to_clock_t(user)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(nice)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(system)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(idle)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(iowait)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(irq)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(softirq)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(steal)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest_nice)); seq_putc(p, '\n'); for_each_online_cpu(i) { @@ -141,23 +140,23 @@ static int show_stat(struct seq_file *p, void *v) guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; seq_printf(p, "cpu%d", i); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest)); - seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(user)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(nice)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(system)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(idle)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(iowait)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(irq)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(softirq)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(steal)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest)); + seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest_nice)); seq_putc(p, '\n'); } - seq_printf(p, "intr %llu", (unsigned long long)sum); + seq_put_decimal_ull(p, "intr ", (unsigned long long)sum); /* sum again ? it could be updated? */ for_each_irq_nr(j) - seq_put_decimal_ull(p, ' ', kstat_irqs_usr(j)); + seq_put_decimal_ull(p, " ", kstat_irqs_usr(j)); seq_printf(p, "\nctxt %llu\n" @@ -171,10 +170,10 @@ static int show_stat(struct seq_file *p, void *v) nr_running(), nr_iowait()); - seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq); + seq_put_decimal_ull(p, "softirq ", (unsigned long long)sum_softirq); for (i = 0; i < NR_SOFTIRQS; i++) - seq_put_decimal_ull(p, ' ', per_softirq_sums[i]); + seq_put_decimal_ull(p, " ", per_softirq_sums[i]); seq_putc(p, '\n'); return 0; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f6fa99e..35b92d8 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -147,7 +147,7 @@ m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma) static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma) { if (m->count < m->size) /* vma is copied successfully */ - m->version = m_next_vma(m->private, vma) ? vma->vm_start : -1UL; + m->version = m_next_vma(m->private, vma) ? vma->vm_end : -1UL; } static void *m_start(struct seq_file *m, loff_t *ppos) @@ -175,8 +175,10 @@ static void *m_start(struct seq_file *m, loff_t *ppos) priv->tail_vma = get_gate_vma(mm); if (last_addr) { - vma = find_vma(mm, last_addr); - if (vma && (vma = m_next_vma(priv, vma))) + vma = find_vma(mm, last_addr - 1); + if (vma && vma->vm_start <= last_addr) + vma = m_next_vma(priv, vma); + if (vma) return vma; } @@ -264,24 +266,15 @@ static int do_maps_open(struct inode *inode, struct file *file, * /proc/PID/maps that is the stack of the main task. */ static int is_stack(struct proc_maps_private *priv, - struct vm_area_struct *vma, int is_pid) + struct vm_area_struct *vma) { - int stack = 0; - - if (is_pid) { - stack = vma->vm_start <= vma->vm_mm->start_stack && - vma->vm_end >= vma->vm_mm->start_stack; - } else { - struct inode *inode = priv->inode; - struct task_struct *task; - - rcu_read_lock(); - task = pid_task(proc_pid(inode), PIDTYPE_PID); - if (task) - stack = vma_is_stack_for_task(vma, task); - rcu_read_unlock(); - } - return stack; + /* + * We make no effort to guess what a given thread considers to be + * its "stack". It's not even well-defined for programs written + * languages like Go. + */ + return vma->vm_start <= vma->vm_mm->start_stack && + vma->vm_end >= vma->vm_mm->start_stack; } static void @@ -352,7 +345,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) goto done; } - if (is_stack(priv, vma, is_pid)) + if (is_stack(priv, vma)) name = "[stack]"; } @@ -1070,7 +1063,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, } mmu_notifier_invalidate_range_start(mm, 0, -1); } - walk_page_range(0, ~0UL, &clear_refs_walk); + walk_page_range(0, mm->highest_vm_end, &clear_refs_walk); if (type == CLEAR_REFS_SOFT_DIRTY) mmu_notifier_invalidate_range_end(mm, 0, -1); flush_tlb_mm(mm); @@ -1667,7 +1660,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) seq_file_path(m, file, "\n\t= "); } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { seq_puts(m, " heap"); - } else if (is_stack(proc_priv, vma, is_pid)) { + } else if (is_stack(proc_priv, vma)) { seq_puts(m, " stack"); } diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index faacb0c..3717562 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -124,25 +124,17 @@ unsigned long task_statm(struct mm_struct *mm, } static int is_stack(struct proc_maps_private *priv, - struct vm_area_struct *vma, int is_pid) + struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; - int stack = 0; - - if (is_pid) { - stack = vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack; - } else { - struct inode *inode = priv->inode; - struct task_struct *task; - - rcu_read_lock(); - task = pid_task(proc_pid(inode), PIDTYPE_PID); - if (task) - stack = vma_is_stack_for_task(vma, task); - rcu_read_unlock(); - } - return stack; + + /* + * We make no effort to guess what a given thread considers to be + * its "stack". It's not even well-defined for programs written + * languages like Go. + */ + return vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack; } /* @@ -184,7 +176,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, if (file) { seq_pad(m, ' '); seq_file_path(m, file, ""); - } else if (mm && is_stack(priv, vma, is_pid)) { + } else if (mm && is_stack(priv, vma)) { seq_pad(m, ' '); seq_printf(m, "[stack]"); } diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index e58a31e..595b90a97 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -58,7 +58,7 @@ int proc_setup_thread_self(struct super_block *s) struct inode *inode = new_inode_pseudo(s); if (inode) { inode->i_ino = thread_self_inum; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index ec9ddef..1781dc5 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -230,7 +230,7 @@ static struct inode *pstore_get_inode(struct super_block *sb) struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); } return inode; } diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 16ecca5..14984d9 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -623,6 +623,40 @@ static int pstore_write_compat(enum pstore_type_id type, size, psi); } +static int pstore_write_buf_user_compat(enum pstore_type_id type, + enum kmsg_dump_reason reason, + u64 *id, unsigned int part, + const char __user *buf, + bool compressed, size_t size, + struct pstore_info *psi) +{ + unsigned long flags = 0; + size_t i, bufsize = size; + long ret = 0; + + if (unlikely(!access_ok(VERIFY_READ, buf, size))) + return -EFAULT; + if (bufsize > psinfo->bufsize) + bufsize = psinfo->bufsize; + spin_lock_irqsave(&psinfo->buf_lock, flags); + for (i = 0; i < size; ) { + size_t c = min(size - i, bufsize); + + ret = __copy_from_user(psinfo->buf, buf + i, c); + if (unlikely(ret != 0)) { + ret = -EFAULT; + break; + } + ret = psi->write_buf(type, reason, id, part, psinfo->buf, + compressed, c, psi); + if (unlikely(ret < 0)) + break; + i += c; + } + spin_unlock_irqrestore(&psinfo->buf_lock, flags); + return unlikely(ret < 0) ? ret : size; +} + /* * platform specific persistent storage driver registers with * us here. If pstore is already mounted, call the platform @@ -645,6 +679,8 @@ int pstore_register(struct pstore_info *psi) if (!psi->write) psi->write = pstore_write_compat; + if (!psi->write_buf_user) + psi->write_buf_user = pstore_write_buf_user_compat; psinfo = psi; mutex_init(&psinfo->read_mutex); spin_unlock(&pstore_lock); @@ -659,13 +695,14 @@ int pstore_register(struct pstore_info *psi) if (pstore_is_mounted()) pstore_get_records(0); - pstore_register_kmsg(); - - if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) { + if (psi->flags & PSTORE_FLAGS_DMESG) + pstore_register_kmsg(); + if (psi->flags & PSTORE_FLAGS_CONSOLE) pstore_register_console(); + if (psi->flags & PSTORE_FLAGS_FTRACE) pstore_register_ftrace(); + if (psi->flags & PSTORE_FLAGS_PMSG) pstore_register_pmsg(); - } if (pstore_update_ms >= 0) { pstore_timer.expires = jiffies + @@ -689,12 +726,14 @@ EXPORT_SYMBOL_GPL(pstore_register); void pstore_unregister(struct pstore_info *psi) { - if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) { + if (psi->flags & PSTORE_FLAGS_PMSG) pstore_unregister_pmsg(); + if (psi->flags & PSTORE_FLAGS_FTRACE) pstore_unregister_ftrace(); + if (psi->flags & PSTORE_FLAGS_CONSOLE) pstore_unregister_console(); - } - pstore_unregister_kmsg(); + if (psi->flags & PSTORE_FLAGS_DMESG) + pstore_unregister_kmsg(); free_buf_for_compression(); diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c index 7de20cd..78f6176 100644 --- a/fs/pstore/pmsg.c +++ b/fs/pstore/pmsg.c @@ -19,48 +19,25 @@ #include "internal.h" static DEFINE_MUTEX(pmsg_lock); -#define PMSG_MAX_BOUNCE_BUFFER_SIZE (2*PAGE_SIZE) static ssize_t write_pmsg(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - size_t i, buffer_size; - char *buffer; + u64 id; + int ret; if (!count) return 0; + /* check outside lock, page in any data. write_buf_user also checks */ if (!access_ok(VERIFY_READ, buf, count)) return -EFAULT; - buffer_size = count; - if (buffer_size > PMSG_MAX_BOUNCE_BUFFER_SIZE) - buffer_size = PMSG_MAX_BOUNCE_BUFFER_SIZE; - buffer = vmalloc(buffer_size); - if (!buffer) - return -ENOMEM; - mutex_lock(&pmsg_lock); - for (i = 0; i < count; ) { - size_t c = min(count - i, buffer_size); - u64 id; - long ret; - - ret = __copy_from_user(buffer, buf + i, c); - if (unlikely(ret != 0)) { - mutex_unlock(&pmsg_lock); - vfree(buffer); - return -EFAULT; - } - psinfo->write_buf(PSTORE_TYPE_PMSG, 0, &id, 0, buffer, 0, c, - psinfo); - - i += c; - } - + ret = psinfo->write_buf_user(PSTORE_TYPE_PMSG, 0, &id, 0, buf, 0, count, + psinfo); mutex_unlock(&pmsg_lock); - vfree(buffer); - return count; + return ret ? ret : count; } static const struct file_operations pmsg_fops = { diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 7a034d6..6ad831b 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -331,6 +331,24 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, return 0; } +static int notrace ramoops_pstore_write_buf_user(enum pstore_type_id type, + enum kmsg_dump_reason reason, + u64 *id, unsigned int part, + const char __user *buf, + bool compressed, size_t size, + struct pstore_info *psi) +{ + if (type == PSTORE_TYPE_PMSG) { + struct ramoops_context *cxt = psi->data; + + if (!cxt->mprz) + return -ENOMEM; + return persistent_ram_write_user(cxt->mprz, buf, size); + } + + return -EINVAL; +} + static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count, struct timespec time, struct pstore_info *psi) { @@ -369,6 +387,7 @@ static struct ramoops_context oops_cxt = { .open = ramoops_pstore_open, .read = ramoops_pstore_read, .write_buf = ramoops_pstore_write_buf, + .write_buf_user = ramoops_pstore_write_buf_user, .erase = ramoops_pstore_erase, }, }; @@ -377,13 +396,14 @@ static void ramoops_free_przs(struct ramoops_context *cxt) { int i; - cxt->max_dump_cnt = 0; if (!cxt->przs) return; - for (i = 0; !IS_ERR_OR_NULL(cxt->przs[i]); i++) + for (i = 0; i < cxt->max_dump_cnt; i++) persistent_ram_free(cxt->przs[i]); + kfree(cxt->przs); + cxt->max_dump_cnt = 0; } static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, @@ -408,7 +428,7 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, GFP_KERNEL); if (!cxt->przs) { dev_err(dev, "failed to initialize a prz array for dumps\n"); - goto fail_prz; + goto fail_mem; } for (i = 0; i < cxt->max_dump_cnt; i++) { @@ -419,6 +439,11 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, err = PTR_ERR(cxt->przs[i]); dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n", cxt->record_size, (unsigned long long)*paddr, err); + + while (i > 0) { + i--; + persistent_ram_free(cxt->przs[i]); + } goto fail_prz; } *paddr += cxt->record_size; @@ -426,7 +451,9 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, return 0; fail_prz: - ramoops_free_przs(cxt); + kfree(cxt->przs); +fail_mem: + cxt->max_dump_cnt = 0; return err; } @@ -608,12 +635,20 @@ static int ramoops_probe(struct platform_device *pdev) cxt->pstore.bufsize = 1024; /* LOG_LINE_MAX */ cxt->pstore.bufsize = max(cxt->record_size, cxt->pstore.bufsize); cxt->pstore.buf = kmalloc(cxt->pstore.bufsize, GFP_KERNEL); - spin_lock_init(&cxt->pstore.buf_lock); if (!cxt->pstore.buf) { pr_err("cannot allocate pstore buffer\n"); err = -ENOMEM; goto fail_clear; } + spin_lock_init(&cxt->pstore.buf_lock); + + cxt->pstore.flags = PSTORE_FLAGS_DMESG; + if (cxt->console_size) + cxt->pstore.flags |= PSTORE_FLAGS_CONSOLE; + if (cxt->ftrace_size) + cxt->pstore.flags |= PSTORE_FLAGS_FTRACE; + if (cxt->pmsg_size) + cxt->pstore.flags |= PSTORE_FLAGS_PMSG; err = pstore_register(&cxt->pstore); if (err) { @@ -659,7 +694,6 @@ static int ramoops_remove(struct platform_device *pdev) struct ramoops_context *cxt = &oops_cxt; pstore_unregister(&cxt->pstore); - cxt->max_dump_cnt = 0; kfree(cxt->pstore.buf); cxt->pstore.bufsize = 0; diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 76c3f80..3975dee 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -17,15 +17,16 @@ #include <linux/device.h> #include <linux/err.h> #include <linux/errno.h> -#include <linux/kernel.h> #include <linux/init.h> #include <linux/io.h> +#include <linux/kernel.h> #include <linux/list.h> #include <linux/memblock.h> +#include <linux/pstore_ram.h> #include <linux/rslib.h> #include <linux/slab.h> +#include <linux/uaccess.h> #include <linux/vmalloc.h> -#include <linux/pstore_ram.h> #include <asm/page.h> struct persistent_ram_buffer { @@ -47,43 +48,10 @@ static inline size_t buffer_start(struct persistent_ram_zone *prz) return atomic_read(&prz->buffer->start); } -/* increase and wrap the start pointer, returning the old value */ -static size_t buffer_start_add_atomic(struct persistent_ram_zone *prz, size_t a) -{ - int old; - int new; - - do { - old = atomic_read(&prz->buffer->start); - new = old + a; - while (unlikely(new >= prz->buffer_size)) - new -= prz->buffer_size; - } while (atomic_cmpxchg(&prz->buffer->start, old, new) != old); - - return old; -} - -/* increase the size counter until it hits the max size */ -static void buffer_size_add_atomic(struct persistent_ram_zone *prz, size_t a) -{ - size_t old; - size_t new; - - if (atomic_read(&prz->buffer->size) == prz->buffer_size) - return; - - do { - old = atomic_read(&prz->buffer->size); - new = old + a; - if (new > prz->buffer_size) - new = prz->buffer_size; - } while (atomic_cmpxchg(&prz->buffer->size, old, new) != old); -} - static DEFINE_RAW_SPINLOCK(buffer_lock); /* increase and wrap the start pointer, returning the old value */ -static size_t buffer_start_add_locked(struct persistent_ram_zone *prz, size_t a) +static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a) { int old; int new; @@ -103,7 +71,7 @@ static size_t buffer_start_add_locked(struct persistent_ram_zone *prz, size_t a) } /* increase the size counter until it hits the max size */ -static void buffer_size_add_locked(struct persistent_ram_zone *prz, size_t a) +static void buffer_size_add(struct persistent_ram_zone *prz, size_t a) { size_t old; size_t new; @@ -124,9 +92,6 @@ exit: raw_spin_unlock_irqrestore(&buffer_lock, flags); } -static size_t (*buffer_start_add)(struct persistent_ram_zone *, size_t) = buffer_start_add_atomic; -static void (*buffer_size_add)(struct persistent_ram_zone *, size_t) = buffer_size_add_atomic; - static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz, uint8_t *data, size_t len, uint8_t *ecc) { @@ -299,10 +264,20 @@ static void notrace persistent_ram_update(struct persistent_ram_zone *prz, const void *s, unsigned int start, unsigned int count) { struct persistent_ram_buffer *buffer = prz->buffer; - memcpy(buffer->data + start, s, count); + memcpy_toio(buffer->data + start, s, count); persistent_ram_update_ecc(prz, start, count); } +static int notrace persistent_ram_update_user(struct persistent_ram_zone *prz, + const void __user *s, unsigned int start, unsigned int count) +{ + struct persistent_ram_buffer *buffer = prz->buffer; + int ret = unlikely(__copy_from_user(buffer->data + start, s, count)) ? + -EFAULT : 0; + persistent_ram_update_ecc(prz, start, count); + return ret; +} + void persistent_ram_save_old(struct persistent_ram_zone *prz) { struct persistent_ram_buffer *buffer = prz->buffer; @@ -322,8 +297,8 @@ void persistent_ram_save_old(struct persistent_ram_zone *prz) } prz->old_log_size = size; - memcpy(prz->old_log, &buffer->data[start], size - start); - memcpy(prz->old_log + size - start, &buffer->data[0], start); + memcpy_fromio(prz->old_log, &buffer->data[start], size - start); + memcpy_fromio(prz->old_log + size - start, &buffer->data[0], start); } int notrace persistent_ram_write(struct persistent_ram_zone *prz, @@ -356,6 +331,38 @@ int notrace persistent_ram_write(struct persistent_ram_zone *prz, return count; } +int notrace persistent_ram_write_user(struct persistent_ram_zone *prz, + const void __user *s, unsigned int count) +{ + int rem, ret = 0, c = count; + size_t start; + + if (unlikely(!access_ok(VERIFY_READ, s, count))) + return -EFAULT; + if (unlikely(c > prz->buffer_size)) { + s += c - prz->buffer_size; + c = prz->buffer_size; + } + + buffer_size_add(prz, c); + + start = buffer_start_add(prz, c); + + rem = prz->buffer_size - start; + if (unlikely(rem < c)) { + ret = persistent_ram_update_user(prz, s, start, rem); + s += rem; + c -= rem; + start = 0; + } + if (likely(!ret)) + ret = persistent_ram_update_user(prz, s, start, c); + + persistent_ram_update_header_ecc(prz); + + return unlikely(ret) ? ret : count; +} + size_t persistent_ram_old_size(struct persistent_ram_zone *prz) { return prz->old_log_size; @@ -426,9 +433,6 @@ static void *persistent_ram_iomap(phys_addr_t start, size_t size, return NULL; } - buffer_start_add = buffer_start_add_locked; - buffer_size_add = buffer_size_add_locked; - if (memtype) va = ioremap(start, size); else diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 35df08e..2d44542 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -341,6 +341,7 @@ static int quota_getstate(struct super_block *sb, struct fs_quota_stat *fqs) struct qc_state state; int ret; + memset(&state, 0, sizeof (struct qc_state)); ret = sb->s_qcop->get_state(sb, &state); if (ret < 0) return ret; @@ -365,17 +366,19 @@ static int quota_getstate(struct super_block *sb, struct fs_quota_stat *fqs) fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit; fqs->qs_bwarnlimit = state.s_state[type].spc_warnlimit; fqs->qs_iwarnlimit = state.s_state[type].ino_warnlimit; - if (state.s_state[USRQUOTA].flags & QCI_ACCT_ENABLED) { + + /* Inodes may be allocated even if inactive; copy out if present */ + if (state.s_state[USRQUOTA].ino) { fqs->qs_uquota.qfs_ino = state.s_state[USRQUOTA].ino; fqs->qs_uquota.qfs_nblks = state.s_state[USRQUOTA].blocks; fqs->qs_uquota.qfs_nextents = state.s_state[USRQUOTA].nextents; } - if (state.s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED) { + if (state.s_state[GRPQUOTA].ino) { fqs->qs_gquota.qfs_ino = state.s_state[GRPQUOTA].ino; fqs->qs_gquota.qfs_nblks = state.s_state[GRPQUOTA].blocks; fqs->qs_gquota.qfs_nextents = state.s_state[GRPQUOTA].nextents; } - if (state.s_state[PRJQUOTA].flags & QCI_ACCT_ENABLED) { + if (state.s_state[PRJQUOTA].ino) { /* * Q_XGETQSTAT doesn't have room for both group and project * quotas. So, allow the project quota values to be copied out @@ -411,6 +414,7 @@ static int quota_getstatev(struct super_block *sb, struct fs_quota_statv *fqs) struct qc_state state; int ret; + memset(&state, 0, sizeof (struct qc_state)); ret = sb->s_qcop->get_state(sb, &state); if (ret < 0) return ret; @@ -435,17 +439,19 @@ static int quota_getstatev(struct super_block *sb, struct fs_quota_statv *fqs) fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit; fqs->qs_bwarnlimit = state.s_state[type].spc_warnlimit; fqs->qs_iwarnlimit = state.s_state[type].ino_warnlimit; - if (state.s_state[USRQUOTA].flags & QCI_ACCT_ENABLED) { + + /* Inodes may be allocated even if inactive; copy out if present */ + if (state.s_state[USRQUOTA].ino) { fqs->qs_uquota.qfs_ino = state.s_state[USRQUOTA].ino; fqs->qs_uquota.qfs_nblks = state.s_state[USRQUOTA].blocks; fqs->qs_uquota.qfs_nextents = state.s_state[USRQUOTA].nextents; } - if (state.s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED) { + if (state.s_state[GRPQUOTA].ino) { fqs->qs_gquota.qfs_ino = state.s_state[GRPQUOTA].ino; fqs->qs_gquota.qfs_nblks = state.s_state[GRPQUOTA].blocks; fqs->qs_gquota.qfs_nextents = state.s_state[GRPQUOTA].nextents; } - if (state.s_state[PRJQUOTA].flags & QCI_ACCT_ENABLED) { + if (state.s_state[PRJQUOTA].ino) { fqs->qs_pquota.qfs_ino = state.s_state[PRJQUOTA].ino; fqs->qs_pquota.qfs_nblks = state.s_state[PRJQUOTA].blocks; fqs->qs_pquota.qfs_nextents = state.s_state[PRJQUOTA].nextents; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index be3ddd1..2bcbf4e 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -169,7 +169,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia) int ret = 0; /* POSIX UID/GID verification for setting inode attributes */ - ret = inode_change_ok(inode, ia); + ret = setattr_prepare(dentry, ia); if (ret) return ret; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 1ab6e6c..8621c03 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -61,7 +61,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, inode->i_mapping->a_ops = &ramfs_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); @@ -100,7 +100,7 @@ ramfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ error = 0; - dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->i_mtime = dir->i_ctime = current_time(dir); } return error; } @@ -130,7 +130,7 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char * if (!error) { d_instantiate(dentry, inode); dget(dentry); - dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->i_mtime = dir->i_ctime = current_time(dir); } else iput(inode); } diff --git a/fs/read_write.c b/fs/read_write.c index 66215a7..190e0d36 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -730,6 +730,35 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, /* A write operation does a read from user space and vice versa */ #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) +/** + * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace + * into the kernel and check that it is valid. + * + * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE. + * @uvector: Pointer to the userspace array. + * @nr_segs: Number of elements in userspace array. + * @fast_segs: Number of elements in @fast_pointer. + * @fast_pointer: Pointer to (usually small on-stack) kernel array. + * @ret_pointer: (output parameter) Pointer to a variable that will point to + * either @fast_pointer, a newly allocated kernel array, or NULL, + * depending on which array was used. + * + * This function copies an array of &struct iovec of @nr_segs from + * userspace into the kernel and checks that each element is valid (e.g. + * it does not point to a kernel address or cause overflow by being too + * large, etc.). + * + * As an optimization, the caller may provide a pointer to a small + * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long + * (the size of this array, or 0 if unused, should be given in @fast_segs). + * + * @ret_pointer will always point to the array that was used, so the + * caller must take care not to call kfree() on it e.g. in case the + * @fast_pointer array was used and it was allocated on the stack. + * + * Return: The total number of bytes covered by the iovec array on success + * or a negative error code on error. + */ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_pointer, diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 90f815b..2f8c5c9 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -260,10 +260,7 @@ const struct file_operations reiserfs_file_operations = { const struct inode_operations reiserfs_file_inode_operations = { .setattr = reiserfs_setattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = reiserfs_listxattr, - .removexattr = generic_removexattr, .permission = reiserfs_permission, .get_acl = reiserfs_get_acl, .set_acl = reiserfs_set_acl, diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index c2c59f9..58b2ded 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2005,7 +2005,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, if (S_ISLNK(inode->i_mode)) inode->i_flags &= ~(S_IMMUTABLE | S_APPEND); - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_size = i_size; inode->i_blocks = 0; inode->i_bytes = 0; @@ -3312,7 +3312,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) unsigned int ia_valid; int error; - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); if (error) return error; diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 2f1ddc9..1f4692a 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -94,7 +94,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } sd_attrs_to_i_attrs(flags, inode); REISERFS_I(inode)->i_attrs = flags; - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); mark_inode_dirty(inode); setflags_out: mnt_drop_write_file(filp); @@ -115,7 +115,7 @@ setflags_out: err = -EFAULT; goto setversion_out; } - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); mark_inode_dirty(inode); setversion_out: mnt_drop_write_file(filp); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 8a36696..e6a2b40 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -570,7 +570,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, } dir->i_size += paste_size; - dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + dir->i_mtime = dir->i_ctime = current_time(dir); if (!S_ISDIR(inode->i_mode) && visible) /* reiserfs_mkdir or reiserfs_rename will do that by itself */ reiserfs_update_sd(th, dir); @@ -963,7 +963,7 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) inode->i_nlink); clear_nlink(inode); - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(dir); reiserfs_update_sd(&th, inode); DEC_DIR_INODE_NLINK(dir) @@ -1067,11 +1067,11 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) inc_nlink(inode); goto end_unlink; } - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); reiserfs_update_sd(&th, inode); dir->i_size -= (de.de_entrylen + DEH_SIZE); - dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + dir->i_ctime = dir->i_mtime = current_time(dir); reiserfs_update_sd(&th, dir); if (!savelink) @@ -1246,7 +1246,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, return err ? err : retval; } - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); reiserfs_update_sd(&th, inode); ihold(inode); @@ -1306,7 +1306,8 @@ static void set_ino_in_dir_entry(struct reiserfs_dir_entry *de, * get_empty_nodes or its clones */ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { int retval; INITIALIZE_PATH(old_entry_path); @@ -1321,6 +1322,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, unsigned long savelink = 1; struct timespec ctime; + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + /* * three balancings: (1) old name removal, (2) new name insertion * and (3) maybe "save" link insertion @@ -1567,7 +1571,7 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, mark_de_hidden(old_de.de_deh + old_de.de_entry_num); journal_mark_dirty(&th, old_de.de_bh); - ctime = CURRENT_TIME_SEC; + ctime = current_time(old_dir); old_dir->i_ctime = old_dir->i_mtime = ctime; new_dir->i_ctime = new_dir->i_mtime = ctime; /* @@ -1650,10 +1654,7 @@ const struct inode_operations reiserfs_dir_inode_operations = { .mknod = reiserfs_mknod, .rename = reiserfs_rename, .setattr = reiserfs_setattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = reiserfs_listxattr, - .removexattr = generic_removexattr, .permission = reiserfs_permission, .get_acl = reiserfs_get_acl, .set_acl = reiserfs_set_acl, @@ -1667,10 +1668,7 @@ const struct inode_operations reiserfs_symlink_inode_operations = { .readlink = generic_readlink, .get_link = page_get_link, .setattr = reiserfs_setattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = reiserfs_listxattr, - .removexattr = generic_removexattr, .permission = reiserfs_permission, }; @@ -1679,10 +1677,7 @@ const struct inode_operations reiserfs_symlink_inode_operations = { */ const struct inode_operations reiserfs_special_inode_operations = { .setattr = reiserfs_setattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = reiserfs_listxattr, - .removexattr = generic_removexattr, .permission = reiserfs_permission, .get_acl = reiserfs_get_acl, .set_acl = reiserfs_set_acl, diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 4032d1e..a97e352 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -1987,8 +1987,8 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, pathrelse(&s_search_path); if (update_timestamps) { - inode->i_mtime = CURRENT_TIME_SEC; - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_mtime = current_time(inode); + inode->i_ctime = current_time(inode); } reiserfs_update_sd(th, inode); @@ -2012,8 +2012,8 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, update_and_out: if (update_timestamps) { /* this is truncate, not file closing */ - inode->i_mtime = CURRENT_TIME_SEC; - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_mtime = current_time(inode); + inode->i_ctime = current_time(inode); } reiserfs_update_sd(th, inode); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 7a4a85a..0a6ad4e 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -190,7 +190,15 @@ static int remove_save_link_only(struct super_block *s, static int reiserfs_quota_on_mount(struct super_block *, int); #endif -/* look for uncompleted unlinks and truncates and complete them */ +/* + * Look for uncompleted unlinks and truncates and complete them + * + * Called with superblock write locked. If quotas are enabled, we have to + * release/retake lest we call dquot_quota_on_mount(), proceed to + * schedule_on_each_cpu() in invalidate_bdev() and deadlock waiting for the per + * cpu worklets to complete flush_async_commits() that in turn wait for the + * superblock write lock. + */ static int finish_unfinished(struct super_block *s) { INITIALIZE_PATH(path); @@ -237,7 +245,9 @@ static int finish_unfinished(struct super_block *s) quota_enabled[i] = 0; continue; } + reiserfs_write_unlock(s); ret = reiserfs_quota_on_mount(s, i); + reiserfs_write_lock(s); if (ret < 0) reiserfs_warning(s, "reiserfs-2500", "cannot turn on journaled " @@ -2512,7 +2522,7 @@ out: if (inode->i_size < off + len - towrite) i_size_write(inode, off + len - towrite); inode->i_version++; - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); mark_inode_dirty(inode); return len - towrite; } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index a33812a..e87aa21 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -450,13 +450,13 @@ int reiserfs_commit_write(struct file *f, struct page *page, static void update_ctime(struct inode *inode) { - struct timespec now = current_fs_time(inode->i_sb); + struct timespec now = current_time(inode); if (inode_unhashed(inode) || !inode->i_nlink || timespec_equal(&inode->i_ctime, &now)) return; - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); mark_inode_dirty(inode); } @@ -575,7 +575,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, new_size = buffer_size + sizeof(struct reiserfs_xattr_header); if (!err && new_size < i_size_read(d_inode(dentry))) { struct iattr newattrs = { - .ia_ctime = current_fs_time(inode->i_sb), + .ia_ctime = current_time(inode), .ia_size = new_size, .ia_valid = ATTR_SIZE | ATTR_CTIME, }; diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index dbed42f..3d2256a 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -242,13 +242,9 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; - else { - if (error == 0) - acl = NULL; - } } break; case ACL_TYPE_DEFAULT: @@ -277,7 +273,7 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, if (error == -ENODATA) { error = 0; if (type == ACL_TYPE_ACCESS) { - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); mark_inode_dirty(inode); } } diff --git a/fs/select.c b/fs/select.c index 8ed9da5..3d4f85d 100644 --- a/fs/select.c +++ b/fs/select.c @@ -29,6 +29,7 @@ #include <linux/sched/rt.h> #include <linux/freezer.h> #include <net/busy_poll.h> +#include <linux/vmalloc.h> #include <asm/uaccess.h> @@ -554,7 +555,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set_bits fds; void *bits; int ret, max_fds; - unsigned int size; + size_t size, alloc_size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; @@ -581,7 +582,14 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, if (size > sizeof(stack_fds) / 6) { /* Not enough space in on-stack array; must use kmalloc */ ret = -ENOMEM; - bits = kmalloc(6 * size, GFP_KERNEL); + if (size > (SIZE_MAX / 6)) + goto out_nofds; + + alloc_size = 6 * size; + bits = kmalloc(alloc_size, GFP_KERNEL|__GFP_NOWARN); + if (!bits && alloc_size > PAGE_SIZE) + bits = vmalloc(alloc_size); + if (!bits) goto out_nofds; } @@ -618,7 +626,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, out: if (bits != stack_fds) - kfree(bits); + kvfree(bits); out_nofds: return ret; } diff --git a/fs/seq_file.c b/fs/seq_file.c index 6dc4296..368bfb9 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -679,11 +679,11 @@ EXPORT_SYMBOL(seq_puts); /* * A helper routine for putting decimal numbers without rich format of printf(). * only 'unsigned long long' is supported. - * This routine will put one byte delimiter + number into seq_file. + * This routine will put strlen(delimiter) + number into seq_file. * This routine is very quick when you show lots of numbers. * In usual cases, it will be better to use seq_printf(). It's easier to read. */ -void seq_put_decimal_ull(struct seq_file *m, char delimiter, +void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, unsigned long long num) { int len; @@ -691,8 +691,15 @@ void seq_put_decimal_ull(struct seq_file *m, char delimiter, if (m->count + 2 >= m->size) /* we'll write 2 bytes at least */ goto overflow; - if (delimiter) - m->buf[m->count++] = delimiter; + len = strlen(delimiter); + if (m->count + len >= m->size) + goto overflow; + + memcpy(m->buf + m->count, delimiter, len); + m->count += len; + + if (m->count + 1 >= m->size) + goto overflow; if (num < 10) { m->buf[m->count++] = num + '0'; @@ -702,6 +709,7 @@ void seq_put_decimal_ull(struct seq_file *m, char delimiter, len = num_to_str(m->buf + m->count, m->size - m->count, num); if (!len) goto overflow; + m->count += len; return; @@ -710,19 +718,42 @@ overflow: } EXPORT_SYMBOL(seq_put_decimal_ull); -void seq_put_decimal_ll(struct seq_file *m, char delimiter, long long num) +void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num) { + int len; + + if (m->count + 3 >= m->size) /* we'll write 2 bytes at least */ + goto overflow; + + len = strlen(delimiter); + if (m->count + len >= m->size) + goto overflow; + + memcpy(m->buf + m->count, delimiter, len); + m->count += len; + + if (m->count + 2 >= m->size) + goto overflow; + if (num < 0) { - if (m->count + 3 >= m->size) { - seq_set_overflow(m); - return; - } - if (delimiter) - m->buf[m->count++] = delimiter; + m->buf[m->count++] = '-'; num = -num; - delimiter = '-'; } - seq_put_decimal_ull(m, delimiter, num); + + if (num < 10) { + m->buf[m->count++] = num + '0'; + return; + } + + len = num_to_str(m->buf + m->count, m->size - m->count, num); + if (!len) + goto overflow; + + m->count += len; + return; + +overflow: + seq_set_overflow(m); } EXPORT_SYMBOL(seq_put_decimal_ll); diff --git a/fs/splice.c b/fs/splice.c index dd9bf7e..153d4f3 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -183,82 +183,39 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) { unsigned int spd_pages = spd->nr_pages; - int ret, do_wakeup, page_nr; + int ret = 0, page_nr = 0; if (!spd_pages) return 0; - ret = 0; - do_wakeup = 0; - page_nr = 0; - - pipe_lock(pipe); - - for (;;) { - if (!pipe->readers) { - send_sig(SIGPIPE, current, 0); - if (!ret) - ret = -EPIPE; - break; - } - - if (pipe->nrbufs < pipe->buffers) { - int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); - struct pipe_buffer *buf = pipe->bufs + newbuf; - - buf->page = spd->pages[page_nr]; - buf->offset = spd->partial[page_nr].offset; - buf->len = spd->partial[page_nr].len; - buf->private = spd->partial[page_nr].private; - buf->ops = spd->ops; - if (spd->flags & SPLICE_F_GIFT) - buf->flags |= PIPE_BUF_FLAG_GIFT; - - pipe->nrbufs++; - page_nr++; - ret += buf->len; - - if (pipe->files) - do_wakeup = 1; + if (unlikely(!pipe->readers)) { + send_sig(SIGPIPE, current, 0); + ret = -EPIPE; + goto out; + } - if (!--spd->nr_pages) - break; - if (pipe->nrbufs < pipe->buffers) - continue; + while (pipe->nrbufs < pipe->buffers) { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); + struct pipe_buffer *buf = pipe->bufs + newbuf; - break; - } + buf->page = spd->pages[page_nr]; + buf->offset = spd->partial[page_nr].offset; + buf->len = spd->partial[page_nr].len; + buf->private = spd->partial[page_nr].private; + buf->ops = spd->ops; - if (spd->flags & SPLICE_F_NONBLOCK) { - if (!ret) - ret = -EAGAIN; - break; - } + pipe->nrbufs++; + page_nr++; + ret += buf->len; - if (signal_pending(current)) { - if (!ret) - ret = -ERESTARTSYS; + if (!--spd->nr_pages) break; - } - - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&pipe->wait)) - wake_up_interruptible_sync(&pipe->wait); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); - do_wakeup = 0; - } - - pipe->waiting_writers++; - pipe_wait(pipe); - pipe->waiting_writers--; } - pipe_unlock(pipe); - - if (do_wakeup) - wakeup_pipe_readers(pipe); + if (!ret) + ret = -EAGAIN; +out: while (page_nr < spd_pages) spd->spd_release(spd, page_nr++); @@ -266,6 +223,26 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, } EXPORT_SYMBOL_GPL(splice_to_pipe); +ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) +{ + int ret; + + if (unlikely(!pipe->readers)) { + send_sig(SIGPIPE, current, 0); + ret = -EPIPE; + } else if (pipe->nrbufs == pipe->buffers) { + ret = -EAGAIN; + } else { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); + pipe->bufs[newbuf] = *buf; + pipe->nrbufs++; + return buf->len; + } + pipe_buf_release(pipe, buf); + return ret; +} +EXPORT_SYMBOL(add_to_pipe); + void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) { put_page(spd->pages[i]); @@ -303,207 +280,6 @@ void splice_shrink_spd(struct splice_pipe_desc *spd) kfree(spd->partial); } -static int -__generic_file_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) -{ - struct address_space *mapping = in->f_mapping; - unsigned int loff, nr_pages, req_pages; - struct page *pages[PIPE_DEF_BUFFERS]; - struct partial_page partial[PIPE_DEF_BUFFERS]; - struct page *page; - pgoff_t index, end_index; - loff_t isize; - int error, page_nr; - struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, - .nr_pages_max = PIPE_DEF_BUFFERS, - .flags = flags, - .ops = &page_cache_pipe_buf_ops, - .spd_release = spd_release_page, - }; - - if (splice_grow_spd(pipe, &spd)) - return -ENOMEM; - - index = *ppos >> PAGE_SHIFT; - loff = *ppos & ~PAGE_MASK; - req_pages = (len + loff + PAGE_SIZE - 1) >> PAGE_SHIFT; - nr_pages = min(req_pages, spd.nr_pages_max); - - /* - * Lookup the (hopefully) full range of pages we need. - */ - spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages); - index += spd.nr_pages; - - /* - * If find_get_pages_contig() returned fewer pages than we needed, - * readahead/allocate the rest and fill in the holes. - */ - if (spd.nr_pages < nr_pages) - page_cache_sync_readahead(mapping, &in->f_ra, in, - index, req_pages - spd.nr_pages); - - error = 0; - while (spd.nr_pages < nr_pages) { - /* - * Page could be there, find_get_pages_contig() breaks on - * the first hole. - */ - page = find_get_page(mapping, index); - if (!page) { - /* - * page didn't exist, allocate one. - */ - page = page_cache_alloc_cold(mapping); - if (!page) - break; - - error = add_to_page_cache_lru(page, mapping, index, - mapping_gfp_constraint(mapping, GFP_KERNEL)); - if (unlikely(error)) { - put_page(page); - if (error == -EEXIST) - continue; - break; - } - /* - * add_to_page_cache() locks the page, unlock it - * to avoid convoluting the logic below even more. - */ - unlock_page(page); - } - - spd.pages[spd.nr_pages++] = page; - index++; - } - - /* - * Now loop over the map and see if we need to start IO on any - * pages, fill in the partial map, etc. - */ - index = *ppos >> PAGE_SHIFT; - nr_pages = spd.nr_pages; - spd.nr_pages = 0; - for (page_nr = 0; page_nr < nr_pages; page_nr++) { - unsigned int this_len; - - if (!len) - break; - - /* - * this_len is the max we'll use from this page - */ - this_len = min_t(unsigned long, len, PAGE_SIZE - loff); - page = spd.pages[page_nr]; - - if (PageReadahead(page)) - page_cache_async_readahead(mapping, &in->f_ra, in, - page, index, req_pages - page_nr); - - /* - * If the page isn't uptodate, we may need to start io on it - */ - if (!PageUptodate(page)) { - lock_page(page); - - /* - * Page was truncated, or invalidated by the - * filesystem. Redo the find/create, but this time the - * page is kept locked, so there's no chance of another - * race with truncate/invalidate. - */ - if (!page->mapping) { - unlock_page(page); -retry_lookup: - page = find_or_create_page(mapping, index, - mapping_gfp_mask(mapping)); - - if (!page) { - error = -ENOMEM; - break; - } - put_page(spd.pages[page_nr]); - spd.pages[page_nr] = page; - } - /* - * page was already under io and is now done, great - */ - if (PageUptodate(page)) { - unlock_page(page); - goto fill_it; - } - - /* - * need to read in the page - */ - error = mapping->a_ops->readpage(in, page); - if (unlikely(error)) { - /* - * Re-lookup the page - */ - if (error == AOP_TRUNCATED_PAGE) - goto retry_lookup; - - break; - } - } -fill_it: - /* - * i_size must be checked after PageUptodate. - */ - isize = i_size_read(mapping->host); - end_index = (isize - 1) >> PAGE_SHIFT; - if (unlikely(!isize || index > end_index)) - break; - - /* - * if this is the last page, see if we need to shrink - * the length and stop - */ - if (end_index == index) { - unsigned int plen; - - /* - * max good bytes in this page - */ - plen = ((isize - 1) & ~PAGE_MASK) + 1; - if (plen <= loff) - break; - - /* - * force quit after adding this page - */ - this_len = min(this_len, plen - loff); - len = this_len; - } - - spd.partial[page_nr].offset = loff; - spd.partial[page_nr].len = this_len; - len -= this_len; - loff = 0; - spd.nr_pages++; - index++; - } - - /* - * Release any pages at the end, if we quit early. 'page_nr' is how far - * we got, 'nr_pages' is how many pages are in the map. - */ - while (page_nr < nr_pages) - put_page(spd.pages[page_nr++]); - in->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT; - - if (spd.nr_pages) - error = splice_to_pipe(pipe, &spd); - - splice_shrink_spd(&spd); - return error; -} - /** * generic_file_splice_read - splice data from file to a pipe * @in: file to splice from @@ -514,39 +290,47 @@ fill_it: * * Description: * Will read pages from given file and fill them into a pipe. Can be - * used as long as the address_space operations for the source implements - * a readpage() hook. + * used as long as it has more or less sane ->read_iter(). * */ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - loff_t isize, left; - int ret; - - if (IS_DAX(in->f_mapping->host)) - return default_file_splice_read(in, ppos, pipe, len, flags); + struct iov_iter to; + struct kiocb kiocb; + loff_t isize; + int idx, ret; isize = i_size_read(in->f_mapping->host); if (unlikely(*ppos >= isize)) return 0; - left = isize - *ppos; - if (unlikely(left < len)) - len = left; - - ret = __generic_file_splice_read(in, ppos, pipe, len, flags); + iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len); + idx = to.idx; + init_sync_kiocb(&kiocb, in); + kiocb.ki_pos = *ppos; + ret = in->f_op->read_iter(&kiocb, &to); if (ret > 0) { - *ppos += ret; + *ppos = kiocb.ki_pos; file_accessed(in); + } else if (ret < 0) { + to.idx = idx; + to.iov_offset = 0; + iov_iter_advance(&to, 0); /* to free what was emitted */ + /* + * callers of ->splice_read() expect -EAGAIN on + * "can't put anything in there", rather than -EFAULT. + */ + if (ret == -EFAULT) + ret = -EAGAIN; } return ret; } EXPORT_SYMBOL(generic_file_splice_read); -static const struct pipe_buf_operations default_pipe_buf_ops = { +const struct pipe_buf_operations default_pipe_buf_ops = { .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, @@ -570,7 +354,7 @@ const struct pipe_buf_operations nosteal_pipe_buf_ops = { }; EXPORT_SYMBOL(nosteal_pipe_buf_ops); -static ssize_t kernel_readv(struct file *file, const struct iovec *vec, +static ssize_t kernel_readv(struct file *file, const struct kvec *vec, unsigned long vlen, loff_t offset) { mm_segment_t old_fs; @@ -602,102 +386,70 @@ ssize_t kernel_write(struct file *file, const char *buf, size_t count, } EXPORT_SYMBOL(kernel_write); -ssize_t default_file_splice_read(struct file *in, loff_t *ppos, +static ssize_t default_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { + struct kvec *vec, __vec[PIPE_DEF_BUFFERS]; + struct iov_iter to; + struct page **pages; unsigned int nr_pages; - unsigned int nr_freed; - size_t offset; - struct page *pages[PIPE_DEF_BUFFERS]; - struct partial_page partial[PIPE_DEF_BUFFERS]; - struct iovec *vec, __vec[PIPE_DEF_BUFFERS]; + size_t offset, dummy, copied = 0; ssize_t res; - size_t this_len; - int error; int i; - struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, - .nr_pages_max = PIPE_DEF_BUFFERS, - .flags = flags, - .ops = &default_pipe_buf_ops, - .spd_release = spd_release_page, - }; - if (splice_grow_spd(pipe, &spd)) + if (pipe->nrbufs == pipe->buffers) + return -EAGAIN; + + /* + * Try to keep page boundaries matching to source pagecache ones - + * it probably won't be much help, but... + */ + offset = *ppos & ~PAGE_MASK; + + iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len + offset); + + res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &dummy); + if (res <= 0) return -ENOMEM; - res = -ENOMEM; + nr_pages = res / PAGE_SIZE; + vec = __vec; - if (spd.nr_pages_max > PIPE_DEF_BUFFERS) { - vec = kmalloc(spd.nr_pages_max * sizeof(struct iovec), GFP_KERNEL); - if (!vec) - goto shrink_ret; + if (nr_pages > PIPE_DEF_BUFFERS) { + vec = kmalloc(nr_pages * sizeof(struct kvec), GFP_KERNEL); + if (unlikely(!vec)) { + res = -ENOMEM; + goto out; + } } - offset = *ppos & ~PAGE_MASK; - nr_pages = (len + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; - - for (i = 0; i < nr_pages && i < spd.nr_pages_max && len; i++) { - struct page *page; + pipe->bufs[to.idx].offset = offset; + pipe->bufs[to.idx].len -= offset; - page = alloc_page(GFP_USER); - error = -ENOMEM; - if (!page) - goto err; - - this_len = min_t(size_t, len, PAGE_SIZE - offset); - vec[i].iov_base = (void __user *) page_address(page); + for (i = 0; i < nr_pages; i++) { + size_t this_len = min_t(size_t, len, PAGE_SIZE - offset); + vec[i].iov_base = page_address(pages[i]) + offset; vec[i].iov_len = this_len; - spd.pages[i] = page; - spd.nr_pages++; len -= this_len; offset = 0; } - res = kernel_readv(in, vec, spd.nr_pages, *ppos); - if (res < 0) { - error = res; - goto err; - } - - error = 0; - if (!res) - goto err; - - nr_freed = 0; - for (i = 0; i < spd.nr_pages; i++) { - this_len = min_t(size_t, vec[i].iov_len, res); - spd.partial[i].offset = 0; - spd.partial[i].len = this_len; - if (!this_len) { - __free_page(spd.pages[i]); - spd.pages[i] = NULL; - nr_freed++; - } - res -= this_len; - } - spd.nr_pages -= nr_freed; - - res = splice_to_pipe(pipe, &spd); - if (res > 0) + res = kernel_readv(in, vec, nr_pages, *ppos); + if (res > 0) { + copied = res; *ppos += res; + } -shrink_ret: if (vec != __vec) kfree(vec); - splice_shrink_spd(&spd); +out: + for (i = 0; i < nr_pages; i++) + put_page(pages[i]); + kvfree(pages); + iov_iter_advance(&to, copied); /* truncates and discards */ return res; - -err: - for (i = 0; i < spd.nr_pages; i++) - __free_page(spd.pages[i]); - - res = error; - goto shrink_ret; } -EXPORT_SYMBOL(default_file_splice_read); /* * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' @@ -757,13 +509,12 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des while (pipe->nrbufs) { struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; - const struct pipe_buf_operations *ops = buf->ops; sd->len = buf->len; if (sd->len > sd->total_len) sd->len = sd->total_len; - ret = buf->ops->confirm(pipe, buf); + ret = pipe_buf_confirm(pipe, buf); if (unlikely(ret)) { if (ret == -ENODATA) ret = 0; @@ -783,8 +534,7 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des sd->total_len -= ret; if (!buf->len) { - buf->ops = NULL; - ops->release(pipe, buf); + pipe_buf_release(pipe, buf); pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); pipe->nrbufs--; if (pipe->files) @@ -1003,7 +753,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, if (idx == pipe->buffers - 1) idx = -1; - ret = buf->ops->confirm(pipe, buf); + ret = pipe_buf_confirm(pipe, buf); if (unlikely(ret)) { if (ret == -ENODATA) ret = 0; @@ -1030,11 +780,9 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, while (ret) { struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; if (ret >= buf->len) { - const struct pipe_buf_operations *ops = buf->ops; ret -= buf->len; buf->len = 0; - buf->ops = NULL; - ops->release(pipe, buf); + pipe_buf_release(pipe, buf); pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); pipe->nrbufs--; if (pipe->files) @@ -1273,10 +1021,8 @@ out_release: for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; - if (buf->ops) { - buf->ops->release(pipe, buf); - buf->ops = NULL; - } + if (buf->ops) + pipe_buf_release(pipe, buf); } if (!bytes) @@ -1342,6 +1088,20 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, } EXPORT_SYMBOL(do_splice_direct); +static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) +{ + while (pipe->nrbufs == pipe->buffers) { + if (flags & SPLICE_F_NONBLOCK) + return -EAGAIN; + if (signal_pending(current)) + return -ERESTARTSYS; + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; + } + return 0; +} + static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags); @@ -1424,8 +1184,13 @@ static long do_splice(struct file *in, loff_t __user *off_in, offset = in->f_pos; } - ret = do_splice_to(in, &offset, opipe, len, flags); - + pipe_lock(opipe); + ret = wait_for_space(opipe, flags); + if (!ret) + ret = do_splice_to(in, &offset, opipe, len, flags); + pipe_unlock(opipe); + if (ret > 0) + wakeup_pipe_readers(opipe); if (!off_in) in->f_pos = offset; else if (copy_to_user(off_in, &offset, sizeof(loff_t))) @@ -1437,106 +1202,50 @@ static long do_splice(struct file *in, loff_t __user *off_in, return -EINVAL; } -/* - * Map an iov into an array of pages and offset/length tupples. With the - * partial_page structure, we can map several non-contiguous ranges into - * our ones pages[] map instead of splitting that operation into pieces. - * Could easily be exported as a generic helper for other users, in which - * case one would probably want to add a 'max_nr_pages' parameter as well. - */ -static int get_iovec_page_array(const struct iovec __user *iov, - unsigned int nr_vecs, struct page **pages, - struct partial_page *partial, bool aligned, - unsigned int pipe_buffers) +static int iter_to_pipe(struct iov_iter *from, + struct pipe_inode_info *pipe, + unsigned flags) { - int buffers = 0, error = 0; - - while (nr_vecs) { - unsigned long off, npages; - struct iovec entry; - void __user *base; - size_t len; - int i; - - error = -EFAULT; - if (copy_from_user(&entry, iov, sizeof(entry))) - break; - - base = entry.iov_base; - len = entry.iov_len; - - /* - * Sanity check this iovec. 0 read succeeds. - */ - error = 0; - if (unlikely(!len)) - break; - error = -EFAULT; - if (!access_ok(VERIFY_READ, base, len)) - break; - - /* - * Get this base offset and number of pages, then map - * in the user pages. - */ - off = (unsigned long) base & ~PAGE_MASK; - - /* - * If asked for alignment, the offset must be zero and the - * length a multiple of the PAGE_SIZE. - */ - error = -EINVAL; - if (aligned && (off || len & ~PAGE_MASK)) - break; - - npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (npages > pipe_buffers - buffers) - npages = pipe_buffers - buffers; - - error = get_user_pages_fast((unsigned long)base, npages, - 0, &pages[buffers]); - - if (unlikely(error <= 0)) + struct pipe_buffer buf = { + .ops = &user_page_pipe_buf_ops, + .flags = flags + }; + size_t total = 0; + int ret = 0; + bool failed = false; + + while (iov_iter_count(from) && !failed) { + struct page *pages[16]; + ssize_t copied; + size_t start; + int n; + + copied = iov_iter_get_pages(from, pages, ~0UL, 16, &start); + if (copied <= 0) { + ret = copied; break; - - /* - * Fill this contiguous range into the partial page map. - */ - for (i = 0; i < error; i++) { - const int plen = min_t(size_t, len, PAGE_SIZE - off); - - partial[buffers].offset = off; - partial[buffers].len = plen; - - off = 0; - len -= plen; - buffers++; } - /* - * We didn't complete this iov, stop here since it probably - * means we have to move some of this into a pipe to - * be able to continue. - */ - if (len) - break; - - /* - * Don't continue if we mapped fewer pages than we asked for, - * or if we mapped the max number of pages that we have - * room for. - */ - if (error < npages || buffers == pipe_buffers) - break; - - nr_vecs--; - iov++; + for (n = 0; copied; n++, start = 0) { + int size = min_t(int, copied, PAGE_SIZE - start); + if (!failed) { + buf.page = pages[n]; + buf.offset = start; + buf.len = size; + ret = add_to_pipe(pipe, &buf); + if (unlikely(ret < 0)) { + failed = true; + } else { + iov_iter_advance(from, ret); + total += ret; + } + } else { + put_page(pages[n]); + } + copied -= size; + } } - - if (buffers) - return buffers; - - return error; + return total ? total : ret; } static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, @@ -1590,38 +1299,36 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov, * as splice-from-memory, where the regular splice is splice-from-file (or * to file). In both cases the output is a pipe, naturally. */ -static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, +static long vmsplice_to_pipe(struct file *file, const struct iovec __user *uiov, unsigned long nr_segs, unsigned int flags) { struct pipe_inode_info *pipe; - struct page *pages[PIPE_DEF_BUFFERS]; - struct partial_page partial[PIPE_DEF_BUFFERS]; - struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, - .nr_pages_max = PIPE_DEF_BUFFERS, - .flags = flags, - .ops = &user_page_pipe_buf_ops, - .spd_release = spd_release_page, - }; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter from; long ret; + unsigned buf_flag = 0; + + if (flags & SPLICE_F_GIFT) + buf_flag = PIPE_BUF_FLAG_GIFT; pipe = get_pipe_info(file); if (!pipe) return -EBADF; - if (splice_grow_spd(pipe, &spd)) - return -ENOMEM; - - spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages, - spd.partial, false, - spd.nr_pages_max); - if (spd.nr_pages <= 0) - ret = spd.nr_pages; - else - ret = splice_to_pipe(pipe, &spd); + ret = import_iovec(WRITE, uiov, nr_segs, + ARRAY_SIZE(iovstack), &iov, &from); + if (ret < 0) + return ret; - splice_shrink_spd(&spd); + pipe_lock(pipe); + ret = wait_for_space(pipe, flags); + if (!ret) + ret = iter_to_pipe(&from, pipe, buf_flag); + pipe_unlock(pipe); + if (ret > 0) + wakeup_pipe_readers(pipe); + kfree(iov); return ret; } @@ -1876,7 +1583,7 @@ retry: * Get a reference to this pipe buffer, * so we can copy the contents over. */ - ibuf->ops->get(ipipe, ibuf); + pipe_buf_get(ipipe, ibuf); *obuf = *ibuf; /* @@ -1948,7 +1655,7 @@ static int link_pipe(struct pipe_inode_info *ipipe, * Get a reference to this pipe buffer, * so we can copy the contents over. */ - ibuf->ops->get(ipipe, ibuf); + pipe_buf_get(ipipe, ibuf); obuf = opipe->bufs + nbuf; *obuf = *ibuf; diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index 0927b1e..e9793b1 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -425,7 +425,6 @@ failed_read: const struct inode_operations squashfs_inode_ops = { - .getxattr = generic_getxattr, .listxattr = squashfs_listxattr }; diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c index 67cad77..40c10d9 100644 --- a/fs/squashfs/namei.c +++ b/fs/squashfs/namei.c @@ -247,6 +247,5 @@ failed: const struct inode_operations squashfs_dir_inode_ops = { .lookup = squashfs_lookup, - .getxattr = generic_getxattr, .listxattr = squashfs_listxattr }; diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c index d688ef4..79b9c31 100644 --- a/fs/squashfs/symlink.c +++ b/fs/squashfs/symlink.c @@ -120,7 +120,6 @@ const struct address_space_operations squashfs_symlink_aops = { const struct inode_operations squashfs_symlink_inode_ops = { .readlink = generic_readlink, .get_link = page_get_link, - .getxattr = generic_getxattr, .listxattr = squashfs_listxattr }; diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h index c83f5d9..afe70f8 100644 --- a/fs/squashfs/xattr.h +++ b/fs/squashfs/xattr.h @@ -42,6 +42,5 @@ static inline int squashfs_xattr_lookup(struct super_block *sb, return 0; } #define squashfs_listxattr NULL -#define generic_getxattr NULL #define squashfs_xattr_handlers NULL #endif @@ -1269,25 +1269,34 @@ EXPORT_SYMBOL(__sb_start_write); static void sb_wait_write(struct super_block *sb, int level) { percpu_down_write(sb->s_writers.rw_sem + level-1); - /* - * We are going to return to userspace and forget about this lock, the - * ownership goes to the caller of thaw_super() which does unlock. - * - * FIXME: we should do this before return from freeze_super() after we - * called sync_filesystem(sb) and s_op->freeze_fs(sb), and thaw_super() - * should re-acquire these locks before s_op->unfreeze_fs(sb). However - * this leads to lockdep false-positives, so currently we do the early - * release right after acquire. - */ - percpu_rwsem_release(sb->s_writers.rw_sem + level-1, 0, _THIS_IP_); } -static void sb_freeze_unlock(struct super_block *sb) +/* + * We are going to return to userspace and forget about these locks, the + * ownership goes to the caller of thaw_super() which does unlock(). + */ +static void lockdep_sb_freeze_release(struct super_block *sb) +{ + int level; + + for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) + percpu_rwsem_release(sb->s_writers.rw_sem + level, 0, _THIS_IP_); +} + +/* + * Tell lockdep we are holding these locks before we call ->unfreeze_fs(sb). + */ +static void lockdep_sb_freeze_acquire(struct super_block *sb) { int level; for (level = 0; level < SB_FREEZE_LEVELS; ++level) percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); +} + +static void sb_freeze_unlock(struct super_block *sb) +{ + int level; for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) percpu_up_write(sb->s_writers.rw_sem + level); @@ -1379,10 +1388,11 @@ int freeze_super(struct super_block *sb) } } /* - * This is just for debugging purposes so that fs can warn if it - * sees write activity when frozen is set to SB_FREEZE_COMPLETE. + * For debugging purposes so that fs can warn if it sees write activity + * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super(). */ sb->s_writers.frozen = SB_FREEZE_COMPLETE; + lockdep_sb_freeze_release(sb); up_write(&sb->s_umount); return 0; } @@ -1399,7 +1409,7 @@ int thaw_super(struct super_block *sb) int error; down_write(&sb->s_umount); - if (sb->s_writers.frozen == SB_UNFROZEN) { + if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) { up_write(&sb->s_umount); return -EINVAL; } @@ -1409,11 +1419,14 @@ int thaw_super(struct super_block *sb) goto out; } + lockdep_sb_freeze_acquire(sb); + if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); if (error) { printk(KERN_ERR "VFS:Filesystem thaw failed\n"); + lockdep_sb_freeze_release(sb); up_write(&sb->s_umount); return error; } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 94374e4..2b67bda 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -21,14 +21,14 @@ DEFINE_SPINLOCK(sysfs_symlink_target_lock); void sysfs_warn_dup(struct kernfs_node *parent, const char *name) { - char *buf, *path = NULL; + char *buf; buf = kzalloc(PATH_MAX, GFP_KERNEL); if (buf) - path = kernfs_path(parent, buf, PATH_MAX); + kernfs_path(parent, buf, PATH_MAX); WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s/%s'\n", - path, name); + buf, name); kfree(buf); } diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index dc1358b..ac2de0e 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -233,8 +233,8 @@ void sysfs_remove_group(struct kobject *kobj, kn = kernfs_find_and_get(parent, grp->name); if (!kn) { WARN(!kn, KERN_WARNING - "sysfs group %p not found for kobject '%s'\n", - grp, kobject_name(kobj)); + "sysfs group '%s' not found for kobject '%s'\n", + grp->name, kobject_name(kobj)); return; } } else { diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 2661b77..5bdae85 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -215,7 +215,7 @@ got_it: memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2); de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); err = dir_commit_chunk(page, pos, SYSV_DIRSIZE); - dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + dir->i_mtime = dir->i_ctime = current_time(dir); mark_inode_dirty(dir); out_page: dir_put_page(page); @@ -239,7 +239,7 @@ int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page) de->inode = 0; err = dir_commit_chunk(page, pos, SYSV_DIRSIZE); dir_put_page(page); - inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + inode->i_ctime = inode->i_mtime = current_time(inode); mark_inode_dirty(inode); return err; } @@ -337,7 +337,7 @@ void sysv_set_link(struct sysv_dir_entry *de, struct page *page, de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); err = dir_commit_chunk(page, pos, SYSV_DIRSIZE); dir_put_page(page); - dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + dir->i_mtime = dir->i_ctime = current_time(dir); mark_inode_dirty(dir); } diff --git a/fs/sysv/file.c b/fs/sysv/file.c index 82ddc09..7ba997e 100644 --- a/fs/sysv/file.c +++ b/fs/sysv/file.c @@ -33,7 +33,7 @@ static int sysv_setattr(struct dentry *dentry, struct iattr *attr) struct inode *inode = d_inode(dentry); int error; - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); if (error) return error; diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c index f9db4eb..53f1b789 100644 --- a/fs/sysv/ialloc.c +++ b/fs/sysv/ialloc.c @@ -164,7 +164,7 @@ struct inode * sysv_new_inode(const struct inode * dir, umode_t mode) dirty_sb(sb); inode_init_owner(inode, dir, mode); inode->i_ino = fs16_to_cpu(sbi, ino); - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_blocks = 0; memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data)); SYSV_I(inode)->i_dir_start_lookup = 0; diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 2fde40a..08d3e63 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -178,7 +178,7 @@ static inline int splice_branch(struct inode *inode, *where->p = where->key; write_unlock(&pointers_lock); - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); /* had we spliced it onto indirect block? */ if (where->bh) @@ -418,7 +418,7 @@ do_indirects: } n++; } - inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_mtime = inode->i_ctime = current_time(inode); if (IS_SYNC(inode)) sysv_sync_inode (inode); else diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index a42de45c..d8817f1 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -120,7 +120,7 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir, { struct inode *inode = d_inode(old_dentry); - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); inode_inc_link_count(inode); ihold(inode); @@ -206,7 +206,8 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry) * higher-level routines. */ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry, - struct inode * new_dir, struct dentry * new_dentry) + struct inode * new_dir, struct dentry * new_dentry, + unsigned int flags) { struct inode * old_inode = d_inode(old_dentry); struct inode * new_inode = d_inode(new_dentry); @@ -216,6 +217,9 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry, struct sysv_dir_entry * old_de; int err = -ENOENT; + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + old_de = sysv_find_entry(old_dentry, &old_page); if (!old_de) goto out; @@ -240,7 +244,7 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry, if (!new_de) goto out_dir; sysv_set_link(new_de, new_page, old_inode); - new_inode->i_ctime = CURRENT_TIME_SEC; + new_inode->i_ctime = current_time(new_inode); if (dir_de) drop_nlink(new_inode); inode_dec_link_count(new_inode); diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index ad40b64..21d36d2 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -133,7 +133,7 @@ static struct inode *tracefs_get_inode(struct super_block *sb) struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); } return inode; } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 4b86d3a..bd4a5e8 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -301,6 +301,95 @@ out_budg: return err; } +static int do_tmpfile(struct inode *dir, struct dentry *dentry, + umode_t mode, struct inode **whiteout) +{ + struct inode *inode; + struct ubifs_info *c = dir->i_sb->s_fs_info; + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1}; + struct ubifs_budget_req ino_req = { .dirtied_ino = 1 }; + struct ubifs_inode *ui, *dir_ui = ubifs_inode(dir); + int err, instantiated = 0; + + /* + * Budget request settings: new dirty inode, new direntry, + * budget for dirtied inode will be released via writeback. + */ + + dbg_gen("dent '%pd', mode %#hx in dir ino %lu", + dentry, mode, dir->i_ino); + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + err = ubifs_budget_space(c, &ino_req); + if (err) { + ubifs_release_budget(c, &req); + return err; + } + + inode = ubifs_new_inode(c, dir, mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_budg; + } + ui = ubifs_inode(inode); + + if (whiteout) { + init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); + ubifs_assert(inode->i_op == &ubifs_file_inode_operations); + } + + err = ubifs_init_security(dir, inode, &dentry->d_name); + if (err) + goto out_inode; + + mutex_lock(&ui->ui_mutex); + insert_inode_hash(inode); + + if (whiteout) { + mark_inode_dirty(inode); + drop_nlink(inode); + *whiteout = inode; + } else { + d_tmpfile(dentry, inode); + } + ubifs_assert(ui->dirty); + + instantiated = 1; + mutex_unlock(&ui->ui_mutex); + + mutex_lock(&dir_ui->ui_mutex); + err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0); + if (err) + goto out_cancel; + mutex_unlock(&dir_ui->ui_mutex); + + ubifs_release_budget(c, &req); + + return 0; + +out_cancel: + mutex_unlock(&dir_ui->ui_mutex); +out_inode: + make_bad_inode(inode); + if (!instantiated) + iput(inode); +out_budg: + ubifs_release_budget(c, &req); + if (!instantiated) + ubifs_release_budget(c, &ino_req); + ubifs_err(c, "cannot create temporary file, error %d", err); + return err; +} + +static int ubifs_tmpfile(struct inode *dir, struct dentry *dentry, + umode_t mode) +{ + return do_tmpfile(dir, dentry, mode, NULL); +} + /** * vfs_dent_type - get VFS directory entry type. * @type: UBIFS directory entry type @@ -350,7 +439,7 @@ static unsigned int vfs_dent_type(uint8_t type) */ static int ubifs_readdir(struct file *file, struct dir_context *ctx) { - int err; + int err = 0; struct qstr nm; union ubifs_key key; struct ubifs_dent_node *dent; @@ -452,14 +541,12 @@ out: kfree(file->private_data); file->private_data = NULL; - if (err != -ENOENT) { + if (err != -ENOENT) ubifs_err(c, "cannot find next direntry, error %d", err); - return err; - } /* 2 is a special value indicating that there are no more direntries */ ctx->pos = 2; - return 0; + return err; } /* Free saved readdir() state when the directory is closed */ @@ -927,37 +1014,43 @@ out_budg: } /** - * lock_3_inodes - a wrapper for locking three UBIFS inodes. + * lock_4_inodes - a wrapper for locking three UBIFS inodes. * @inode1: first inode * @inode2: second inode * @inode3: third inode + * @inode4: fouth inode * * This function is used for 'ubifs_rename()' and @inode1 may be the same as - * @inode2 whereas @inode3 may be %NULL. + * @inode2 whereas @inode3 and @inode4 may be %NULL. * * We do not implement any tricks to guarantee strict lock ordering, because * VFS has already done it for us on the @i_mutex. So this is just a simple * wrapper function. */ -static void lock_3_inodes(struct inode *inode1, struct inode *inode2, - struct inode *inode3) +static void lock_4_inodes(struct inode *inode1, struct inode *inode2, + struct inode *inode3, struct inode *inode4) { mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); if (inode2 != inode1) mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); if (inode3) mutex_lock_nested(&ubifs_inode(inode3)->ui_mutex, WB_MUTEX_3); + if (inode4) + mutex_lock_nested(&ubifs_inode(inode4)->ui_mutex, WB_MUTEX_4); } /** - * unlock_3_inodes - a wrapper for unlocking three UBIFS inodes for rename. + * unlock_4_inodes - a wrapper for unlocking three UBIFS inodes for rename. * @inode1: first inode * @inode2: second inode * @inode3: third inode + * @inode4: fouth inode */ -static void unlock_3_inodes(struct inode *inode1, struct inode *inode2, - struct inode *inode3) +static void unlock_4_inodes(struct inode *inode1, struct inode *inode2, + struct inode *inode3, struct inode *inode4) { + if (inode4) + mutex_unlock(&ubifs_inode(inode4)->ui_mutex); if (inode3) mutex_unlock(&ubifs_inode(inode3)->ui_mutex); if (inode1 != inode2) @@ -965,13 +1058,16 @@ static void unlock_3_inodes(struct inode *inode1, struct inode *inode2, mutex_unlock(&ubifs_inode(inode1)->ui_mutex); } -static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) +static int do_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct ubifs_info *c = old_dir->i_sb->s_fs_info; struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); + struct inode *whiteout = NULL; struct ubifs_inode *old_inode_ui = ubifs_inode(old_inode); + struct ubifs_inode *whiteout_ui = NULL; int err, release, sync = 0, move = (new_dir != old_dir); int is_dir = S_ISDIR(old_inode->i_mode); int unlink = !!new_inode; @@ -984,6 +1080,9 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, struct timespec time; unsigned int uninitialized_var(saved_nlink); + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + /* * Budget request settings: deletion direntry, new direntry, removing * the old inode, and changing old and new parent directory inodes. @@ -993,15 +1092,13 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, * separately. */ - dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu", + dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu flags 0x%x", old_dentry, old_inode->i_ino, old_dir->i_ino, - new_dentry, new_dir->i_ino); - ubifs_assert(inode_is_locked(old_dir)); - ubifs_assert(inode_is_locked(new_dir)); + new_dentry, new_dir->i_ino, flags); + if (unlink) ubifs_assert(inode_is_locked(new_inode)); - if (unlink && is_dir) { err = check_dir_empty(c, new_inode); if (err) @@ -1017,7 +1114,32 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, return err; } - lock_3_inodes(old_dir, new_dir, new_inode); + if (flags & RENAME_WHITEOUT) { + union ubifs_dev_desc *dev = NULL; + + dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); + if (!dev) { + ubifs_release_budget(c, &req); + ubifs_release_budget(c, &ino_req); + return -ENOMEM; + } + + err = do_tmpfile(old_dir, old_dentry, S_IFCHR | WHITEOUT_MODE, &whiteout); + if (err) { + ubifs_release_budget(c, &req); + ubifs_release_budget(c, &ino_req); + kfree(dev); + return err; + } + + whiteout->i_state |= I_LINKABLE; + whiteout_ui = ubifs_inode(whiteout); + whiteout_ui->data = dev; + whiteout_ui->data_len = ubifs_encode_dev(dev, MKDEV(0, 0)); + ubifs_assert(!whiteout_ui->dirty); + } + + lock_4_inodes(old_dir, new_dir, new_inode, whiteout); /* * Like most other Unix systems, set the @i_ctime for inodes on a @@ -1087,12 +1209,34 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlink && IS_SYNC(new_inode)) sync = 1; } - err = ubifs_jnl_rename(c, old_dir, old_dentry, new_dir, new_dentry, + + if (whiteout) { + struct ubifs_budget_req wht_req = { .dirtied_ino = 1, + .dirtied_ino_d = \ + ALIGN(ubifs_inode(whiteout)->data_len, 8) }; + + err = ubifs_budget_space(c, &wht_req); + if (err) { + ubifs_release_budget(c, &req); + ubifs_release_budget(c, &ino_req); + kfree(whiteout_ui->data); + whiteout_ui->data_len = 0; + iput(whiteout); + return err; + } + + inc_nlink(whiteout); + mark_inode_dirty(whiteout); + whiteout->i_state &= ~I_LINKABLE; + iput(whiteout); + } + + err = ubifs_jnl_rename(c, old_dir, old_dentry, new_dir, new_dentry, whiteout, sync); if (err) goto out_cancel; - unlock_3_inodes(old_dir, new_dir, new_inode); + unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); ubifs_release_budget(c, &req); mutex_lock(&old_inode_ui->ui_mutex); @@ -1125,12 +1269,74 @@ out_cancel: inc_nlink(old_dir); } } - unlock_3_inodes(old_dir, new_dir, new_inode); + if (whiteout) { + drop_nlink(whiteout); + iput(whiteout); + } + unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); ubifs_release_budget(c, &ino_req); ubifs_release_budget(c, &req); return err; } +static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct ubifs_info *c = old_dir->i_sb->s_fs_info; + struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1, + .dirtied_ino = 2 }; + int sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir); + struct inode *fst_inode = d_inode(old_dentry); + struct inode *snd_inode = d_inode(new_dentry); + struct timespec time; + int err; + + ubifs_assert(fst_inode && snd_inode); + + lock_4_inodes(old_dir, new_dir, NULL, NULL); + + time = ubifs_current_time(old_dir); + fst_inode->i_ctime = time; + snd_inode->i_ctime = time; + old_dir->i_mtime = old_dir->i_ctime = time; + new_dir->i_mtime = new_dir->i_ctime = time; + + if (old_dir != new_dir) { + if (S_ISDIR(fst_inode->i_mode) && !S_ISDIR(snd_inode->i_mode)) { + inc_nlink(new_dir); + drop_nlink(old_dir); + } + else if (!S_ISDIR(fst_inode->i_mode) && S_ISDIR(snd_inode->i_mode)) { + drop_nlink(new_dir); + inc_nlink(old_dir); + } + } + + err = ubifs_jnl_xrename(c, old_dir, old_dentry, new_dir, new_dentry, + sync); + + unlock_4_inodes(old_dir, new_dir, NULL, NULL); + ubifs_release_budget(c, &req); + + return err; +} + +static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + if (flags & ~(RENAME_NOREPLACE | RENAME_WHITEOUT | RENAME_EXCHANGE)) + return -EINVAL; + + ubifs_assert(inode_is_locked(old_dir)); + ubifs_assert(inode_is_locked(new_dir)); + + if (flags & RENAME_EXCHANGE) + return ubifs_xrename(old_dir, old_dentry, new_dir, new_dentry); + + return do_rename(old_dir, old_dentry, new_dir, new_dentry, flags); +} + int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { @@ -1182,13 +1388,11 @@ const struct inode_operations ubifs_dir_inode_operations = { .rename = ubifs_rename, .setattr = ubifs_setattr, .getattr = ubifs_getattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = ubifs_listxattr, - .removexattr = generic_removexattr, #ifdef CONFIG_UBIFS_ATIME_SUPPORT .update_time = ubifs_update_time, #endif + .tmpfile = ubifs_tmpfile, }; const struct file_operations ubifs_dir_operations = { diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 7bbf420..b4fbeef 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1262,7 +1262,7 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr) dbg_gen("ino %lu, mode %#x, ia_valid %#x", inode->i_ino, inode->i_mode, attr->ia_valid); - err = inode_change_ok(inode, attr); + err = setattr_prepare(dentry, attr); if (err) return err; @@ -1397,7 +1397,7 @@ int ubifs_update_time(struct inode *inode, struct timespec *time, #endif /** - * update_ctime - update mtime and ctime of an inode. + * update_mctime - update mtime and ctime of an inode. * @inode: inode to update * * This function updates mtime and ctime of the inode if it is not equivalent to @@ -1621,10 +1621,7 @@ const struct address_space_operations ubifs_file_address_operations = { const struct inode_operations ubifs_file_inode_operations = { .setattr = ubifs_setattr, .getattr = ubifs_getattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = ubifs_listxattr, - .removexattr = generic_removexattr, #ifdef CONFIG_UBIFS_ATIME_SUPPORT .update_time = ubifs_update_time, #endif @@ -1635,10 +1632,7 @@ const struct inode_operations ubifs_symlink_inode_operations = { .get_link = simple_get_link, .setattr = ubifs_setattr, .getattr = ubifs_getattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, .listxattr = ubifs_listxattr, - .removexattr = generic_removexattr, #ifdef CONFIG_UBIFS_ATIME_SUPPORT .update_time = ubifs_update_time, #endif diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 821b348..e845c64 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -113,7 +113,7 @@ static int switch_gc_head(struct ubifs_info *c) * data_nodes_cmp - compare 2 data nodes. * @priv: UBIFS file-system description object * @a: first data node - * @a: second data node + * @b: second data node * * This function compares data nodes @a and @b. Returns %1 if @a has greater * inode or block number, and %-1 otherwise. diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 0b9da5b..91bc76dc 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -908,6 +908,147 @@ int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode) } /** + * ubifs_jnl_xrename - cross rename two directory entries. + * @c: UBIFS file-system description object + * @fst_dir: parent inode of 1st directory entry to exchange + * @fst_dentry: 1st directory entry to exchange + * @snd_dir: parent inode of 2nd directory entry to exchange + * @snd_dentry: 2nd directory entry to exchange + * @sync: non-zero if the write-buffer has to be synchronized + * + * This function implements the cross rename operation which may involve + * writing 2 inodes and 2 directory entries. It marks the written inodes as clean + * and returns zero on success. In case of failure, a negative error code is + * returned. + */ +int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir, + const struct dentry *fst_dentry, + const struct inode *snd_dir, + const struct dentry *snd_dentry, int sync) +{ + union ubifs_key key; + struct ubifs_dent_node *dent1, *dent2; + int err, dlen1, dlen2, lnum, offs, len, plen = UBIFS_INO_NODE_SZ; + int aligned_dlen1, aligned_dlen2; + int twoparents = (fst_dir != snd_dir); + const struct inode *fst_inode = d_inode(fst_dentry); + const struct inode *snd_inode = d_inode(snd_dentry); + void *p; + + dbg_jnl("dent '%pd' in dir ino %lu between dent '%pd' in dir ino %lu", + fst_dentry, fst_dir->i_ino, snd_dentry, snd_dir->i_ino); + + ubifs_assert(ubifs_inode(fst_dir)->data_len == 0); + ubifs_assert(ubifs_inode(snd_dir)->data_len == 0); + ubifs_assert(mutex_is_locked(&ubifs_inode(fst_dir)->ui_mutex)); + ubifs_assert(mutex_is_locked(&ubifs_inode(snd_dir)->ui_mutex)); + + dlen1 = UBIFS_DENT_NODE_SZ + snd_dentry->d_name.len + 1; + dlen2 = UBIFS_DENT_NODE_SZ + fst_dentry->d_name.len + 1; + aligned_dlen1 = ALIGN(dlen1, 8); + aligned_dlen2 = ALIGN(dlen2, 8); + + len = aligned_dlen1 + aligned_dlen2 + ALIGN(plen, 8); + if (twoparents) + len += plen; + + dent1 = kmalloc(len, GFP_NOFS); + if (!dent1) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, len); + if (err) + goto out_free; + + /* Make new dent for 1st entry */ + dent1->ch.node_type = UBIFS_DENT_NODE; + dent_key_init_flash(c, &dent1->key, snd_dir->i_ino, &snd_dentry->d_name); + dent1->inum = cpu_to_le64(fst_inode->i_ino); + dent1->type = get_dent_type(fst_inode->i_mode); + dent1->nlen = cpu_to_le16(snd_dentry->d_name.len); + memcpy(dent1->name, snd_dentry->d_name.name, snd_dentry->d_name.len); + dent1->name[snd_dentry->d_name.len] = '\0'; + zero_dent_node_unused(dent1); + ubifs_prep_grp_node(c, dent1, dlen1, 0); + + /* Make new dent for 2nd entry */ + dent2 = (void *)dent1 + aligned_dlen1; + dent2->ch.node_type = UBIFS_DENT_NODE; + dent_key_init_flash(c, &dent2->key, fst_dir->i_ino, &fst_dentry->d_name); + dent2->inum = cpu_to_le64(snd_inode->i_ino); + dent2->type = get_dent_type(snd_inode->i_mode); + dent2->nlen = cpu_to_le16(fst_dentry->d_name.len); + memcpy(dent2->name, fst_dentry->d_name.name, fst_dentry->d_name.len); + dent2->name[fst_dentry->d_name.len] = '\0'; + zero_dent_node_unused(dent2); + ubifs_prep_grp_node(c, dent2, dlen2, 0); + + p = (void *)dent2 + aligned_dlen2; + if (!twoparents) + pack_inode(c, p, fst_dir, 1); + else { + pack_inode(c, p, fst_dir, 0); + p += ALIGN(plen, 8); + pack_inode(c, p, snd_dir, 1); + } + + err = write_head(c, BASEHD, dent1, len, &lnum, &offs, sync); + if (err) + goto out_release; + if (!sync) { + struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf; + + ubifs_wbuf_add_ino_nolock(wbuf, fst_dir->i_ino); + ubifs_wbuf_add_ino_nolock(wbuf, snd_dir->i_ino); + } + release_head(c, BASEHD); + + dent_key_init(c, &key, snd_dir->i_ino, &snd_dentry->d_name); + err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, &snd_dentry->d_name); + if (err) + goto out_ro; + + offs += aligned_dlen1; + dent_key_init(c, &key, fst_dir->i_ino, &fst_dentry->d_name); + err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, &fst_dentry->d_name); + if (err) + goto out_ro; + + offs += aligned_dlen2; + + ino_key_init(c, &key, fst_dir->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, plen); + if (err) + goto out_ro; + + if (twoparents) { + offs += ALIGN(plen, 8); + ino_key_init(c, &key, snd_dir->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, plen); + if (err) + goto out_ro; + } + + finish_reservation(c); + + mark_inode_clean(c, ubifs_inode(fst_dir)); + if (twoparents) + mark_inode_clean(c, ubifs_inode(snd_dir)); + kfree(dent1); + return 0; + +out_release: + release_head(c, BASEHD); +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); +out_free: + kfree(dent1); + return err; +} + +/** * ubifs_jnl_rename - rename a directory entry. * @c: UBIFS file-system description object * @old_dir: parent inode of directory entry to rename @@ -917,14 +1058,15 @@ int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode) * @sync: non-zero if the write-buffer has to be synchronized * * This function implements the re-name operation which may involve writing up - * to 3 inodes and 2 directory entries. It marks the written inodes as clean + * to 4 inodes and 2 directory entries. It marks the written inodes as clean * and returns zero on success. In case of failure, a negative error code is * returned. */ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, const struct dentry *old_dentry, const struct inode *new_dir, - const struct dentry *new_dentry, int sync) + const struct dentry *new_dentry, + const struct inode *whiteout, int sync) { void *p; union ubifs_key key; @@ -958,7 +1100,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, aligned_dlen1 = ALIGN(dlen1, 8); aligned_dlen2 = ALIGN(dlen2, 8); len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8); - if (old_dir != new_dir) + if (move) len += plen; dent = kmalloc(len, GFP_NOFS); if (!dent) @@ -980,13 +1122,19 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, zero_dent_node_unused(dent); ubifs_prep_grp_node(c, dent, dlen1, 0); - /* Make deletion dent */ dent2 = (void *)dent + aligned_dlen1; dent2->ch.node_type = UBIFS_DENT_NODE; dent_key_init_flash(c, &dent2->key, old_dir->i_ino, &old_dentry->d_name); - dent2->inum = 0; - dent2->type = DT_UNKNOWN; + + if (whiteout) { + dent2->inum = cpu_to_le64(whiteout->i_ino); + dent2->type = get_dent_type(whiteout->i_mode); + } else { + /* Make deletion dent */ + dent2->inum = 0; + dent2->type = DT_UNKNOWN; + } dent2->nlen = cpu_to_le16(old_dentry->d_name.len); memcpy(dent2->name, old_dentry->d_name.name, old_dentry->d_name.len); dent2->name[old_dentry->d_name.len] = '\0'; @@ -1035,16 +1183,26 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, if (err) goto out_ro; - err = ubifs_add_dirt(c, lnum, dlen2); - if (err) - goto out_ro; + offs += aligned_dlen1; + if (whiteout) { + dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name); + err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, &old_dentry->d_name); + if (err) + goto out_ro; - dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name); - err = ubifs_tnc_remove_nm(c, &key, &old_dentry->d_name); - if (err) - goto out_ro; + ubifs_delete_orphan(c, whiteout->i_ino); + } else { + err = ubifs_add_dirt(c, lnum, dlen2); + if (err) + goto out_ro; + + dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name); + err = ubifs_tnc_remove_nm(c, &key, &old_dentry->d_name); + if (err) + goto out_ro; + } - offs += aligned_dlen1 + aligned_dlen2; + offs += aligned_dlen2; if (new_inode) { ino_key_init(c, &key, new_inode->i_ino); err = ubifs_tnc_add(c, &key, lnum, offs, ilen); @@ -1058,7 +1216,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, if (err) goto out_ro; - if (old_dir != new_dir) { + if (move) { offs += ALIGN(plen, 8); ino_key_init(c, &key, new_dir->i_ino); err = ubifs_tnc_add(c, &key, lnum, offs, plen); diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index a0011aa..6c3a1ab 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -636,7 +636,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, /** * ubifs_get_lp_stats - get lprops statistics. * @c: UBIFS file-system description object - * @st: return statistics + * @lst: return statistics */ void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst) { diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index ce89bdc..235654c 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -34,7 +34,6 @@ static int dbg_populate_lsave(struct ubifs_info *c); /** * first_dirty_cnode - find first dirty cnode. - * @c: UBIFS file-system description object * @nnode: nnode at which to start * * This function returns the first dirty cnode or %NULL if there is not one. @@ -1623,7 +1622,6 @@ static int dbg_is_node_dirty(struct ubifs_info *c, int node_type, int lnum, * dbg_check_ltab_lnum - check the ltab for a LPT LEB number. * @c: the UBIFS file-system description object * @lnum: LEB number where node was written - * @offs: offset where node was written * * This function returns %0 on success and a negative error code on failure. */ @@ -1870,7 +1868,7 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) } /** - * ubifs_dump_lpt_leb - dump an LPT LEB. + * dump_lpt_leb - dump an LPT LEB. * @c: UBIFS file-system description object * @lnum: LEB number to dump * diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 3ca4540..fb0f44c 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -267,7 +267,7 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) * replay_entries_cmp - compare 2 replay entries. * @priv: UBIFS file-system description object * @a: first replay entry - * @a: second replay entry + * @b: second replay entry * * This is a comparios function for 'list_sort()' which compares 2 replay * entries @a and @b by comparing their sequence numer. Returns %1 if @a has diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 4617d45..096035e 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -157,6 +157,7 @@ enum { WB_MUTEX_1 = 0, WB_MUTEX_2 = 1, WB_MUTEX_3 = 2, + WB_MUTEX_4 = 3, }; /* @@ -1520,10 +1521,15 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, const union ubifs_key *key, const void *buf, int len); int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode); int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode); +int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir, + const struct dentry *fst_dentry, + const struct inode *snd_dir, + const struct dentry *snd_dentry, int sync); int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, const struct dentry *old_dentry, const struct inode *new_dir, - const struct dentry *new_dentry, int sync); + const struct dentry *new_dentry, + const struct inode *whiteout, int sync); int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, loff_t old_size, loff_t new_size); int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 11a0041..d9f9615 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -172,6 +172,7 @@ out_cancel: host_ui->xattr_cnt -= 1; host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); host_ui->xattr_size -= CALC_XATTR_BYTES(size); + host_ui->xattr_names -= nm->len; mutex_unlock(&host_ui->ui_mutex); out_free: make_bad_inode(inode); @@ -200,6 +201,7 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, struct ubifs_inode *host_ui = ubifs_inode(host); struct ubifs_inode *ui = ubifs_inode(inode); void *buf = NULL; + int old_size; struct ubifs_budget_req req = { .dirtied_ino = 2, .dirtied_ino_d = ALIGN(size, 8) + ALIGN(host_ui->data_len, 8) }; @@ -217,12 +219,13 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, kfree(ui->data); ui->data = buf; inode->i_size = ui->ui_size = size; + old_size = ui->data_len; ui->data_len = size; mutex_unlock(&ui->ui_mutex); mutex_lock(&host_ui->ui_mutex); host->i_ctime = ubifs_current_time(host); - host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); + host_ui->xattr_size -= CALC_XATTR_BYTES(old_size); host_ui->xattr_size += CALC_XATTR_BYTES(size); /* @@ -241,7 +244,7 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, out_cancel: host_ui->xattr_size -= CALC_XATTR_BYTES(size); - host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); + host_ui->xattr_size += CALC_XATTR_BYTES(old_size); mutex_unlock(&host_ui->ui_mutex); make_bad_inode(inode); out_free: @@ -476,6 +479,7 @@ out_cancel: host_ui->xattr_cnt += 1; host_ui->xattr_size += CALC_DENT_SIZE(nm->len); host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); + host_ui->xattr_names += nm->len; mutex_unlock(&host_ui->ui_mutex); ubifs_release_budget(c, &req); make_bad_inode(inode); diff --git a/fs/udf/file.c b/fs/udf/file.c index 6325706..dbcb3a4a 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -94,7 +94,7 @@ static int udf_adinicb_write_begin(struct file *file, return -ENOMEM; *pagep = page; - if (!PageUptodate(page) && len != PAGE_SIZE) + if (!PageUptodate(page)) __udf_adinicb_readpage(page); return 0; } @@ -105,11 +105,25 @@ static ssize_t udf_adinicb_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return 0; } +static int udf_adinicb_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + loff_t last_pos = pos + copied; + if (last_pos > inode->i_size) + i_size_write(inode, last_pos); + set_page_dirty(page); + unlock_page(page); + put_page(page); + return copied; +} + const struct address_space_operations udf_adinicb_aops = { .readpage = udf_adinicb_readpage, .writepage = udf_adinicb_writepage, .write_begin = udf_adinicb_write_begin, - .write_end = simple_write_end, + .write_end = udf_adinicb_write_end, .direct_IO = udf_adinicb_direct_IO, }; @@ -247,7 +261,7 @@ static int udf_setattr(struct dentry *dentry, struct iattr *attr) struct inode *inode = d_inode(dentry); int error; - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); if (error) return error; diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index e77db62..c1ed18a 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -121,7 +121,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode) else iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; inode->i_mtime = inode->i_atime = inode->i_ctime = - iinfo->i_crtime = current_fs_time(inode->i_sb); + iinfo->i_crtime = current_time(inode); if (unlikely(insert_inode_locked(inode) < 0)) { make_bad_inode(inode); iput(inode); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 55aa587..aad4640 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -886,7 +886,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, *new = 1; iinfo->i_next_alloc_block = block; iinfo->i_next_alloc_goal = newblocknum; - inode->i_ctime = current_fs_time(inode->i_sb); + inode->i_ctime = current_time(inode); if (IS_SYNC(inode)) udf_sync_inode(inode); @@ -1268,7 +1268,7 @@ set_size: up_write(&iinfo->i_data_sem); } update_time: - inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); + inode->i_mtime = inode->i_ctime = current_time(inode); if (IS_SYNC(inode)) udf_sync_inode(inode); else diff --git a/fs/udf/namei.c b/fs/udf/namei.c index c3e5c96..2d65e28 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -616,7 +616,7 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode) *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL); udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); - dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb); + dir->i_ctime = dir->i_mtime = current_time(dir); mark_inode_dirty(dir); if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); @@ -730,7 +730,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) cfi.fileCharacteristics |= FID_FILE_CHAR_DIRECTORY; udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); inc_nlink(dir); - dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb); + dir->i_ctime = dir->i_mtime = current_time(dir); mark_inode_dirty(dir); unlock_new_inode(inode); d_instantiate(dentry, inode); @@ -845,7 +845,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) inode->i_size = 0; inode_dec_link_count(dir); inode->i_ctime = dir->i_ctime = dir->i_mtime = - current_fs_time(dir->i_sb); + current_time(inode); mark_inode_dirty(dir); end_rmdir: @@ -888,7 +888,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) retval = udf_delete_entry(dir, fi, &fibh, &cfi); if (retval) goto end_unlink; - dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb); + dir->i_ctime = dir->i_mtime = current_time(dir); mark_inode_dirty(dir); inode_dec_link_count(inode); inode->i_ctime = dir->i_ctime; @@ -1079,9 +1079,9 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, brelse(fibh.ebh); brelse(fibh.sbh); inc_nlink(inode); - inode->i_ctime = current_fs_time(inode->i_sb); + inode->i_ctime = current_time(inode); mark_inode_dirty(inode); - dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb); + dir->i_ctime = dir->i_mtime = current_time(dir); mark_inode_dirty(dir); ihold(inode); d_instantiate(dentry, inode); @@ -1093,7 +1093,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, * higher-level routines. */ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); @@ -1105,6 +1106,9 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, struct kernel_lb_addr tloc; struct udf_inode_info *old_iinfo = UDF_I(old_inode); + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); if (IS_ERR(ofi)) { retval = PTR_ERR(ofi); @@ -1172,7 +1176,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old_inode->i_ctime = current_fs_time(old_inode->i_sb); + old_inode->i_ctime = current_time(old_inode); mark_inode_dirty(old_inode); /* @@ -1188,11 +1192,11 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, udf_delete_entry(old_dir, ofi, &ofibh, &ocfi); if (new_inode) { - new_inode->i_ctime = current_fs_time(new_inode->i_sb); + new_inode->i_ctime = current_time(new_inode); inode_dec_link_count(new_inode); } - old_dir->i_ctime = old_dir->i_mtime = current_fs_time(old_dir->i_sb); - new_dir->i_ctime = new_dir->i_mtime = current_fs_time(new_dir->i_sb); + old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); + new_dir->i_ctime = new_dir->i_mtime = current_time(new_dir); mark_inode_dirty(old_dir); mark_inode_dirty(new_dir); diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index fa3bda1..de01b8f 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -100,7 +100,7 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, err = ufs_commit_chunk(page, pos, len); ufs_put_page(page); if (update_times) - dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + dir->i_mtime = dir->i_ctime = current_time(dir); mark_inode_dirty(dir); } @@ -389,7 +389,7 @@ got_it: ufs_set_de_type(sb, de, inode->i_mode); err = ufs_commit_chunk(page, pos, rec_len); - dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + dir->i_mtime = dir->i_ctime = current_time(dir); mark_inode_dirty(dir); /* OFFSET_CACHE */ @@ -530,7 +530,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, pde->d_reclen = cpu_to_fs16(sb, to - from); dir->d_ino = 0; err = ufs_commit_chunk(page, pos, to - from); - inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + inode->i_ctime = inode->i_mtime = current_time(inode); mark_inode_dirty(inode); out: ufs_put_page(page); diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index fd0203c..9774555 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -290,7 +290,7 @@ cg_found: inode_init_owner(inode, dir, mode); inode->i_blocks = 0; inode->i_generation = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); ufsi->i_flags = UFS_I(dir)->i_flags; ufsi->i_lastfrag = 0; ufsi->i_shadow = 0; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 9f49431..190d64b 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -293,7 +293,7 @@ ufs_inode_getfrag(struct inode *inode, unsigned index, if (new) *new = 1; - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); if (IS_SYNC(inode)) ufs_sync_inode (inode); mark_inode_dirty(inode); @@ -375,7 +375,7 @@ ufs_inode_getblock(struct inode *inode, u64 ind_block, mark_buffer_dirty(bh); if (IS_SYNC(inode)) sync_dirty_buffer(bh); - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); mark_inode_dirty(inode); out: brelse (bh); @@ -1185,7 +1185,7 @@ static int ufs_truncate(struct inode *inode, loff_t size) truncate_setsize(inode, size); __ufs_truncate_blocks(inode); - inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_mtime = inode->i_ctime = current_time(inode); mark_inode_dirty(inode); out: UFSD("EXIT: err %d\n", err); @@ -1208,7 +1208,7 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr) unsigned int ia_valid = attr->ia_valid; int error; - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); if (error) return error; diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index a1559f7..8eca4ed 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -153,7 +153,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, struct inode *inode = d_inode(old_dentry); int error; - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); inode_inc_link_count(inode); ihold(inode); @@ -245,7 +245,8 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry) } static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); @@ -255,6 +256,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, struct ufs_dir_entry *old_de; int err = -ENOENT; + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_de) goto out; @@ -279,7 +283,7 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!new_de) goto out_dir; ufs_set_link(new_dir, new_de, new_page, old_inode, 1); - new_inode->i_ctime = CURRENT_TIME_SEC; + new_inode->i_ctime = current_time(new_inode); if (dir_de) drop_nlink(new_inode); inode_dec_link_count(new_inode); @@ -295,7 +299,7 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old_inode->i_ctime = CURRENT_TIME_SEC; + old_inode->i_ctime = current_time(old_inode); ufs_delete_entry(old_dir, old_de, old_page); mark_inode_dirty(old_inode); diff --git a/fs/utimes.c b/fs/utimes.c index 794f5f5..22307cd 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -81,27 +81,13 @@ static int utimes_common(struct path *path, struct timespec *times) newattrs.ia_valid |= ATTR_MTIME_SET; } /* - * Tell inode_change_ok(), that this is an explicit time + * Tell setattr_prepare(), that this is an explicit time * update, even if neither ATTR_ATIME_SET nor ATTR_MTIME_SET * were used. */ newattrs.ia_valid |= ATTR_TIMES_SET; } else { - /* - * If times is NULL (or both times are UTIME_NOW), - * then we need to check permissions, because - * inode_change_ok() won't do it. - */ - error = -EPERM; - if (IS_IMMUTABLE(inode)) - goto mnt_drop_write_and_out; - - error = -EACCES; - if (!inode_owner_or_capable(inode)) { - error = inode_permission(inode, MAY_WRITE); - if (error) - goto mnt_drop_write_and_out; - } + newattrs.ia_valid |= ATTR_TOUCH; } retry_deleg: inode_lock(inode); @@ -113,7 +99,6 @@ retry_deleg: goto retry_deleg; } -mnt_drop_write_and_out: mnt_drop_write(path->mnt); out: return error; @@ -24,6 +24,59 @@ #include <asm/uaccess.h> +static const char * +strcmp_prefix(const char *a, const char *a_prefix) +{ + while (*a_prefix && *a == *a_prefix) { + a++; + a_prefix++; + } + return *a_prefix ? NULL : a; +} + +/* + * In order to implement different sets of xattr operations for each xattr + * prefix, a filesystem should create a null-terminated array of struct + * xattr_handler (one for each prefix) and hang a pointer to it off of the + * s_xattr field of the superblock. + */ +#define for_each_xattr_handler(handlers, handler) \ + if (handlers) \ + for ((handler) = *(handlers)++; \ + (handler) != NULL; \ + (handler) = *(handlers)++) + +/* + * Find the xattr_handler with the matching prefix. + */ +static const struct xattr_handler * +xattr_resolve_name(struct inode *inode, const char **name) +{ + const struct xattr_handler **handlers = inode->i_sb->s_xattr; + const struct xattr_handler *handler; + + if (!(inode->i_opflags & IOP_XATTR)) { + if (unlikely(is_bad_inode(inode))) + return ERR_PTR(-EIO); + return ERR_PTR(-EOPNOTSUPP); + } + for_each_xattr_handler(handlers, handler) { + const char *n; + + n = strcmp_prefix(*name, xattr_prefix(handler)); + if (n) { + if (!handler->prefix ^ !*n) { + if (*n) + continue; + return ERR_PTR(-EINVAL); + } + *name = n; + return handler; + } + } + return ERR_PTR(-EOPNOTSUPP); +} + /* * Check permissions for extended attribute access. This is a bit complicated * because different namespaces have very different rules. @@ -80,6 +133,23 @@ xattr_permission(struct inode *inode, const char *name, int mask) return inode_permission(inode, mask); } +int +__vfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + const struct xattr_handler *handler; + + handler = xattr_resolve_name(inode, &name); + if (IS_ERR(handler)) + return PTR_ERR(handler); + if (!handler->set) + return -EOPNOTSUPP; + if (size == 0) + value = ""; /* empty EA, do not remove */ + return handler->set(handler, dentry, inode, name, value, size, flags); +} +EXPORT_SYMBOL(__vfs_setxattr); + /** * __vfs_setxattr_noperm - perform setxattr operation without performing * permission checks. @@ -106,8 +176,8 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, if (issec) inode->i_flags &= ~S_NOSEC; - if (inode->i_op->setxattr) { - error = inode->i_op->setxattr(dentry, inode, name, value, size, flags); + if (inode->i_opflags & IOP_XATTR) { + error = __vfs_setxattr(dentry, inode, name, value, size, flags); if (!error) { fsnotify_xattr(dentry); security_inode_post_setxattr(dentry, name, value, @@ -115,6 +185,9 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, } } else if (issec) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; + + if (unlikely(is_bad_inode(inode))) + return -EIO; error = security_inode_setsecurity(inode, suffix, value, size, flags); if (!error) @@ -188,6 +261,7 @@ ssize_t vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value, size_t xattr_size, gfp_t flags) { + const struct xattr_handler *handler; struct inode *inode = dentry->d_inode; char *value = *xattr_value; int error; @@ -196,10 +270,12 @@ vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value, if (error) return error; - if (!inode->i_op->getxattr) + handler = xattr_resolve_name(inode, &name); + if (IS_ERR(handler)) + return PTR_ERR(handler); + if (!handler->get) return -EOPNOTSUPP; - - error = inode->i_op->getxattr(dentry, inode, name, NULL, 0); + error = handler->get(handler, dentry, inode, name, NULL, 0); if (error < 0) return error; @@ -210,12 +286,27 @@ vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value, memset(value, 0, error + 1); } - error = inode->i_op->getxattr(dentry, inode, name, value, error); + error = handler->get(handler, dentry, inode, name, value, error); *xattr_value = value; return error; } ssize_t +__vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name, + void *value, size_t size) +{ + const struct xattr_handler *handler; + + handler = xattr_resolve_name(inode, &name); + if (IS_ERR(handler)) + return PTR_ERR(handler); + if (!handler->get) + return -EOPNOTSUPP; + return handler->get(handler, dentry, inode, name, value, size); +} +EXPORT_SYMBOL(__vfs_getxattr); + +ssize_t vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) { struct inode *inode = dentry->d_inode; @@ -242,28 +333,24 @@ vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) return ret; } nolsm: - if (inode->i_op->getxattr) - error = inode->i_op->getxattr(dentry, inode, name, value, size); - else - error = -EOPNOTSUPP; - - return error; + return __vfs_getxattr(dentry, inode, name, value, size); } EXPORT_SYMBOL_GPL(vfs_getxattr); ssize_t -vfs_listxattr(struct dentry *d, char *list, size_t size) +vfs_listxattr(struct dentry *dentry, char *list, size_t size) { + struct inode *inode = d_inode(dentry); ssize_t error; - error = security_inode_listxattr(d); + error = security_inode_listxattr(dentry); if (error) return error; - error = -EOPNOTSUPP; - if (d->d_inode->i_op->listxattr) { - error = d->d_inode->i_op->listxattr(d, list, size); + if (inode->i_op->listxattr && (inode->i_opflags & IOP_XATTR)) { + error = -EOPNOTSUPP; + error = inode->i_op->listxattr(dentry, list, size); } else { - error = security_inode_listsecurity(d->d_inode, list, size); + error = security_inode_listsecurity(inode, list, size); if (size && error > size) error = -ERANGE; } @@ -272,14 +359,26 @@ vfs_listxattr(struct dentry *d, char *list, size_t size) EXPORT_SYMBOL_GPL(vfs_listxattr); int +__vfs_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = d_inode(dentry); + const struct xattr_handler *handler; + + handler = xattr_resolve_name(inode, &name); + if (IS_ERR(handler)) + return PTR_ERR(handler); + if (!handler->set) + return -EOPNOTSUPP; + return handler->set(handler, dentry, inode, name, NULL, 0, XATTR_REPLACE); +} +EXPORT_SYMBOL(__vfs_removexattr); + +int vfs_removexattr(struct dentry *dentry, const char *name) { struct inode *inode = dentry->d_inode; int error; - if (!inode->i_op->removexattr) - return -EOPNOTSUPP; - error = xattr_permission(inode, name, MAY_WRITE); if (error) return error; @@ -289,7 +388,7 @@ vfs_removexattr(struct dentry *dentry, const char *name) if (error) goto out; - error = inode->i_op->removexattr(dentry, name); + error = __vfs_removexattr(dentry, name); if (!error) { fsnotify_xattr(dentry); @@ -641,76 +740,6 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) return error; } - -static const char * -strcmp_prefix(const char *a, const char *a_prefix) -{ - while (*a_prefix && *a == *a_prefix) { - a++; - a_prefix++; - } - return *a_prefix ? NULL : a; -} - -/* - * In order to implement different sets of xattr operations for each xattr - * prefix with the generic xattr API, a filesystem should create a - * null-terminated array of struct xattr_handler (one for each prefix) and - * hang a pointer to it off of the s_xattr field of the superblock. - * - * The generic_fooxattr() functions will use this list to dispatch xattr - * operations to the correct xattr_handler. - */ -#define for_each_xattr_handler(handlers, handler) \ - if (handlers) \ - for ((handler) = *(handlers)++; \ - (handler) != NULL; \ - (handler) = *(handlers)++) - -/* - * Find the xattr_handler with the matching prefix. - */ -static const struct xattr_handler * -xattr_resolve_name(const struct xattr_handler **handlers, const char **name) -{ - const struct xattr_handler *handler; - - if (!*name) - return ERR_PTR(-EINVAL); - - for_each_xattr_handler(handlers, handler) { - const char *n; - - n = strcmp_prefix(*name, xattr_prefix(handler)); - if (n) { - if (!handler->prefix ^ !*n) { - if (*n) - continue; - return ERR_PTR(-EINVAL); - } - *name = n; - return handler; - } - } - return ERR_PTR(-EOPNOTSUPP); -} - -/* - * Find the handler for the prefix and dispatch its get() operation. - */ -ssize_t -generic_getxattr(struct dentry *dentry, struct inode *inode, - const char *name, void *buffer, size_t size) -{ - const struct xattr_handler *handler; - - handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); - if (IS_ERR(handler)) - return PTR_ERR(handler); - return handler->get(handler, dentry, inode, - name, buffer, size); -} - /* * Combine the results of the list() operation from every xattr_handler in the * list. @@ -747,44 +776,7 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) } return size; } - -/* - * Find the handler for the prefix and dispatch its set() operation. - */ -int -generic_setxattr(struct dentry *dentry, struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - const struct xattr_handler *handler; - - if (size == 0) - value = ""; /* empty EA, do not remove */ - handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); - if (IS_ERR(handler)) - return PTR_ERR(handler); - return handler->set(handler, dentry, inode, name, value, size, flags); -} - -/* - * Find the handler for the prefix and dispatch its set() operation to remove - * any associated extended attribute. - */ -int -generic_removexattr(struct dentry *dentry, const char *name) -{ - const struct xattr_handler *handler; - - handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); - if (IS_ERR(handler)) - return PTR_ERR(handler); - return handler->set(handler, dentry, d_inode(dentry), name, NULL, - 0, XATTR_REPLACE); -} - -EXPORT_SYMBOL(generic_getxattr); EXPORT_SYMBOL(generic_listxattr); -EXPORT_SYMBOL(generic_setxattr); -EXPORT_SYMBOL(generic_removexattr); /** * xattr_full_name - Compute full attribute name from suffix diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index fc593c8..26ef195 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -52,8 +52,11 @@ xfs-y += $(addprefix libxfs/, \ xfs_inode_fork.o \ xfs_inode_buf.o \ xfs_log_rlimit.o \ + xfs_ag_resv.o \ xfs_rmap.o \ xfs_rmap_btree.o \ + xfs_refcount.o \ + xfs_refcount_btree.o \ xfs_sb.o \ xfs_symlink_remote.o \ xfs_trans_resv.o \ @@ -87,6 +90,7 @@ xfs-y += xfs_aops.o \ xfs_message.o \ xfs_mount.o \ xfs_mru_cache.o \ + xfs_reflink.o \ xfs_stats.o \ xfs_super.o \ xfs_symlink.o \ @@ -99,16 +103,20 @@ xfs-y += xfs_aops.o \ # low-level transaction/log code xfs-y += xfs_log.o \ xfs_log_cil.o \ + xfs_bmap_item.o \ xfs_buf_item.o \ xfs_extfree_item.o \ xfs_icreate_item.o \ xfs_inode_item.o \ + xfs_refcount_item.o \ xfs_rmap_item.o \ xfs_log_recover.o \ xfs_trans_ail.o \ + xfs_trans_bmap.o \ xfs_trans_buf.o \ xfs_trans_extfree.o \ xfs_trans_inode.o \ + xfs_trans_refcount.o \ xfs_trans_rmap.o \ # optional features diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c new file mode 100644 index 0000000..e5ebc37 --- /dev/null +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -0,0 +1,338 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" +#include "xfs_bit.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_ag_resv.h" +#include "xfs_trans_space.h" +#include "xfs_rmap_btree.h" +#include "xfs_btree.h" +#include "xfs_refcount_btree.h" + +/* + * Per-AG Block Reservations + * + * For some kinds of allocation group metadata structures, it is advantageous + * to reserve a small number of blocks in each AG so that future expansions of + * that data structure do not encounter ENOSPC because errors during a btree + * split cause the filesystem to go offline. + * + * Prior to the introduction of reflink, this wasn't an issue because the free + * space btrees maintain a reserve of space (the AGFL) to handle any expansion + * that may be necessary; and allocations of other metadata (inodes, BMBT, + * dir/attr) aren't restricted to a single AG. However, with reflink it is + * possible to allocate all the space in an AG, have subsequent reflink/CoW + * activity expand the refcount btree, and discover that there's no space left + * to handle that expansion. Since we can calculate the maximum size of the + * refcount btree, we can reserve space for it and avoid ENOSPC. + * + * Handling per-AG reservations consists of three changes to the allocator's + * behavior: First, because these reservations are always needed, we decrease + * the ag_max_usable counter to reflect the size of the AG after the reserved + * blocks are taken. Second, the reservations must be reflected in the + * fdblocks count to maintain proper accounting. Third, each AG must maintain + * its own reserved block counter so that we can calculate the amount of space + * that must remain free to maintain the reservations. Fourth, the "remaining + * reserved blocks" count must be used when calculating the length of the + * longest free extent in an AG and to clamp maxlen in the per-AG allocation + * functions. In other words, we maintain a virtual allocation via in-core + * accounting tricks so that we don't have to clean up after a crash. :) + * + * Reserved blocks can be managed by passing one of the enum xfs_ag_resv_type + * values via struct xfs_alloc_arg or directly to the xfs_free_extent + * function. It might seem a little funny to maintain a reservoir of blocks + * to feed another reservoir, but the AGFL only holds enough blocks to get + * through the next transaction. The per-AG reservation is to ensure (we + * hope) that each AG never runs out of blocks. Each data structure wanting + * to use the reservation system should update ask/used in xfs_ag_resv_init. + */ + +/* + * Are we critically low on blocks? For now we'll define that as the number + * of blocks we can get our hands on being less than 10% of what we reserved + * or less than some arbitrary number (maximum btree height). + */ +bool +xfs_ag_resv_critical( + struct xfs_perag *pag, + enum xfs_ag_resv_type type) +{ + xfs_extlen_t avail; + xfs_extlen_t orig; + + switch (type) { + case XFS_AG_RESV_METADATA: + avail = pag->pagf_freeblks - pag->pag_agfl_resv.ar_reserved; + orig = pag->pag_meta_resv.ar_asked; + break; + case XFS_AG_RESV_AGFL: + avail = pag->pagf_freeblks + pag->pagf_flcount - + pag->pag_meta_resv.ar_reserved; + orig = pag->pag_agfl_resv.ar_asked; + break; + default: + ASSERT(0); + return false; + } + + trace_xfs_ag_resv_critical(pag, type, avail); + + /* Critically low if less than 10% or max btree height remains. */ + return XFS_TEST_ERROR(avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS, + pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL, + XFS_RANDOM_AG_RESV_CRITICAL); +} + +/* + * How many blocks are reserved but not used, and therefore must not be + * allocated away? + */ +xfs_extlen_t +xfs_ag_resv_needed( + struct xfs_perag *pag, + enum xfs_ag_resv_type type) +{ + xfs_extlen_t len; + + len = pag->pag_meta_resv.ar_reserved + pag->pag_agfl_resv.ar_reserved; + switch (type) { + case XFS_AG_RESV_METADATA: + case XFS_AG_RESV_AGFL: + len -= xfs_perag_resv(pag, type)->ar_reserved; + break; + case XFS_AG_RESV_NONE: + /* empty */ + break; + default: + ASSERT(0); + } + + trace_xfs_ag_resv_needed(pag, type, len); + + return len; +} + +/* Clean out a reservation */ +static int +__xfs_ag_resv_free( + struct xfs_perag *pag, + enum xfs_ag_resv_type type) +{ + struct xfs_ag_resv *resv; + xfs_extlen_t oldresv; + int error; + + trace_xfs_ag_resv_free(pag, type, 0); + + resv = xfs_perag_resv(pag, type); + pag->pag_mount->m_ag_max_usable += resv->ar_asked; + /* + * AGFL blocks are always considered "free", so whatever + * was reserved at mount time must be given back at umount. + */ + if (type == XFS_AG_RESV_AGFL) + oldresv = resv->ar_orig_reserved; + else + oldresv = resv->ar_reserved; + error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true); + resv->ar_reserved = 0; + resv->ar_asked = 0; + + if (error) + trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno, + error, _RET_IP_); + return error; +} + +/* Free a per-AG reservation. */ +int +xfs_ag_resv_free( + struct xfs_perag *pag) +{ + int error; + int err2; + + error = __xfs_ag_resv_free(pag, XFS_AG_RESV_AGFL); + err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA); + if (err2 && !error) + error = err2; + return error; +} + +static int +__xfs_ag_resv_init( + struct xfs_perag *pag, + enum xfs_ag_resv_type type, + xfs_extlen_t ask, + xfs_extlen_t used) +{ + struct xfs_mount *mp = pag->pag_mount; + struct xfs_ag_resv *resv; + int error; + + resv = xfs_perag_resv(pag, type); + if (used > ask) + ask = used; + resv->ar_asked = ask; + resv->ar_reserved = resv->ar_orig_reserved = ask - used; + mp->m_ag_max_usable -= ask; + + trace_xfs_ag_resv_init(pag, type, ask); + + error = xfs_mod_fdblocks(mp, -(int64_t)resv->ar_reserved, true); + if (error) + trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno, + error, _RET_IP_); + + return error; +} + +/* Create a per-AG block reservation. */ +int +xfs_ag_resv_init( + struct xfs_perag *pag) +{ + xfs_extlen_t ask; + xfs_extlen_t used; + int error = 0; + + /* Create the metadata reservation. */ + if (pag->pag_meta_resv.ar_asked == 0) { + ask = used = 0; + + error = xfs_refcountbt_calc_reserves(pag->pag_mount, + pag->pag_agno, &ask, &used); + if (error) + goto out; + + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, + ask, used); + if (error) + goto out; + } + + /* Create the AGFL metadata reservation */ + if (pag->pag_agfl_resv.ar_asked == 0) { + ask = used = 0; + + error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno, + &ask, &used); + if (error) + goto out; + + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_AGFL, ask, used); + if (error) + goto out; + } + +out: + return error; +} + +/* Allocate a block from the reservation. */ +void +xfs_ag_resv_alloc_extent( + struct xfs_perag *pag, + enum xfs_ag_resv_type type, + struct xfs_alloc_arg *args) +{ + struct xfs_ag_resv *resv; + xfs_extlen_t len; + uint field; + + trace_xfs_ag_resv_alloc_extent(pag, type, args->len); + + switch (type) { + case XFS_AG_RESV_METADATA: + case XFS_AG_RESV_AGFL: + resv = xfs_perag_resv(pag, type); + break; + default: + ASSERT(0); + /* fall through */ + case XFS_AG_RESV_NONE: + field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS : + XFS_TRANS_SB_FDBLOCKS; + xfs_trans_mod_sb(args->tp, field, -(int64_t)args->len); + return; + } + + len = min_t(xfs_extlen_t, args->len, resv->ar_reserved); + resv->ar_reserved -= len; + if (type == XFS_AG_RESV_AGFL) + return; + /* Allocations of reserved blocks only need on-disk sb updates... */ + xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len); + /* ...but non-reserved blocks need in-core and on-disk updates. */ + if (args->len > len) + xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS, + -((int64_t)args->len - len)); +} + +/* Free a block to the reservation. */ +void +xfs_ag_resv_free_extent( + struct xfs_perag *pag, + enum xfs_ag_resv_type type, + struct xfs_trans *tp, + xfs_extlen_t len) +{ + xfs_extlen_t leftover; + struct xfs_ag_resv *resv; + + trace_xfs_ag_resv_free_extent(pag, type, len); + + switch (type) { + case XFS_AG_RESV_METADATA: + case XFS_AG_RESV_AGFL: + resv = xfs_perag_resv(pag, type); + break; + default: + ASSERT(0); + /* fall through */ + case XFS_AG_RESV_NONE: + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len); + return; + } + + leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved); + resv->ar_reserved += leftover; + if (type == XFS_AG_RESV_AGFL) + return; + /* Freeing into the reserved pool only requires on-disk update... */ + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len); + /* ...but freeing beyond that requires in-core and on-disk update. */ + if (len > leftover) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len - leftover); +} diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h new file mode 100644 index 0000000..8d6c687 --- /dev/null +++ b/fs/xfs/libxfs/xfs_ag_resv.h @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_AG_RESV_H__ +#define __XFS_AG_RESV_H__ + +int xfs_ag_resv_free(struct xfs_perag *pag); +int xfs_ag_resv_init(struct xfs_perag *pag); + +bool xfs_ag_resv_critical(struct xfs_perag *pag, enum xfs_ag_resv_type type); +xfs_extlen_t xfs_ag_resv_needed(struct xfs_perag *pag, + enum xfs_ag_resv_type type); + +void xfs_ag_resv_alloc_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type, + struct xfs_alloc_arg *args); +void xfs_ag_resv_free_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type, + struct xfs_trans *tp, xfs_extlen_t len); + +#endif /* __XFS_AG_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 05b5243..effb64c 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -37,6 +37,7 @@ #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_log.h" +#include "xfs_ag_resv.h" struct workqueue_struct *xfs_alloc_wq; @@ -51,10 +52,23 @@ STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); +unsigned int +xfs_refc_block( + struct xfs_mount *mp) +{ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + return XFS_RMAP_BLOCK(mp) + 1; + if (xfs_sb_version_hasfinobt(&mp->m_sb)) + return XFS_FIBT_BLOCK(mp) + 1; + return XFS_IBT_BLOCK(mp) + 1; +} + xfs_extlen_t xfs_prealloc_blocks( struct xfs_mount *mp) { + if (xfs_sb_version_hasreflink(&mp->m_sb)) + return xfs_refc_block(mp) + 1; if (xfs_sb_version_hasrmapbt(&mp->m_sb)) return XFS_RMAP_BLOCK(mp) + 1; if (xfs_sb_version_hasfinobt(&mp->m_sb)) @@ -74,14 +88,8 @@ xfs_prealloc_blocks( * extents need to be actually allocated. To get around this, we explicitly set * aside a few blocks which will not be reserved in delayed allocation. * - * When rmap is disabled, we need to reserve 4 fsbs _per AG_ for the freelist - * and 4 more to handle a potential split of the file's bmap btree. - * - * When rmap is enabled, we must also be able to handle two rmap btree inserts - * to record both the file data extent and a new bmbt block. The bmbt block - * might not be in the same AG as the file data extent. In the worst case - * the bmap btree splits multiple levels and all the new blocks come from - * different AGs, so set aside enough to handle rmap btree splits in all AGs. + * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a + * potential split of the file's bmap btree. */ unsigned int xfs_alloc_set_aside( @@ -90,8 +98,6 @@ xfs_alloc_set_aside( unsigned int blocks; blocks = 4 + (mp->m_sb.sb_agcount * XFS_ALLOC_AGFL_RESERVE); - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) - blocks += mp->m_sb.sb_agcount * mp->m_rmap_maxlevels; return blocks; } @@ -122,6 +128,8 @@ xfs_alloc_ag_max_usable( blocks++; /* finobt root block */ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) blocks++; /* rmap root block */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) + blocks++; /* refcount root block */ return mp->m_sb.sb_agblocks - blocks; } @@ -265,7 +273,7 @@ xfs_alloc_compute_diff( xfs_agblock_t wantbno, /* target starting block */ xfs_extlen_t wantlen, /* target length */ xfs_extlen_t alignment, /* target alignment */ - char userdata, /* are we allocating data? */ + int datatype, /* are we allocating data? */ xfs_agblock_t freebno, /* freespace's starting block */ xfs_extlen_t freelen, /* freespace's length */ xfs_agblock_t *newbnop) /* result: best start block from free */ @@ -276,6 +284,7 @@ xfs_alloc_compute_diff( xfs_extlen_t newlen1=0; /* length with newbno1 */ xfs_extlen_t newlen2=0; /* length with newbno2 */ xfs_agblock_t wantend; /* end of target extent */ + bool userdata = xfs_alloc_is_userdata(datatype); ASSERT(freelen >= wantlen); freeend = freebno + freelen; @@ -680,12 +689,29 @@ xfs_alloc_ag_vextent( xfs_alloc_arg_t *args) /* argument structure for allocation */ { int error=0; + xfs_extlen_t reservation; + xfs_extlen_t oldmax; ASSERT(args->minlen > 0); ASSERT(args->maxlen > 0); ASSERT(args->minlen <= args->maxlen); ASSERT(args->mod < args->prod); ASSERT(args->alignment > 0); + + /* + * Clamp maxlen to the amount of free space minus any reservations + * that have been made. + */ + oldmax = args->maxlen; + reservation = xfs_ag_resv_needed(args->pag, args->resv); + if (args->maxlen > args->pag->pagf_freeblks - reservation) + args->maxlen = args->pag->pagf_freeblks - reservation; + if (args->maxlen == 0) { + args->agbno = NULLAGBLOCK; + args->maxlen = oldmax; + return 0; + } + /* * Branch to correct routine based on the type. */ @@ -705,12 +731,14 @@ xfs_alloc_ag_vextent( /* NOTREACHED */ } + args->maxlen = oldmax; + if (error || args->agbno == NULLAGBLOCK) return error; ASSERT(args->len >= args->minlen); ASSERT(args->len <= args->maxlen); - ASSERT(!args->wasfromfl || !args->isfl); + ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL); ASSERT(args->agbno % args->alignment == 0); /* if not file data, insert new block into the reverse map btree */ @@ -732,12 +760,7 @@ xfs_alloc_ag_vextent( args->agbno, args->len)); } - if (!args->isfl) { - xfs_trans_mod_sb(args->tp, args->wasdel ? - XFS_TRANS_SB_RES_FDBLOCKS : - XFS_TRANS_SB_FDBLOCKS, - -((long)(args->len))); - } + xfs_ag_resv_alloc_extent(args->pag, args->resv, args); XFS_STATS_INC(args->mp, xs_allocx); XFS_STATS_ADD(args->mp, xs_allocb, args->len); @@ -917,7 +940,7 @@ xfs_alloc_find_best_extent( sdiff = xfs_alloc_compute_diff(args->agbno, args->len, args->alignment, - args->userdata, *sbnoa, + args->datatype, *sbnoa, *slena, &new); /* @@ -1101,7 +1124,7 @@ restart: if (args->len < blen) continue; ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, args->userdata, ltbnoa, + args->alignment, args->datatype, ltbnoa, ltlena, <new); if (ltnew != NULLAGBLOCK && (args->len > blen || ltdiff < bdiff)) { @@ -1254,7 +1277,7 @@ restart: args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, args->userdata, ltbnoa, + args->alignment, args->datatype, ltbnoa, ltlena, <new); error = xfs_alloc_find_best_extent(args, @@ -1271,7 +1294,7 @@ restart: args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); xfs_alloc_fix_len(args); gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, args->userdata, gtbnoa, + args->alignment, args->datatype, gtbnoa, gtlena, >new); error = xfs_alloc_find_best_extent(args, @@ -1331,7 +1354,7 @@ restart: } rlen = args->len; (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, - args->userdata, ltbnoa, ltlena, <new); + args->datatype, ltbnoa, ltlena, <new); ASSERT(ltnew >= ltbno); ASSERT(ltnew + rlen <= ltbnoa + ltlena); ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); @@ -1583,6 +1606,7 @@ xfs_alloc_ag_vextent_small( int *stat) /* status: 0-freelist, 1-normal/none */ { struct xfs_owner_info oinfo; + struct xfs_perag *pag; int error; xfs_agblock_t fbno; xfs_extlen_t flen; @@ -1600,7 +1624,8 @@ xfs_alloc_ag_vextent_small( * to respect minleft even when pulling from the * freelist. */ - else if (args->minlen == 1 && args->alignment == 1 && !args->isfl && + else if (args->minlen == 1 && args->alignment == 1 && + args->resv != XFS_AG_RESV_AGFL && (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) > args->minleft)) { error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0); @@ -1608,9 +1633,9 @@ xfs_alloc_ag_vextent_small( goto error0; if (fbno != NULLAGBLOCK) { xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1, - args->userdata); + xfs_alloc_allow_busy_reuse(args->datatype)); - if (args->userdata) { + if (xfs_alloc_is_userdata(args->datatype)) { xfs_buf_t *bp; bp = xfs_btree_get_bufs(args->mp, args->tp, @@ -1629,13 +1654,18 @@ xfs_alloc_ag_vextent_small( /* * If we're feeding an AGFL block to something that * doesn't live in the free space, we need to clear - * out the OWN_AG rmap. + * out the OWN_AG rmap and add the block back to + * the AGFL per-AG reservation. */ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1, &oinfo); if (error) goto error0; + pag = xfs_perag_get(args->mp, args->agno); + xfs_ag_resv_free_extent(pag, XFS_AG_RESV_AGFL, + args->tp, 1); + xfs_perag_put(pag); *stat = 0; return 0; @@ -1683,7 +1713,7 @@ xfs_free_ag_extent( xfs_agblock_t bno, xfs_extlen_t len, struct xfs_owner_info *oinfo, - int isfl) + enum xfs_ag_resv_type type) { xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */ xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */ @@ -1911,21 +1941,22 @@ xfs_free_ag_extent( */ pag = xfs_perag_get(mp, agno); error = xfs_alloc_update_counters(tp, pag, agbp, len); + xfs_ag_resv_free_extent(pag, type, tp, len); xfs_perag_put(pag); if (error) goto error0; - if (!isfl) - xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len); XFS_STATS_INC(mp, xs_freex); XFS_STATS_ADD(mp, xs_freeb, len); - trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); + trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL, + haveleft, haveright); return 0; error0: - trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1); + trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL, + -1, -1); if (bno_cur) xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); if (cnt_cur) @@ -1950,21 +1981,43 @@ xfs_alloc_compute_maxlevels( } /* - * Find the length of the longest extent in an AG. + * Find the length of the longest extent in an AG. The 'need' parameter + * specifies how much space we're going to need for the AGFL and the + * 'reserved' parameter tells us how many blocks in this AG are reserved for + * other callers. */ xfs_extlen_t xfs_alloc_longest_free_extent( struct xfs_mount *mp, struct xfs_perag *pag, - xfs_extlen_t need) + xfs_extlen_t need, + xfs_extlen_t reserved) { xfs_extlen_t delta = 0; + /* + * If the AGFL needs a recharge, we'll have to subtract that from the + * longest extent. + */ if (need > pag->pagf_flcount) delta = need - pag->pagf_flcount; + /* + * If we cannot maintain others' reservations with space from the + * not-longest freesp extents, we'll have to subtract /that/ from + * the longest extent too. + */ + if (pag->pagf_freeblks - pag->pagf_longest < reserved) + delta += reserved - (pag->pagf_freeblks - pag->pagf_longest); + + /* + * If the longest extent is long enough to satisfy all the + * reservations and AGFL rules in place, we can return this extent. + */ if (pag->pagf_longest > delta) return pag->pagf_longest - delta; + + /* Otherwise, let the caller try for 1 block if there's space. */ return pag->pagf_flcount > 0 || pag->pagf_longest > 0; } @@ -2004,20 +2057,24 @@ xfs_alloc_space_available( { struct xfs_perag *pag = args->pag; xfs_extlen_t longest; + xfs_extlen_t reservation; /* blocks that are still reserved */ int available; if (flags & XFS_ALLOC_FLAG_FREEING) return true; + reservation = xfs_ag_resv_needed(pag, args->resv); + /* do we have enough contiguous free space for the allocation? */ - longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free); + longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free, + reservation); if ((args->minlen + args->alignment + args->minalignslop - 1) > longest) return false; - /* do have enough free space remaining for the allocation? */ + /* do we have enough free space remaining for the allocation? */ available = (int)(pag->pagf_freeblks + pag->pagf_flcount - - min_free - args->total); - if (available < (int)args->minleft) + reservation - min_free - args->total); + if (available < (int)args->minleft || available <= 0) return false; return true; @@ -2058,7 +2115,7 @@ xfs_alloc_fix_freelist( * somewhere else if we are not being asked to try harder at this * point */ - if (pag->pagf_metadata && args->userdata && + if (pag->pagf_metadata && xfs_alloc_is_userdata(args->datatype) && (flags & XFS_ALLOC_FLAG_TRYLOCK)) { ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); goto out_agbp_relse; @@ -2124,7 +2181,7 @@ xfs_alloc_fix_freelist( if (error) goto out_agbp_relse; error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, - &targs.oinfo, 1); + &targs.oinfo, XFS_AG_RESV_AGFL); if (error) goto out_agbp_relse; bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); @@ -2135,7 +2192,7 @@ xfs_alloc_fix_freelist( targs.mp = mp; targs.agbp = agbp; targs.agno = args->agno; - targs.alignment = targs.minlen = targs.prod = targs.isfl = 1; + targs.alignment = targs.minlen = targs.prod = 1; targs.type = XFS_ALLOCTYPE_THIS_AG; targs.pag = pag; error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp); @@ -2146,6 +2203,7 @@ xfs_alloc_fix_freelist( while (pag->pagf_flcount < need) { targs.agbno = 0; targs.maxlen = need - pag->pagf_flcount; + targs.resv = XFS_AG_RESV_AGFL; /* Allocate as many blocks as possible at once. */ error = xfs_alloc_ag_vextent(&targs); @@ -2278,6 +2336,9 @@ xfs_alloc_log_agf( offsetof(xfs_agf_t, agf_btreeblks), offsetof(xfs_agf_t, agf_uuid), offsetof(xfs_agf_t, agf_rmap_blocks), + offsetof(xfs_agf_t, agf_refcount_blocks), + offsetof(xfs_agf_t, agf_refcount_root), + offsetof(xfs_agf_t, agf_refcount_level), /* needed so that we don't log the whole rest of the structure: */ offsetof(xfs_agf_t, agf_spare64), sizeof(xfs_agf_t) @@ -2415,6 +2476,10 @@ xfs_agf_verify( be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length)) return false; + if (xfs_sb_version_hasreflink(&mp->m_sb) && + be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS) + return false; + return true;; } @@ -2535,6 +2600,7 @@ xfs_alloc_read_agf( be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); pag->pagf_levels[XFS_BTNUM_RMAPi] = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); + pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); spin_lock_init(&pag->pagb_lock); pag->pagb_count = 0; pag->pagb_tree = RB_ROOT; @@ -2633,7 +2699,7 @@ xfs_alloc_vextent( * Try near allocation first, then anywhere-in-ag after * the first a.g. fails. */ - if ((args->userdata & XFS_ALLOC_INITIAL_USER_DATA) && + if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) && (mp->m_flags & XFS_MOUNT_32BITINODES)) { args->fsbno = XFS_AGB_TO_FSB(mp, ((mp->m_agfrotor / rotorstep) % @@ -2766,7 +2832,7 @@ xfs_alloc_vextent( #endif /* Zero the extent if we were asked to do so */ - if (args->userdata & XFS_ALLOC_USERDATA_ZERO) { + if (args->datatype & XFS_ALLOC_USERDATA_ZERO) { error = xfs_zero_extent(args->ip, args->fsbno, args->len); if (error) goto error0; @@ -2825,7 +2891,8 @@ xfs_free_extent( struct xfs_trans *tp, /* transaction pointer */ xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len, /* length of extent */ - struct xfs_owner_info *oinfo) /* extent owner */ + struct xfs_owner_info *oinfo, /* extent owner */ + enum xfs_ag_resv_type type) /* block reservation type */ { struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *agbp; @@ -2834,6 +2901,7 @@ xfs_free_extent( int error; ASSERT(len != 0); + ASSERT(type != XFS_AG_RESV_AGFL); if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FREE_EXTENT, @@ -2851,7 +2919,7 @@ xfs_free_extent( agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length), err); - error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, 0); + error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type); if (error) goto err; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 6fe2d6b..7c404a6 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -85,20 +85,33 @@ typedef struct xfs_alloc_arg { xfs_extlen_t len; /* output: actual size of extent */ xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */ xfs_alloctype_t otype; /* original allocation type */ + int datatype; /* mask defining data type treatment */ char wasdel; /* set if allocation was prev delayed */ char wasfromfl; /* set if allocation is from freelist */ - char isfl; /* set if is freelist blocks - !acctg */ - char userdata; /* mask defining userdata treatment */ xfs_fsblock_t firstblock; /* io first block allocated */ struct xfs_owner_info oinfo; /* owner of blocks being allocated */ + enum xfs_ag_resv_type resv; /* block reservation to use */ } xfs_alloc_arg_t; /* - * Defines for userdata + * Defines for datatype */ #define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/ #define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */ #define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */ +#define XFS_ALLOC_NOBUSY (1 << 3)/* Busy extents not allowed */ + +static inline bool +xfs_alloc_is_userdata(int datatype) +{ + return (datatype & ~XFS_ALLOC_NOBUSY) != 0; +} + +static inline bool +xfs_alloc_allow_busy_reuse(int datatype) +{ + return (datatype & XFS_ALLOC_NOBUSY) == 0; +} /* freespace limit calculations */ #define XFS_ALLOC_AGFL_RESERVE 4 @@ -106,7 +119,8 @@ unsigned int xfs_alloc_set_aside(struct xfs_mount *mp); unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp); xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp, - struct xfs_perag *pag, xfs_extlen_t need); + struct xfs_perag *pag, xfs_extlen_t need, + xfs_extlen_t reserved); unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp, struct xfs_perag *pag); @@ -184,7 +198,8 @@ xfs_free_extent( struct xfs_trans *tp, /* transaction pointer */ xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len, /* length of extent */ - struct xfs_owner_info *oinfo);/* extent owner */ + struct xfs_owner_info *oinfo, /* extent owner */ + enum xfs_ag_resv_type type); /* block reservation type */ int /* error */ xfs_alloc_lookup_ge( diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index b060bca..c6eb219 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -47,6 +47,8 @@ #include "xfs_attr_leaf.h" #include "xfs_filestream.h" #include "xfs_rmap.h" +#include "xfs_ag_resv.h" +#include "xfs_refcount.h" kmem_zone_t *xfs_bmap_free_item_zone; @@ -139,7 +141,8 @@ xfs_bmbt_lookup_ge( */ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) { - return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + return whichfork != XFS_COW_FORK && + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && XFS_IFORK_NEXTENTS(ip, whichfork) > XFS_IFORK_MAXEXT(ip, whichfork); } @@ -149,7 +152,8 @@ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) */ static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork) { - return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && + return whichfork != XFS_COW_FORK && + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && XFS_IFORK_NEXTENTS(ip, whichfork) <= XFS_IFORK_MAXEXT(ip, whichfork); } @@ -639,6 +643,7 @@ xfs_bmap_btree_to_extents( mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(whichfork != XFS_COW_FORK); ASSERT(ifp->if_flags & XFS_IFEXTENTS); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); rblock = ifp->if_broot; @@ -705,6 +710,7 @@ xfs_bmap_extents_to_btree( xfs_bmbt_ptr_t *pp; /* root block address pointer */ mp = ip->i_mount; + ASSERT(whichfork != XFS_COW_FORK); ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); @@ -747,6 +753,7 @@ xfs_bmap_extents_to_btree( args.type = XFS_ALLOCTYPE_START_BNO; args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); } else if (dfops->dop_low) { +try_another_ag: args.type = XFS_ALLOCTYPE_START_BNO; args.fsbno = *firstblock; } else { @@ -761,6 +768,21 @@ xfs_bmap_extents_to_btree( xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } + + /* + * During a CoW operation, the allocation and bmbt updates occur in + * different transactions. The mapping code tries to put new bmbt + * blocks near extents being mapped, but the only way to guarantee this + * is if the alloc and the mapping happen in a single transaction that + * has a block reservation. That isn't the case here, so if we run out + * of space we'll try again with another AG. + */ + if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && + args.fsbno == NULLFSBLOCK && + args.type == XFS_ALLOCTYPE_NEAR_BNO) { + dfops->dop_low = true; + goto try_another_ag; + } /* * Allocation can't fail, the space was reserved. */ @@ -836,6 +858,7 @@ xfs_bmap_local_to_extents_empty( { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(whichfork != XFS_COW_FORK); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); ASSERT(ifp->if_bytes == 0); ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); @@ -895,6 +918,7 @@ xfs_bmap_local_to_extents( * file currently fits in an inode. */ if (*firstblock == NULLFSBLOCK) { +try_another_ag: args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino); args.type = XFS_ALLOCTYPE_START_BNO; } else { @@ -907,6 +931,19 @@ xfs_bmap_local_to_extents( if (error) goto done; + /* + * During a CoW operation, the allocation and bmbt updates occur in + * different transactions. The mapping code tries to put new bmbt + * blocks near extents being mapped, but the only way to guarantee this + * is if the alloc and the mapping happen in a single transaction that + * has a block reservation. That isn't the case here, so if we run out + * of space we'll try again with another AG. + */ + if (xfs_sb_version_hasreflink(&ip->i_mount->m_sb) && + args.fsbno == NULLFSBLOCK && + args.type == XFS_ALLOCTYPE_NEAR_BNO) { + goto try_another_ag; + } /* Can't fail, the space was reserved. */ ASSERT(args.fsbno != NULLFSBLOCK); ASSERT(args.len == 1); @@ -1388,7 +1425,7 @@ xfs_bmap_search_multi_extents( * Else, *lastxp will be set to the index of the found * entry; *gotp will contain the entry. */ -STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ +xfs_bmbt_rec_host_t * /* pointer to found extent entry */ xfs_bmap_search_extents( xfs_inode_t *ip, /* incore inode pointer */ xfs_fileoff_t bno, /* block number searched for */ @@ -1669,7 +1706,8 @@ xfs_bmap_one_block( */ STATIC int /* error */ xfs_bmap_add_extent_delay_real( - struct xfs_bmalloca *bma) + struct xfs_bmalloca *bma, + int whichfork) { struct xfs_bmbt_irec *new = &bma->got; int diff; /* temp value */ @@ -1687,11 +1725,14 @@ xfs_bmap_add_extent_delay_real( xfs_filblks_t temp=0; /* value for da_new calculations */ xfs_filblks_t temp2=0;/* value for da_new calculations */ int tmp_rval; /* partial logging flags */ - int whichfork = XFS_DATA_FORK; struct xfs_mount *mp; + xfs_extnum_t *nextents; mp = bma->ip->i_mount; ifp = XFS_IFORK_PTR(bma->ip, whichfork); + ASSERT(whichfork != XFS_ATTR_FORK); + nextents = (whichfork == XFS_COW_FORK ? &bma->ip->i_cnextents : + &bma->ip->i_d.di_nextents); ASSERT(bma->idx >= 0); ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); @@ -1705,6 +1746,9 @@ xfs_bmap_add_extent_delay_real( #define RIGHT r[1] #define PREV r[2] + if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; + /* * Set up a bunch of variables to make the tests simpler. */ @@ -1791,7 +1835,7 @@ xfs_bmap_add_extent_delay_real( trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_iext_remove(bma->ip, bma->idx + 1, 2, state); - bma->ip->i_d.di_nextents--; + (*nextents)--; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -1893,7 +1937,7 @@ xfs_bmap_add_extent_delay_real( xfs_bmbt_set_startblock(ep, new->br_startblock); trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - bma->ip->i_d.di_nextents++; + (*nextents)++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -1963,7 +2007,7 @@ xfs_bmap_add_extent_delay_real( temp = PREV.br_blockcount - new->br_blockcount; xfs_bmbt_set_blockcount(ep, temp); xfs_iext_insert(bma->ip, bma->idx, 1, new, state); - bma->ip->i_d.di_nextents++; + (*nextents)++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2047,7 +2091,7 @@ xfs_bmap_add_extent_delay_real( trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state); - bma->ip->i_d.di_nextents++; + (*nextents)++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2116,7 +2160,7 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_blockcount = temp2; /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state); - bma->ip->i_d.di_nextents++; + (*nextents)++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2214,7 +2258,8 @@ xfs_bmap_add_extent_delay_real( xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork); done: - bma->logflags |= rval; + if (whichfork != XFS_COW_FORK) + bma->logflags |= rval; return error; #undef LEFT #undef RIGHT @@ -2758,6 +2803,7 @@ done: STATIC void xfs_bmap_add_extent_hole_delay( xfs_inode_t *ip, /* incore inode pointer */ + int whichfork, xfs_extnum_t *idx, /* extent number to update/insert */ xfs_bmbt_irec_t *new) /* new data to add to file extents */ { @@ -2769,8 +2815,10 @@ xfs_bmap_add_extent_hole_delay( int state; /* state bits, accessed thru macros */ xfs_filblks_t temp=0; /* temp for indirect calculations */ - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + ifp = XFS_IFORK_PTR(ip, whichfork); state = 0; + if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; ASSERT(isnullstartblock(new->br_startblock)); /* @@ -2788,7 +2836,7 @@ xfs_bmap_add_extent_hole_delay( * Check and set flags if the current (right) segment exists. * If it doesn't exist, we're converting the hole at end-of-file. */ - if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); @@ -2922,6 +2970,7 @@ xfs_bmap_add_extent_hole_real( ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + ASSERT(whichfork != XFS_COW_FORK); XFS_STATS_INC(mp, xs_add_exlist); @@ -3347,7 +3396,8 @@ xfs_bmap_adjacent( mp = ap->ip->i_mount; nullfb = *ap->firstblock == NULLFSBLOCK; - rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata; + rt = XFS_IS_REALTIME_INODE(ap->ip) && + xfs_alloc_is_userdata(ap->datatype); fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); /* * If allocating at eof, and there's a previous real block, @@ -3501,7 +3551,8 @@ xfs_bmap_longest_free_extent( } longest = xfs_alloc_longest_free_extent(mp, pag, - xfs_alloc_min_freelist(mp, pag)); + xfs_alloc_min_freelist(mp, pag), + xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE)); if (*blen < longest) *blen = longest; @@ -3622,7 +3673,7 @@ xfs_bmap_btalloc( { xfs_mount_t *mp; /* mount point structure */ xfs_alloctype_t atype = 0; /* type for allocation routines */ - xfs_extlen_t align; /* minimum allocation alignment */ + xfs_extlen_t align = 0; /* minimum allocation alignment */ xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ xfs_agnumber_t ag; xfs_alloc_arg_t args; @@ -3645,7 +3696,10 @@ xfs_bmap_btalloc( else if (mp->m_dalign) stripe_align = mp->m_dalign; - align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; + if (ap->flags & XFS_BMAPI_COWFORK) + align = xfs_get_cowextsz_hint(ap->ip); + else if (xfs_alloc_is_userdata(ap->datatype)) + align = xfs_get_extsz_hint(ap->ip); if (unlikely(align)) { error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0, ap->eof, 0, ap->conv, @@ -3658,7 +3712,8 @@ xfs_bmap_btalloc( nullfb = *ap->firstblock == NULLFSBLOCK; fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); if (nullfb) { - if (ap->userdata && xfs_inode_is_filestream(ap->ip)) { + if (xfs_alloc_is_userdata(ap->datatype) && + xfs_inode_is_filestream(ap->ip)) { ag = xfs_filestream_lookup_ag(ap->ip); ag = (ag != NULLAGNUMBER) ? ag : 0; ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0); @@ -3698,7 +3753,8 @@ xfs_bmap_btalloc( * enough for the request. If one isn't found, then adjust * the minimum allocation size to the largest space found. */ - if (ap->userdata && xfs_inode_is_filestream(ap->ip)) + if (xfs_alloc_is_userdata(ap->datatype) && + xfs_inode_is_filestream(ap->ip)) error = xfs_bmap_btalloc_filestreams(ap, &args, &blen); else error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); @@ -3781,9 +3837,9 @@ xfs_bmap_btalloc( } args.minleft = ap->minleft; args.wasdel = ap->wasdel; - args.isfl = 0; - args.userdata = ap->userdata; - if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) + args.resv = XFS_AG_RESV_NONE; + args.datatype = ap->datatype; + if (ap->datatype & XFS_ALLOC_USERDATA_ZERO) args.ip = ap->ip; error = xfs_alloc_vextent(&args); @@ -3850,7 +3906,8 @@ xfs_bmap_btalloc( ASSERT(nullfb || fb_agno == args.agno || (ap->dfops->dop_low && fb_agno < args.agno)); ap->length = args.len; - ap->ip->i_d.di_nblocks += args.len; + if (!(ap->flags & XFS_BMAPI_COWFORK)) + ap->ip->i_d.di_nblocks += args.len; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) ap->ip->i_delayed_blks -= args.len; @@ -3870,6 +3927,60 @@ xfs_bmap_btalloc( } /* + * For a remap operation, just "allocate" an extent at the address that the + * caller passed in, and ensure that the AGFL is the right size. The caller + * will then map the "allocated" extent into the file somewhere. + */ +STATIC int +xfs_bmap_remap_alloc( + struct xfs_bmalloca *ap) +{ + struct xfs_trans *tp = ap->tp; + struct xfs_mount *mp = tp->t_mountp; + xfs_agblock_t bno; + struct xfs_alloc_arg args; + int error; + + /* + * validate that the block number is legal - the enables us to detect + * and handle a silent filesystem corruption rather than crashing. + */ + memset(&args, 0, sizeof(struct xfs_alloc_arg)); + args.tp = ap->tp; + args.mp = ap->tp->t_mountp; + bno = *ap->firstblock; + args.agno = XFS_FSB_TO_AGNO(mp, bno); + args.agbno = XFS_FSB_TO_AGBNO(mp, bno); + if (args.agno >= mp->m_sb.sb_agcount || + args.agbno >= mp->m_sb.sb_agblocks) + return -EFSCORRUPTED; + + /* "Allocate" the extent from the range we passed in. */ + trace_xfs_bmap_remap_alloc(ap->ip, *ap->firstblock, ap->length); + ap->blkno = bno; + ap->ip->i_d.di_nblocks += ap->length; + xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); + + /* Fix the freelist, like a real allocator does. */ + args.datatype = ap->datatype; + args.pag = xfs_perag_get(args.mp, args.agno); + ASSERT(args.pag); + + /* + * The freelist fixing code will decline the allocation if + * the size and shape of the free space doesn't allow for + * allocating the extent and updating all the metadata that + * happens during an allocation. We're remapping, not + * allocating, so skip that check by pretending to be freeing. + */ + error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); + xfs_perag_put(args.pag); + if (error) + trace_xfs_bmap_remap_alloc_error(ap->ip, error, _RET_IP_); + return error; +} + +/* * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. * It figures out where to ask the underlying allocator to put the new extent. */ @@ -3877,11 +3988,47 @@ STATIC int xfs_bmap_alloc( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { - if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata) + if (ap->flags & XFS_BMAPI_REMAP) + return xfs_bmap_remap_alloc(ap); + if (XFS_IS_REALTIME_INODE(ap->ip) && + xfs_alloc_is_userdata(ap->datatype)) return xfs_bmap_rtalloc(ap); return xfs_bmap_btalloc(ap); } +/* Trim extent to fit a logical block range. */ +void +xfs_trim_extent( + struct xfs_bmbt_irec *irec, + xfs_fileoff_t bno, + xfs_filblks_t len) +{ + xfs_fileoff_t distance; + xfs_fileoff_t end = bno + len; + + if (irec->br_startoff + irec->br_blockcount <= bno || + irec->br_startoff >= end) { + irec->br_blockcount = 0; + return; + } + + if (irec->br_startoff < bno) { + distance = bno - irec->br_startoff; + if (isnullstartblock(irec->br_startblock)) + irec->br_startblock = DELAYSTARTBLOCK; + if (irec->br_startblock != DELAYSTARTBLOCK && + irec->br_startblock != HOLESTARTBLOCK) + irec->br_startblock += distance; + irec->br_startoff += distance; + irec->br_blockcount -= distance; + } + + if (end < irec->br_startoff + irec->br_blockcount) { + distance = irec->br_startoff + irec->br_blockcount - end; + irec->br_blockcount -= distance; + } +} + /* * Trim the returned map to the required bounds */ @@ -4005,12 +4152,11 @@ xfs_bmapi_read( int error; int eof; int n = 0; - int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; + int whichfork = xfs_bmapi_whichfork(flags); ASSERT(*nmap >= 1); ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE| - XFS_BMAPI_IGSTATE))); + XFS_BMAPI_IGSTATE|XFS_BMAPI_COWFORK))); ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)); if (unlikely(XFS_TEST_ERROR( @@ -4028,6 +4174,16 @@ xfs_bmapi_read( ifp = XFS_IFORK_PTR(ip, whichfork); + /* No CoW fork? Return a hole. */ + if (whichfork == XFS_COW_FORK && !ifp) { + mval->br_startoff = bno; + mval->br_startblock = HOLESTARTBLOCK; + mval->br_blockcount = len; + mval->br_state = XFS_EXT_NORM; + *nmap = 1; + return 0; + } + if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(NULL, ip, whichfork); if (error) @@ -4074,9 +4230,10 @@ xfs_bmapi_read( return 0; } -STATIC int +int xfs_bmapi_reserve_delalloc( struct xfs_inode *ip, + int whichfork, xfs_fileoff_t aoff, xfs_filblks_t len, struct xfs_bmbt_irec *got, @@ -4085,7 +4242,7 @@ xfs_bmapi_reserve_delalloc( int eof) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); xfs_extlen_t alen; xfs_extlen_t indlen; char rt = XFS_IS_REALTIME_INODE(ip); @@ -4097,7 +4254,10 @@ xfs_bmapi_reserve_delalloc( alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); /* Figure out the extent size, adjust alen */ - extsz = xfs_get_extsz_hint(ip); + if (whichfork == XFS_COW_FORK) + extsz = xfs_get_cowextsz_hint(ip); + else + extsz = xfs_get_extsz_hint(ip); if (extsz) { error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof, 1, 0, &aoff, &alen); @@ -4144,7 +4304,7 @@ xfs_bmapi_reserve_delalloc( got->br_startblock = nullstartblock(indlen); got->br_blockcount = alen; got->br_state = XFS_EXT_NORM; - xfs_bmap_add_extent_hole_delay(ip, lastx, got); + xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got); /* * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay @@ -4170,98 +4330,12 @@ out_unreserve_quota: return error; } -/* - * Map file blocks to filesystem blocks, adding delayed allocations as needed. - */ -int -xfs_bmapi_delay( - struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t bno, /* starting file offs. mapped */ - xfs_filblks_t len, /* length to map in file */ - struct xfs_bmbt_irec *mval, /* output: map values */ - int *nmap, /* i/o: mval size/count */ - int flags) /* XFS_BMAPI_... */ -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - struct xfs_bmbt_irec got; /* current file extent record */ - struct xfs_bmbt_irec prev; /* previous file extent record */ - xfs_fileoff_t obno; /* old block number (offset) */ - xfs_fileoff_t end; /* end of mapped file region */ - xfs_extnum_t lastx; /* last useful extent number */ - int eof; /* we've hit the end of extents */ - int n = 0; /* current extent index */ - int error = 0; - - ASSERT(*nmap >= 1); - ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); - ASSERT(!(flags & ~XFS_BMAPI_ENTIRE)); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - if (unlikely(XFS_TEST_ERROR( - (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { - XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp); - return -EFSCORRUPTED; - } - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - - XFS_STATS_INC(mp, xs_blk_mapw); - - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); - if (error) - return error; - } - - xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev); - end = bno + len; - obno = bno; - - while (bno < end && n < *nmap) { - if (eof || got.br_startoff > bno) { - error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got, - &prev, &lastx, eof); - if (error) { - if (n == 0) { - *nmap = 0; - return error; - } - break; - } - } - - /* set up the extent map to return. */ - xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); - xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); - - /* If we're done, stop now. */ - if (bno >= end || n >= *nmap) - break; - - /* Else go on to the next record. */ - prev = got; - if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); - else - eof = 1; - } - - *nmap = n; - return 0; -} - - static int xfs_bmapi_allocate( struct xfs_bmalloca *bma) { struct xfs_mount *mp = bma->ip->i_mount; - int whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; + int whichfork = xfs_bmapi_whichfork(bma->flags); struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); int tmp_logflags = 0; int error; @@ -4287,15 +4361,21 @@ xfs_bmapi_allocate( } /* - * Indicate if this is the first user data in the file, or just any - * user data. And if it is userdata, indicate whether it needs to - * be initialised to zero during allocation. + * Set the data type being allocated. For the data fork, the first data + * in the file is treated differently to all other allocations. For the + * attribute fork, we only need to ensure the allocated range is not on + * the busy list. */ if (!(bma->flags & XFS_BMAPI_METADATA)) { - bma->userdata = (bma->offset == 0) ? - XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; + bma->datatype = XFS_ALLOC_NOBUSY; + if (whichfork == XFS_DATA_FORK) { + if (bma->offset == 0) + bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; + else + bma->datatype |= XFS_ALLOC_USERDATA; + } if (bma->flags & XFS_BMAPI_ZERO) - bma->userdata |= XFS_ALLOC_USERDATA_ZERO; + bma->datatype |= XFS_ALLOC_USERDATA_ZERO; } bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1; @@ -4350,7 +4430,7 @@ xfs_bmapi_allocate( bma->got.br_state = XFS_EXT_UNWRITTEN; if (bma->wasdel) - error = xfs_bmap_add_extent_delay_real(bma); + error = xfs_bmap_add_extent_delay_real(bma, whichfork); else error = xfs_bmap_add_extent_hole_real(bma, whichfork); @@ -4380,8 +4460,7 @@ xfs_bmapi_convert_unwritten( xfs_filblks_t len, int flags) { - int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; + int whichfork = xfs_bmapi_whichfork(flags); struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); int tmp_logflags = 0; int error; @@ -4397,6 +4476,8 @@ xfs_bmapi_convert_unwritten( (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) return 0; + ASSERT(whichfork != XFS_COW_FORK); + /* * Modify (by adding) the state flag, if writing. */ @@ -4503,8 +4584,7 @@ xfs_bmapi_write( orig_mval = mval; orig_nmap = *nmap; #endif - whichfork = (flags & XFS_BMAPI_ATTRFORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; + whichfork = xfs_bmapi_whichfork(flags); ASSERT(*nmap >= 1); ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); @@ -4513,6 +4593,11 @@ xfs_bmapi_write( ASSERT(len > 0); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK); + ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP)); + ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP)); + ASSERT(!(flags & XFS_BMAPI_PREALLOC) || whichfork != XFS_COW_FORK); + ASSERT(!(flags & XFS_BMAPI_CONVERT) || whichfork != XFS_COW_FORK); /* zeroing is for currently only for data extents, not metadata */ ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) != @@ -4565,7 +4650,7 @@ xfs_bmapi_write( bma.tp = tp; bma.ip = ip; bma.total = total; - bma.userdata = 0; + bma.datatype = 0; bma.dfops = dfops; bma.firstblock = firstblock; @@ -4574,6 +4659,14 @@ xfs_bmapi_write( wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); /* + * Make sure we only reflink into a hole. + */ + if (flags & XFS_BMAPI_REMAP) + ASSERT(inhole); + if (flags & XFS_BMAPI_COWFORK) + ASSERT(!inhole); + + /* * First, deal with the hole before the allocated space * that we found, if any. */ @@ -4603,6 +4696,17 @@ xfs_bmapi_write( goto error0; if (bma.blkno == NULLFSBLOCK) break; + + /* + * If this is a CoW allocation, record the data in + * the refcount btree for orphan recovery. + */ + if (whichfork == XFS_COW_FORK) { + error = xfs_refcount_alloc_cow_extent(mp, dfops, + bma.blkno, bma.length); + if (error) + goto error0; + } } /* Deal with the allocated space we found. */ @@ -4755,6 +4859,219 @@ xfs_bmap_split_indlen( return stolen; } +int +xfs_bmap_del_extent_delay( + struct xfs_inode *ip, + int whichfork, + xfs_extnum_t *idx, + struct xfs_bmbt_irec *got, + struct xfs_bmbt_irec *del) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_bmbt_irec new; + int64_t da_old, da_new, da_diff = 0; + xfs_fileoff_t del_endoff, got_endoff; + xfs_filblks_t got_indlen, new_indlen, stolen; + int error = 0, state = 0; + bool isrt; + + XFS_STATS_INC(mp, xs_del_exlist); + + isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); + del_endoff = del->br_startoff + del->br_blockcount; + got_endoff = got->br_startoff + got->br_blockcount; + da_old = startblockval(got->br_startblock); + da_new = 0; + + ASSERT(*idx >= 0); + ASSERT(*idx < ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(del->br_blockcount > 0); + ASSERT(got->br_startoff <= del->br_startoff); + ASSERT(got_endoff >= del_endoff); + + if (isrt) { + int64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount); + + do_div(rtexts, mp->m_sb.sb_rextsize); + xfs_mod_frextents(mp, rtexts); + } + + /* + * Update the inode delalloc counter now and wait to update the + * sb counters as we might have to borrow some blocks for the + * indirect block accounting. + */ + xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del->br_blockcount), 0, + isrt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + ip->i_delayed_blks -= del->br_blockcount; + + if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; + + if (got->br_startoff == del->br_startoff) + state |= BMAP_LEFT_CONTIG; + if (got_endoff == del_endoff) + state |= BMAP_RIGHT_CONTIG; + + switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + /* + * Matches the whole extent. Delete the entry. + */ + xfs_iext_remove(ip, *idx, 1, state); + --*idx; + break; + case BMAP_LEFT_CONTIG: + /* + * Deleting the first part of the extent. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + got->br_startoff = del_endoff; + got->br_blockcount -= del->br_blockcount; + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, + got->br_blockcount), da_old); + got->br_startblock = nullstartblock((int)da_new); + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + break; + case BMAP_RIGHT_CONTIG: + /* + * Deleting the last part of the extent. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + got->br_blockcount = got->br_blockcount - del->br_blockcount; + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, + got->br_blockcount), da_old); + got->br_startblock = nullstartblock((int)da_new); + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + break; + case 0: + /* + * Deleting the middle of the extent. + * + * Distribute the original indlen reservation across the two new + * extents. Steal blocks from the deleted extent if necessary. + * Stealing blocks simply fudges the fdblocks accounting below. + * Warn if either of the new indlen reservations is zero as this + * can lead to delalloc problems. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + + got->br_blockcount = del->br_startoff - got->br_startoff; + got_indlen = xfs_bmap_worst_indlen(ip, got->br_blockcount); + + new.br_blockcount = got_endoff - del_endoff; + new_indlen = xfs_bmap_worst_indlen(ip, new.br_blockcount); + + WARN_ON_ONCE(!got_indlen || !new_indlen); + stolen = xfs_bmap_split_indlen(da_old, &got_indlen, &new_indlen, + del->br_blockcount); + + got->br_startblock = nullstartblock((int)got_indlen); + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + trace_xfs_bmap_post_update(ip, *idx, 0, _THIS_IP_); + + new.br_startoff = del_endoff; + new.br_state = got->br_state; + new.br_startblock = nullstartblock((int)new_indlen); + + ++*idx; + xfs_iext_insert(ip, *idx, 1, &new, state); + + da_new = got_indlen + new_indlen - stolen; + del->br_blockcount -= stolen; + break; + } + + ASSERT(da_old >= da_new); + da_diff = da_old - da_new; + if (!isrt) + da_diff += del->br_blockcount; + if (da_diff) + xfs_mod_fdblocks(mp, da_diff, false); + return error; +} + +void +xfs_bmap_del_extent_cow( + struct xfs_inode *ip, + xfs_extnum_t *idx, + struct xfs_bmbt_irec *got, + struct xfs_bmbt_irec *del) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_bmbt_irec new; + xfs_fileoff_t del_endoff, got_endoff; + int state = BMAP_COWFORK; + + XFS_STATS_INC(mp, xs_del_exlist); + + del_endoff = del->br_startoff + del->br_blockcount; + got_endoff = got->br_startoff + got->br_blockcount; + + ASSERT(*idx >= 0); + ASSERT(*idx < ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(del->br_blockcount > 0); + ASSERT(got->br_startoff <= del->br_startoff); + ASSERT(got_endoff >= del_endoff); + ASSERT(!isnullstartblock(got->br_startblock)); + + if (got->br_startoff == del->br_startoff) + state |= BMAP_LEFT_CONTIG; + if (got_endoff == del_endoff) + state |= BMAP_RIGHT_CONTIG; + + switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + /* + * Matches the whole extent. Delete the entry. + */ + xfs_iext_remove(ip, *idx, 1, state); + --*idx; + break; + case BMAP_LEFT_CONTIG: + /* + * Deleting the first part of the extent. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + got->br_startoff = del_endoff; + got->br_blockcount -= del->br_blockcount; + got->br_startblock = del->br_startblock + del->br_blockcount; + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + break; + case BMAP_RIGHT_CONTIG: + /* + * Deleting the last part of the extent. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + got->br_blockcount -= del->br_blockcount; + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + break; + case 0: + /* + * Deleting the middle of the extent. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + got->br_blockcount = del->br_startoff - got->br_startoff; + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + new.br_startoff = del_endoff; + new.br_blockcount = got_endoff - del_endoff; + new.br_state = got->br_state; + new.br_startblock = del->br_startblock + del->br_blockcount; + + ++*idx; + xfs_iext_insert(ip, *idx, 1, &new, state); + break; + } +} + /* * Called by xfs_bmapi to update file extent records and the btree * after removing space (or undoing a delayed allocation). @@ -4768,7 +5085,8 @@ xfs_bmap_del_extent( xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *del, /* data to remove from extents */ int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ + int whichfork, /* data or attr fork */ + int bflags) /* bmapi flags */ { xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ @@ -4797,6 +5115,8 @@ xfs_bmap_del_extent( if (whichfork == XFS_ATTR_FORK) state |= BMAP_ATTRFORK; + else if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / @@ -4877,6 +5197,7 @@ xfs_bmap_del_extent( /* * Matches the whole extent. Delete the entry. */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_iext_remove(ip, *idx, 1, whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); --*idx; @@ -5060,9 +5381,16 @@ xfs_bmap_del_extent( /* * If we need to, add to list of extents to delete. */ - if (do_fx) - xfs_bmap_add_free(mp, dfops, del->br_startblock, - del->br_blockcount, NULL); + if (do_fx && !(bflags & XFS_BMAPI_REMAP)) { + if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { + error = xfs_refcount_decrease_extent(mp, dfops, del); + if (error) + goto done; + } else + xfs_bmap_add_free(mp, dfops, del->br_startblock, + del->br_blockcount, NULL); + } + /* * Adjust inode # blocks in the file. */ @@ -5071,7 +5399,7 @@ xfs_bmap_del_extent( /* * Adjust quota data. */ - if (qfield) + if (qfield && !(bflags & XFS_BMAPI_REMAP)) xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks); /* @@ -5093,17 +5421,16 @@ done: * *done is set. */ int /* error */ -xfs_bunmapi( +__xfs_bunmapi( xfs_trans_t *tp, /* transaction pointer */ struct xfs_inode *ip, /* incore inode */ xfs_fileoff_t bno, /* starting offset to unmap */ - xfs_filblks_t len, /* length to unmap in file */ + xfs_filblks_t *rlen, /* i/o: amount remaining */ int flags, /* misc flags */ xfs_extnum_t nexts, /* number of extents max */ xfs_fsblock_t *firstblock, /* first allocated block controls a.g. for allocs */ - struct xfs_defer_ops *dfops, /* i/o: list extents to free */ - int *done) /* set if not done yet */ + struct xfs_defer_ops *dfops) /* i/o: deferred updates */ { xfs_btree_cur_t *cur; /* bmap btree cursor */ xfs_bmbt_irec_t del; /* extent being deleted */ @@ -5125,11 +5452,12 @@ xfs_bunmapi( int wasdel; /* was a delayed alloc extent */ int whichfork; /* data or attribute fork */ xfs_fsblock_t sum; + xfs_filblks_t len = *rlen; /* length to unmap in file */ trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); - whichfork = (flags & XFS_BMAPI_ATTRFORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; + whichfork = xfs_bmapi_whichfork(flags); + ASSERT(whichfork != XFS_COW_FORK); ifp = XFS_IFORK_PTR(ip, whichfork); if (unlikely( XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && @@ -5151,7 +5479,7 @@ xfs_bunmapi( return error; nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); if (nextents == 0) { - *done = 1; + *rlen = 0; return 0; } XFS_STATS_INC(mp, xs_blk_unmap); @@ -5396,7 +5724,7 @@ xfs_bunmapi( cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL; error = xfs_bmap_del_extent(ip, tp, &lastx, dfops, cur, &del, - &tmp_logflags, whichfork); + &tmp_logflags, whichfork, flags); logflags |= tmp_logflags; if (error) goto error0; @@ -5422,7 +5750,10 @@ nodelete: extno++; } } - *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; + if (bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0) + *rlen = 0; + else + *rlen = bno - start + 1; /* * Convert to a btree if necessary. @@ -5478,6 +5809,27 @@ error0: return error; } +/* Unmap a range of a file. */ +int +xfs_bunmapi( + xfs_trans_t *tp, + struct xfs_inode *ip, + xfs_fileoff_t bno, + xfs_filblks_t len, + int flags, + xfs_extnum_t nexts, + xfs_fsblock_t *firstblock, + struct xfs_defer_ops *dfops, + int *done) +{ + int error; + + error = __xfs_bunmapi(tp, ip, bno, &len, flags, nexts, firstblock, + dfops); + *done = (len == 0); + return error; +} + /* * Determine whether an extent shift can be accomplished by a merge with the * extent that precedes the target hole of the shift. @@ -6057,3 +6409,146 @@ out: xfs_trans_cancel(tp); return error; } + +/* Deferred mapping is only for real extents in the data fork. */ +static bool +xfs_bmap_is_update_needed( + struct xfs_bmbt_irec *bmap) +{ + return bmap->br_startblock != HOLESTARTBLOCK && + bmap->br_startblock != DELAYSTARTBLOCK; +} + +/* Record a bmap intent. */ +static int +__xfs_bmap_add( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + enum xfs_bmap_intent_type type, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *bmap) +{ + int error; + struct xfs_bmap_intent *bi; + + trace_xfs_bmap_defer(mp, + XFS_FSB_TO_AGNO(mp, bmap->br_startblock), + type, + XFS_FSB_TO_AGBNO(mp, bmap->br_startblock), + ip->i_ino, whichfork, + bmap->br_startoff, + bmap->br_blockcount, + bmap->br_state); + + bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_SLEEP | KM_NOFS); + INIT_LIST_HEAD(&bi->bi_list); + bi->bi_type = type; + bi->bi_owner = ip; + bi->bi_whichfork = whichfork; + bi->bi_bmap = *bmap; + + error = xfs_defer_join(dfops, bi->bi_owner); + if (error) { + kmem_free(bi); + return error; + } + + xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_BMAP, &bi->bi_list); + return 0; +} + +/* Map an extent into a file. */ +int +xfs_bmap_map_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + struct xfs_inode *ip, + struct xfs_bmbt_irec *PREV) +{ + if (!xfs_bmap_is_update_needed(PREV)) + return 0; + + return __xfs_bmap_add(mp, dfops, XFS_BMAP_MAP, ip, + XFS_DATA_FORK, PREV); +} + +/* Unmap an extent out of a file. */ +int +xfs_bmap_unmap_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + struct xfs_inode *ip, + struct xfs_bmbt_irec *PREV) +{ + if (!xfs_bmap_is_update_needed(PREV)) + return 0; + + return __xfs_bmap_add(mp, dfops, XFS_BMAP_UNMAP, ip, + XFS_DATA_FORK, PREV); +} + +/* + * Process one of the deferred bmap operations. We pass back the + * btree cursor to maintain our lock on the bmapbt between calls. + */ +int +xfs_bmap_finish_one( + struct xfs_trans *tp, + struct xfs_defer_ops *dfops, + struct xfs_inode *ip, + enum xfs_bmap_intent_type type, + int whichfork, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + xfs_exntst_t state) +{ + struct xfs_bmbt_irec bmap; + int nimaps = 1; + xfs_fsblock_t firstfsb; + int flags = XFS_BMAPI_REMAP; + int done; + int error = 0; + + bmap.br_startblock = startblock; + bmap.br_startoff = startoff; + bmap.br_blockcount = blockcount; + bmap.br_state = state; + + trace_xfs_bmap_deferred(tp->t_mountp, + XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, + XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), + ip->i_ino, whichfork, startoff, blockcount, state); + + if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK) + return -EFSCORRUPTED; + if (whichfork == XFS_ATTR_FORK) + flags |= XFS_BMAPI_ATTRFORK; + + if (XFS_TEST_ERROR(false, tp->t_mountp, + XFS_ERRTAG_BMAP_FINISH_ONE, + XFS_RANDOM_BMAP_FINISH_ONE)) + return -EIO; + + switch (type) { + case XFS_BMAP_MAP: + firstfsb = bmap.br_startblock; + error = xfs_bmapi_write(tp, ip, bmap.br_startoff, + bmap.br_blockcount, flags, &firstfsb, + bmap.br_blockcount, &bmap, &nimaps, + dfops); + break; + case XFS_BMAP_UNMAP: + error = xfs_bunmapi(tp, ip, bmap.br_startoff, + bmap.br_blockcount, flags, 1, &firstfsb, + dfops, &done); + ASSERT(done); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + } + + return error; +} diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 254034f..7cae6ec 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -54,7 +54,7 @@ struct xfs_bmalloca { bool wasdel; /* replacing a delayed allocation */ bool aeof; /* allocated space at eof */ bool conv; /* overwriting unwritten extents */ - char userdata;/* userdata mask */ + int datatype;/* data type being allocated */ int flags; }; @@ -97,6 +97,19 @@ struct xfs_extent_free_item */ #define XFS_BMAPI_ZERO 0x080 +/* + * Map the inode offset to the block given in ap->firstblock. Primarily + * used for reflink. The range must be in a hole, and this flag cannot be + * turned on with PREALLOC or CONVERT, and cannot be used on the attr fork. + * + * For bunmapi, this flag unmaps the range without adjusting quota, reducing + * refcount, or freeing the blocks. + */ +#define XFS_BMAPI_REMAP 0x100 + +/* Map something in the CoW fork. */ +#define XFS_BMAPI_COWFORK 0x200 + #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ @@ -105,12 +118,24 @@ struct xfs_extent_free_item { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ { XFS_BMAPI_CONTIG, "CONTIG" }, \ { XFS_BMAPI_CONVERT, "CONVERT" }, \ - { XFS_BMAPI_ZERO, "ZERO" } + { XFS_BMAPI_ZERO, "ZERO" }, \ + { XFS_BMAPI_REMAP, "REMAP" }, \ + { XFS_BMAPI_COWFORK, "COWFORK" } static inline int xfs_bmapi_aflag(int w) { - return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0); + return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : + (w == XFS_COW_FORK ? XFS_BMAPI_COWFORK : 0)); +} + +static inline int xfs_bmapi_whichfork(int bmapi_flags) +{ + if (bmapi_flags & XFS_BMAPI_COWFORK) + return XFS_COW_FORK; + else if (bmapi_flags & XFS_BMAPI_ATTRFORK) + return XFS_ATTR_FORK; + return XFS_DATA_FORK; } /* @@ -131,13 +156,15 @@ static inline int xfs_bmapi_aflag(int w) #define BMAP_LEFT_VALID (1 << 6) #define BMAP_RIGHT_VALID (1 << 7) #define BMAP_ATTRFORK (1 << 8) +#define BMAP_COWFORK (1 << 9) #define XFS_BMAP_EXT_FLAGS \ { BMAP_LEFT_CONTIG, "LC" }, \ { BMAP_RIGHT_CONTIG, "RC" }, \ { BMAP_LEFT_FILLING, "LF" }, \ { BMAP_RIGHT_FILLING, "RF" }, \ - { BMAP_ATTRFORK, "ATTR" } + { BMAP_ATTRFORK, "ATTR" }, \ + { BMAP_COWFORK, "COW" } /* @@ -163,6 +190,8 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, #define XFS_BMAP_TRACE_EXLIST(ip,c,w) #endif +void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, + xfs_filblks_t len); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops, @@ -181,18 +210,24 @@ int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip, int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, struct xfs_bmbt_irec *mval, int *nmap, int flags); -int xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno, - xfs_filblks_t len, struct xfs_bmbt_irec *mval, - int *nmap, int flags); int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, int flags, xfs_fsblock_t *firstblock, xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap, struct xfs_defer_ops *dfops); +int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t bno, xfs_filblks_t *rlen, int flags, + xfs_extnum_t nexts, xfs_fsblock_t *firstblock, + struct xfs_defer_ops *dfops); int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, int flags, xfs_extnum_t nexts, xfs_fsblock_t *firstblock, struct xfs_defer_ops *dfops, int *done); +int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, + xfs_extnum_t *idx, struct xfs_bmbt_irec *got, + struct xfs_bmbt_irec *del); +void xfs_bmap_del_extent_cow(struct xfs_inode *ip, xfs_extnum_t *idx, + struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del); int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, xfs_extnum_t num); uint xfs_default_attroffset(struct xfs_inode *ip); @@ -202,5 +237,35 @@ int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_defer_ops *dfops, enum shift_direction direction, int num_exts); int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); +struct xfs_bmbt_rec_host * + xfs_bmap_search_extents(struct xfs_inode *ip, xfs_fileoff_t bno, + int fork, int *eofp, xfs_extnum_t *lastxp, + struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp); +int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, + xfs_fileoff_t aoff, xfs_filblks_t len, + struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *prev, + xfs_extnum_t *lastx, int eof); + +enum xfs_bmap_intent_type { + XFS_BMAP_MAP = 1, + XFS_BMAP_UNMAP, +}; + +struct xfs_bmap_intent { + struct list_head bi_list; + enum xfs_bmap_intent_type bi_type; + struct xfs_inode *bi_owner; + int bi_whichfork; + struct xfs_bmbt_irec bi_bmap; +}; + +int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_defer_ops *dfops, + struct xfs_inode *ip, enum xfs_bmap_intent_type type, + int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, + xfs_filblks_t blockcount, xfs_exntst_t state); +int xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, + struct xfs_inode *ip, struct xfs_bmbt_irec *imap); +int xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, + struct xfs_inode *ip, struct xfs_bmbt_irec *imap); #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index cd85274..8007d2b 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -453,6 +453,7 @@ xfs_bmbt_alloc_block( if (args.fsbno == NULLFSBLOCK) { args.fsbno = be64_to_cpu(start->l); +try_another_ag: args.type = XFS_ALLOCTYPE_START_BNO; /* * Make sure there is sufficient room left in the AG to @@ -482,6 +483,22 @@ xfs_bmbt_alloc_block( if (error) goto error0; + /* + * During a CoW operation, the allocation and bmbt updates occur in + * different transactions. The mapping code tries to put new bmbt + * blocks near extents being mapped, but the only way to guarantee this + * is if the alloc and the mapping happen in a single transaction that + * has a block reservation. That isn't the case here, so if we run out + * of space we'll try again with another AG. + */ + if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && + args.fsbno == NULLFSBLOCK && + args.type == XFS_ALLOCTYPE_NEAR_BNO) { + cur->bc_private.b.dfops->dop_low = true; + args.fsbno = cur->bc_private.b.firstblock; + goto try_another_ag; + } + if (args.fsbno == NULLFSBLOCK && args.minleft) { /* * Could not find an AG with enough free space to satisfy @@ -777,6 +794,7 @@ xfs_bmbt_init_cursor( { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_btree_cur *cur; + ASSERT(whichfork != XFS_COW_FORK); cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 0856979..0e80993 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -45,9 +45,10 @@ kmem_zone_t *xfs_btree_cur_zone; */ static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC, - XFS_FIBT_MAGIC }, + XFS_FIBT_MAGIC, 0 }, { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC, - XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC } + XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC, + XFS_REFC_CRC_MAGIC } }; #define xfs_btree_magic(cur) \ xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum] @@ -1216,6 +1217,9 @@ xfs_btree_set_refs( case XFS_BTNUM_RMAP: xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF); break; + case XFS_BTNUM_REFC: + xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF); + break; default: ASSERT(0); } @@ -2070,7 +2074,7 @@ __xfs_btree_updkeys( struct xfs_buf *bp0, bool force_all) { - union xfs_btree_bigkey key; /* keys from current level */ + union xfs_btree_key key; /* keys from current level */ union xfs_btree_key *lkey; /* keys from the next level up */ union xfs_btree_key *hkey; union xfs_btree_key *nlkey; /* keys from the next level up */ @@ -2086,7 +2090,7 @@ __xfs_btree_updkeys( trace_xfs_btree_updkeys(cur, level, bp0); - lkey = (union xfs_btree_key *)&key; + lkey = &key; hkey = xfs_btree_high_key_from_key(cur, lkey); xfs_btree_get_keys(cur, block, lkey); for (level++; level < cur->bc_nlevels; level++) { @@ -3226,7 +3230,7 @@ xfs_btree_insrec( struct xfs_buf *bp; /* buffer for block */ union xfs_btree_ptr nptr; /* new block ptr */ struct xfs_btree_cur *ncur; /* new btree cursor */ - union xfs_btree_bigkey nkey; /* new block key */ + union xfs_btree_key nkey; /* new block key */ union xfs_btree_key *lkey; int optr; /* old key/record index */ int ptr; /* key/record index */ @@ -3241,7 +3245,7 @@ xfs_btree_insrec( XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, &rec); ncur = NULL; - lkey = (union xfs_btree_key *)&nkey; + lkey = &nkey; /* * If we have an external root pointer, and we've made it to the @@ -3444,14 +3448,14 @@ xfs_btree_insert( union xfs_btree_ptr nptr; /* new block number (split result) */ struct xfs_btree_cur *ncur; /* new cursor (split result) */ struct xfs_btree_cur *pcur; /* previous level's cursor */ - union xfs_btree_bigkey bkey; /* key of block to insert */ + union xfs_btree_key bkey; /* key of block to insert */ union xfs_btree_key *key; union xfs_btree_rec rec; /* record to insert */ level = 0; ncur = NULL; pcur = cur; - key = (union xfs_btree_key *)&bkey; + key = &bkey; xfs_btree_set_ptr_null(cur, &nptr); @@ -4797,3 +4801,50 @@ xfs_btree_query_range( return xfs_btree_overlapped_query_range(cur, &low_key, &high_key, fn, priv); } + +/* + * Calculate the number of blocks needed to store a given number of records + * in a short-format (per-AG metadata) btree. + */ +xfs_extlen_t +xfs_btree_calc_size( + struct xfs_mount *mp, + uint *limits, + unsigned long long len) +{ + int level; + int maxrecs; + xfs_extlen_t rval; + + maxrecs = limits[0]; + for (level = 0, rval = 0; len > 1; level++) { + len += maxrecs - 1; + do_div(len, maxrecs); + maxrecs = limits[1]; + rval += len; + } + return rval; +} + +static int +xfs_btree_count_blocks_helper( + struct xfs_btree_cur *cur, + int level, + void *data) +{ + xfs_extlen_t *blocks = data; + (*blocks)++; + + return 0; +} + +/* Count the blocks in a btree and return the result in *blocks. */ +int +xfs_btree_count_blocks( + struct xfs_btree_cur *cur, + xfs_extlen_t *blocks) +{ + *blocks = 0; + return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper, + blocks); +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 04d0865..c2b01d1 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -37,30 +37,19 @@ union xfs_btree_ptr { __be64 l; /* long form ptr */ }; -union xfs_btree_key { - struct xfs_bmbt_key bmbt; - xfs_bmdr_key_t bmbr; /* bmbt root block */ - xfs_alloc_key_t alloc; - struct xfs_inobt_key inobt; - struct xfs_rmap_key rmap; -}; - /* - * In-core key that holds both low and high keys for overlapped btrees. - * The two keys are packed next to each other on disk, so do the same - * in memory. Preserve the existing xfs_btree_key as a single key to - * avoid the mental model breakage that would happen if we passed a - * bigkey into a function that operates on a single key. + * The in-core btree key. Overlapping btrees actually store two keys + * per pointer, so we reserve enough memory to hold both. The __*bigkey + * items should never be accessed directly. */ -union xfs_btree_bigkey { +union xfs_btree_key { struct xfs_bmbt_key bmbt; xfs_bmdr_key_t bmbr; /* bmbt root block */ xfs_alloc_key_t alloc; struct xfs_inobt_key inobt; - struct { - struct xfs_rmap_key rmap; - struct xfs_rmap_key rmap_hi; - }; + struct xfs_rmap_key rmap; + struct xfs_rmap_key __rmap_bigkey[2]; + struct xfs_refcount_key refc; }; union xfs_btree_rec { @@ -69,6 +58,7 @@ union xfs_btree_rec { struct xfs_alloc_rec alloc; struct xfs_inobt_rec inobt; struct xfs_rmap_rec rmap; + struct xfs_refcount_rec refc; }; /* @@ -84,6 +74,7 @@ union xfs_btree_rec { #define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) #define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi) #define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi) +#define XFS_BTNUM_REFC ((xfs_btnum_t)XFS_BTNUM_REFCi) /* * For logging record fields. @@ -117,6 +108,7 @@ do { \ case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \ case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \ case XFS_BTNUM_RMAP: __XFS_BTREE_STATS_INC(__mp, rmap, stat); break; \ + case XFS_BTNUM_REFC: __XFS_BTREE_STATS_INC(__mp, refcbt, stat); break; \ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ } \ } while (0) @@ -139,6 +131,8 @@ do { \ __XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \ case XFS_BTNUM_RMAP: \ __XFS_BTREE_STATS_ADD(__mp, rmap, stat, val); break; \ + case XFS_BTNUM_REFC: \ + __XFS_BTREE_STATS_ADD(__mp, refcbt, stat, val); break; \ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ } \ } while (0) @@ -229,6 +223,15 @@ union xfs_btree_irec { struct xfs_bmbt_irec b; struct xfs_inobt_rec_incore i; struct xfs_rmap_irec r; + struct xfs_refcount_irec rc; +}; + +/* Per-AG btree private information. */ +union xfs_btree_cur_private { + struct { + unsigned long nr_ops; /* # record updates */ + int shape_changes; /* # of extent splits */ + } refc; }; /* @@ -255,6 +258,7 @@ typedef struct xfs_btree_cur struct xfs_buf *agbp; /* agf/agi buffer pointer */ struct xfs_defer_ops *dfops; /* deferred updates */ xfs_agnumber_t agno; /* ag number */ + union xfs_btree_cur_private priv; } a; struct { /* needed for BMAP */ struct xfs_inode *ip; /* pointer to our inode */ @@ -513,6 +517,8 @@ bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp); bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs); uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits, unsigned long len); +xfs_extlen_t xfs_btree_calc_size(struct xfs_mount *mp, uint *limits, + unsigned long long len); /* return codes */ #define XFS_BTREE_QUERY_RANGE_CONTINUE 0 /* keep iterating */ @@ -529,4 +535,6 @@ typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level, int xfs_btree_visit_blocks(struct xfs_btree_cur *cur, xfs_btree_visit_blocks_fn fn, void *data); +int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks); + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index c221d0e..613c5cf 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -81,6 +81,10 @@ * - For each work item attached to the log intent item, * * Perform the described action. * * Attach the work item to the log done item. + * * If the result of doing the work was -EAGAIN, ->finish work + * wants a new transaction. See the "Requesting a Fresh + * Transaction while Finishing Deferred Work" section below for + * details. * * The key here is that we must log an intent item for all pending * work items every time we roll the transaction, and that we must log @@ -88,6 +92,34 @@ * we can perform complex remapping operations, chaining intent items * as needed. * + * Requesting a Fresh Transaction while Finishing Deferred Work + * + * If ->finish_item decides that it needs a fresh transaction to + * finish the work, it must ask its caller (xfs_defer_finish) for a + * continuation. The most likely cause of this circumstance are the + * refcount adjust functions deciding that they've logged enough items + * to be at risk of exceeding the transaction reservation. + * + * To get a fresh transaction, we want to log the existing log done + * item to prevent the log intent item from replaying, immediately log + * a new log intent item with the unfinished work items, roll the + * transaction, and re-call ->finish_item wherever it left off. The + * log done item and the new log intent item must be in the same + * transaction or atomicity cannot be guaranteed; defer_finish ensures + * that this happens. + * + * This requires some coordination between ->finish_item and + * defer_finish. Upon deciding to request a new transaction, + * ->finish_item should update the current work item to reflect the + * unfinished work. Next, it should reset the log done item's list + * count to the number of items finished, and return -EAGAIN. + * defer_finish sees the -EAGAIN, logs the new log intent item + * with the remaining work items, and leaves the xfs_defer_pending + * item at the head of the dop_work queue. Then it rolls the + * transaction and picks up processing where it left off. It is + * required that ->finish_item must be careful to leave enough + * transaction reservation to fit the new log intent item. + * * This is an example of remapping the extent (E, E+B) into file X at * offset A and dealing with the extent (C, C+B) already being mapped * there: @@ -104,21 +136,26 @@ * | Intent to add rmap (X, E, A, B) | * +-------------------------------------------------+ * | Reduce refcount for extent (C, B) | t2 - * | Done reducing refcount for extent (C, B) | + * | Done reducing refcount for extent (C, 9) | + * | Intent to reduce refcount for extent (C+9, B-9) | + * | (ran out of space after 9 refcount updates) | + * +-------------------------------------------------+ + * | Reduce refcount for extent (C+9, B+9) | t3 + * | Done reducing refcount for extent (C+9, B-9) | * | Increase refcount for extent (E, B) | * | Done increasing refcount for extent (E, B) | * | Intent to free extent (C, B) | * | Intent to free extent (F, 1) (refcountbt block) | * | Intent to remove rmap (F, 1, REFC) | * +-------------------------------------------------+ - * | Remove rmap (X, C, A, B) | t3 + * | Remove rmap (X, C, A, B) | t4 * | Done removing rmap (X, C, A, B) | * | Add rmap (X, E, A, B) | * | Done adding rmap (X, E, A, B) | * | Remove rmap (F, 1, REFC) | * | Done removing rmap (F, 1, REFC) | * +-------------------------------------------------+ - * | Free extent (C, B) | t4 + * | Free extent (C, B) | t5 * | Done freeing extent (C, B) | * | Free extent (D, 1) | * | Done freeing extent (D, 1) | @@ -141,6 +178,9 @@ * - Intent to free extent (C, B) * - Intent to free extent (F, 1) (refcountbt block) * - Intent to remove rmap (F, 1, REFC) + * + * Note that the continuation requested between t2 and t3 is likely to + * reoccur. */ static const struct xfs_defer_op_type *defer_op_types[XFS_DEFER_OPS_TYPE_MAX]; @@ -323,7 +363,16 @@ xfs_defer_finish( dfp->dfp_count--; error = dfp->dfp_type->finish_item(*tp, dop, li, dfp->dfp_done, &state); - if (error) { + if (error == -EAGAIN) { + /* + * Caller wants a fresh transaction; + * put the work item back on the list + * and jump out. + */ + list_add(li, &dfp->dfp_work); + dfp->dfp_count++; + break; + } else if (error) { /* * Clean up after ourselves and jump out. * xfs_defer_cancel will take care of freeing @@ -335,9 +384,25 @@ xfs_defer_finish( goto out; } } - /* Done with the dfp, free it. */ - list_del(&dfp->dfp_list); - kmem_free(dfp); + if (error == -EAGAIN) { + /* + * Caller wants a fresh transaction, so log a + * new log intent item to replace the old one + * and roll the transaction. See "Requesting + * a Fresh Transaction while Finishing + * Deferred Work" above. + */ + dfp->dfp_intent = dfp->dfp_type->create_intent(*tp, + dfp->dfp_count); + dfp->dfp_done = NULL; + list_for_each(li, &dfp->dfp_work) + dfp->dfp_type->log_item(*tp, dfp->dfp_intent, + li); + } else { + /* Done with the dfp, free it. */ + list_del(&dfp->dfp_list); + kmem_free(dfp); + } if (cleanup_fn) cleanup_fn(*tp, state, error); diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index e96533d..f6e93ef 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -51,6 +51,8 @@ struct xfs_defer_pending { * find all the space it needs. */ enum xfs_defer_ops_type { + XFS_DEFER_OPS_TYPE_BMAP, + XFS_DEFER_OPS_TYPE_REFCOUNT, XFS_DEFER_OPS_TYPE_RMAP, XFS_DEFER_OPS_TYPE_FREE, XFS_DEFER_OPS_TYPE_MAX, diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 3cc3cf7..ac9a003 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -191,8 +191,7 @@ xfs_dquot_buf_verify_crc( if (mp->m_quotainfo) ndquots = mp->m_quotainfo->qi_dqperchunk; else - ndquots = xfs_calc_dquots_per_chunk( - XFS_BB_TO_FSB(mp, bp->b_length)); + ndquots = xfs_calc_dquots_per_chunk(bp->b_length); for (i = 0; i < ndquots; i++, d++) { if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk), diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 270fb5c..6b7579e 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -456,9 +456,11 @@ xfs_sb_has_compat_feature( #define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */ #define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */ +#define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */ #define XFS_SB_FEAT_RO_COMPAT_ALL \ (XFS_SB_FEAT_RO_COMPAT_FINOBT | \ - XFS_SB_FEAT_RO_COMPAT_RMAPBT) + XFS_SB_FEAT_RO_COMPAT_RMAPBT | \ + XFS_SB_FEAT_RO_COMPAT_REFLINK) #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL static inline bool xfs_sb_has_ro_compat_feature( @@ -546,6 +548,12 @@ static inline bool xfs_sb_version_hasrmapbt(struct xfs_sb *sbp) (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT); } +static inline bool xfs_sb_version_hasreflink(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK); +} + /* * end of superblock version macros */ @@ -641,14 +649,17 @@ typedef struct xfs_agf { uuid_t agf_uuid; /* uuid of filesystem */ __be32 agf_rmap_blocks; /* rmapbt blocks used */ - __be32 agf_padding; /* padding */ + __be32 agf_refcount_blocks; /* refcountbt blocks used */ + + __be32 agf_refcount_root; /* refcount tree root block */ + __be32 agf_refcount_level; /* refcount btree levels */ /* * reserve some contiguous space for future logged fields before we add * the unlogged fields. This makes the range logging via flags and * structure offsets much simpler. */ - __be64 agf_spare64[15]; + __be64 agf_spare64[14]; /* unlogged fields, written during buffer writeback. */ __be64 agf_lsn; /* last write sequence */ @@ -674,8 +685,11 @@ typedef struct xfs_agf { #define XFS_AGF_BTREEBLKS 0x00000800 #define XFS_AGF_UUID 0x00001000 #define XFS_AGF_RMAP_BLOCKS 0x00002000 -#define XFS_AGF_SPARE64 0x00004000 -#define XFS_AGF_NUM_BITS 15 +#define XFS_AGF_REFCOUNT_BLOCKS 0x00004000 +#define XFS_AGF_REFCOUNT_ROOT 0x00008000 +#define XFS_AGF_REFCOUNT_LEVEL 0x00010000 +#define XFS_AGF_SPARE64 0x00020000 +#define XFS_AGF_NUM_BITS 18 #define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) #define XFS_AGF_FLAGS \ @@ -693,6 +707,9 @@ typedef struct xfs_agf { { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \ { XFS_AGF_UUID, "UUID" }, \ { XFS_AGF_RMAP_BLOCKS, "RMAP_BLOCKS" }, \ + { XFS_AGF_REFCOUNT_BLOCKS, "REFCOUNT_BLOCKS" }, \ + { XFS_AGF_REFCOUNT_ROOT, "REFCOUNT_ROOT" }, \ + { XFS_AGF_REFCOUNT_LEVEL, "REFCOUNT_LEVEL" }, \ { XFS_AGF_SPARE64, "SPARE64" } /* disk block (xfs_daddr_t) in the AG */ @@ -848,7 +865,6 @@ typedef struct xfs_timestamp { * padding field for v3 inodes. */ #define XFS_DINODE_MAGIC 0x494e /* 'IN' */ -#define XFS_DINODE_GOOD_VERSION(v) ((v) >= 1 && (v) <= 3) typedef struct xfs_dinode { __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ __be16 di_mode; /* mode and type of file */ @@ -885,7 +901,8 @@ typedef struct xfs_dinode { __be64 di_changecount; /* number of attribute changes */ __be64 di_lsn; /* flush sequence */ __be64 di_flags2; /* more random flags */ - __u8 di_pad2[16]; /* more padding for future expansion */ + __be32 di_cowextsize; /* basic cow extent size for file */ + __u8 di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ xfs_timestamp_t di_crtime; /* time created */ @@ -1041,9 +1058,14 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) * 16 bits of the XFS_XFLAG_s range. */ #define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */ +#define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */ +#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ #define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) +#define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT) +#define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) -#define XFS_DIFLAG2_ANY (XFS_DIFLAG2_DAX) +#define XFS_DIFLAG2_ANY \ + (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE) /* * Inode number format: @@ -1353,7 +1375,9 @@ struct xfs_owner_info { #define XFS_RMAP_OWN_AG (-5ULL) /* AG freespace btree blocks */ #define XFS_RMAP_OWN_INOBT (-6ULL) /* Inode btree blocks */ #define XFS_RMAP_OWN_INODES (-7ULL) /* Inode chunk */ -#define XFS_RMAP_OWN_MIN (-8ULL) /* guard */ +#define XFS_RMAP_OWN_REFC (-8ULL) /* refcount tree */ +#define XFS_RMAP_OWN_COW (-9ULL) /* cow allocations */ +#define XFS_RMAP_OWN_MIN (-10ULL) /* guard */ #define XFS_RMAP_NON_INODE_OWNER(owner) (!!((owner) & (1ULL << 63))) @@ -1434,6 +1458,62 @@ typedef __be32 xfs_rmap_ptr_t; XFS_IBT_BLOCK(mp) + 1) /* + * Reference Count Btree format definitions + * + */ +#define XFS_REFC_CRC_MAGIC 0x52334643 /* 'R3FC' */ + +unsigned int xfs_refc_block(struct xfs_mount *mp); + +/* + * Data record/key structure + * + * Each record associates a range of physical blocks (starting at + * rc_startblock and ending rc_blockcount blocks later) with a reference + * count (rc_refcount). Extents that are being used to stage a copy on + * write (CoW) operation are recorded in the refcount btree with a + * refcount of 1. All other records must have a refcount > 1 and must + * track an extent mapped only by file data forks. + * + * Extents with a single owner (attributes, metadata, non-shared file + * data) are not tracked here. Free space is also not tracked here. + * This is consistent with pre-reflink XFS. + */ + +/* + * Extents that are being used to stage a copy on write are stored + * in the refcount btree with a refcount of 1 and the upper bit set + * on the startblock. This speeds up mount time deletion of stale + * staging extents because they're all at the right side of the tree. + */ +#define XFS_REFC_COW_START ((xfs_agblock_t)(1U << 31)) +#define REFCNTBT_COWFLAG_BITLEN 1 +#define REFCNTBT_AGBLOCK_BITLEN 31 + +struct xfs_refcount_rec { + __be32 rc_startblock; /* starting block number */ + __be32 rc_blockcount; /* count of blocks */ + __be32 rc_refcount; /* number of inodes linked here */ +}; + +struct xfs_refcount_key { + __be32 rc_startblock; /* starting block number */ +}; + +struct xfs_refcount_irec { + xfs_agblock_t rc_startblock; /* starting block number */ + xfs_extlen_t rc_blockcount; /* count of free blocks */ + xfs_nlink_t rc_refcount; /* number of inodes linked here */ +}; + +#define MAXREFCOUNT ((xfs_nlink_t)~0U) +#define MAXREFCEXTLEN ((xfs_extlen_t)~0U) + +/* btree pointer type */ +typedef __be32 xfs_refcount_ptr_t; + + +/* * BMAP Btree format definitions * * This includes both the root block definition that sits inside an inode fork diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 7945505..b72dc82 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -81,14 +81,16 @@ struct getbmapx { #define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */ #define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */ #define BMV_IF_NO_HOLES 0x10 /* Do not return holes */ +#define BMV_IF_COWFORK 0x20 /* return CoW fork rather than data */ #define BMV_IF_VALID \ (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC| \ - BMV_IF_DELALLOC|BMV_IF_NO_HOLES) + BMV_IF_DELALLOC|BMV_IF_NO_HOLES|BMV_IF_COWFORK) /* bmv_oflags values - returned for each non-header segment */ #define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */ #define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */ #define BMV_OF_LAST 0x4 /* segment is the last in the file */ +#define BMV_OF_SHARED 0x8 /* segment shared with another file */ /* * Structure for XFS_IOC_FSSETDM. @@ -206,7 +208,8 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */ #define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */ #define XFS_FSOP_GEOM_FLAGS_SPINODES 0x40000 /* sparse inode chunks */ -#define XFS_FSOP_GEOM_FLAGS_RMAPBT 0x80000 /* Reverse mapping btree */ +#define XFS_FSOP_GEOM_FLAGS_RMAPBT 0x80000 /* reverse mapping btree */ +#define XFS_FSOP_GEOM_FLAGS_REFLINK 0x100000 /* files can share blocks */ /* * Minimum and maximum sizes need for growth checks. @@ -275,7 +278,8 @@ typedef struct xfs_bstat { #define bs_projid bs_projid_lo /* (previously just bs_projid) */ __u16 bs_forkoff; /* inode fork offset in bytes */ __u16 bs_projid_hi; /* higher part of project id */ - unsigned char bs_pad[10]; /* pad space, unused */ + unsigned char bs_pad[6]; /* pad space, unused */ + __u32 bs_cowextsize; /* cow extent size */ __u32 bs_dmevmask; /* DMIG event mask */ __u16 bs_dmstate; /* DMIG state info */ __u16 bs_aextents; /* attribute number of extents */ diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 31ca220..eab68ae 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -132,7 +132,7 @@ xfs_inobt_free_block( xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT); return xfs_free_extent(cur->bc_tp, XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1, - &oinfo); + &oinfo, XFS_AG_RESV_NONE); } STATIC int diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 4b9769e..134424f 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -57,6 +57,17 @@ xfs_inobp_check( } #endif +bool +xfs_dinode_good_version( + struct xfs_mount *mp, + __u8 version) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) + return version == 3; + + return version == 1 || version == 2; +} + /* * If we are doing readahead on an inode buffer, we might be in log recovery * reading an inode allocation buffer that hasn't yet been replayed, and hence @@ -91,7 +102,7 @@ xfs_inode_buf_verify( dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && - XFS_DINODE_GOOD_VERSION(dip->di_version); + xfs_dinode_good_version(mp, dip->di_version); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP, XFS_RANDOM_ITOBP_INOTOBP))) { @@ -256,6 +267,7 @@ xfs_inode_from_disk( to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec); to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec); to->di_flags2 = be64_to_cpu(from->di_flags2); + to->di_cowextsize = be32_to_cpu(from->di_cowextsize); } } @@ -305,7 +317,7 @@ xfs_inode_to_disk( to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); to->di_flags2 = cpu_to_be64(from->di_flags2); - + to->di_cowextsize = cpu_to_be32(from->di_cowextsize); to->di_ino = cpu_to_be64(ip->i_ino); to->di_lsn = cpu_to_be64(lsn); memset(to->di_pad2, 0, sizeof(to->di_pad2)); @@ -357,6 +369,7 @@ xfs_log_dinode_to_disk( to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); to->di_flags2 = cpu_to_be64(from->di_flags2); + to->di_cowextsize = cpu_to_be32(from->di_cowextsize); to->di_ino = cpu_to_be64(from->di_ino); to->di_lsn = cpu_to_be64(from->di_lsn); memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); @@ -373,6 +386,9 @@ xfs_dinode_verify( struct xfs_inode *ip, struct xfs_dinode *dip) { + uint16_t flags; + uint64_t flags2; + if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) return false; @@ -389,6 +405,23 @@ xfs_dinode_verify( return false; if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid)) return false; + + flags = be16_to_cpu(dip->di_flags); + flags2 = be64_to_cpu(dip->di_flags2); + + /* don't allow reflink/cowextsize if we don't have reflink */ + if ((flags2 & (XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)) && + !xfs_sb_version_hasreflink(&mp->m_sb)) + return false; + + /* don't let reflink and realtime mix */ + if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME)) + return false; + + /* don't let reflink and dax mix */ + if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags2 & XFS_DIFLAG2_DAX)) + return false; + return true; } diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 7c4dd32..3cfe12a 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -47,6 +47,7 @@ struct xfs_icdinode { __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ __uint64_t di_flags2; /* more random flags */ + __uint32_t di_cowextsize; /* basic cow extent size for file */ xfs_ictimestamp_t di_crtime; /* time created */ }; @@ -73,6 +74,8 @@ void xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); void xfs_log_dinode_to_disk(struct xfs_log_dinode *from, struct xfs_dinode *to); +bool xfs_dinode_good_version(struct xfs_mount *mp, __u8 version); + #if defined(DEBUG) void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); #else diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index bbcc8c7..5dd56d3 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -121,6 +121,26 @@ xfs_iformat_fork( return -EFSCORRUPTED; } + if (unlikely(xfs_is_reflink_inode(ip) && + (VFS_I(ip)->i_mode & S_IFMT) != S_IFREG)) { + xfs_warn(ip->i_mount, + "corrupt dinode %llu, wrong file type for reflink.", + ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(reflink)", + XFS_ERRLEVEL_LOW, ip->i_mount, dip); + return -EFSCORRUPTED; + } + + if (unlikely(xfs_is_reflink_inode(ip) && + (ip->i_d.di_flags & XFS_DIFLAG_REALTIME))) { + xfs_warn(ip->i_mount, + "corrupt dinode %llu, has reflink+realtime flag set.", + ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(reflink)", + XFS_ERRLEVEL_LOW, ip->i_mount, dip); + return -EFSCORRUPTED; + } + switch (VFS_I(ip)->i_mode & S_IFMT) { case S_IFIFO: case S_IFCHR: @@ -186,9 +206,14 @@ xfs_iformat_fork( XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount); return -EFSCORRUPTED; } - if (error) { + if (error) return error; + + if (xfs_is_reflink_inode(ip)) { + ASSERT(ip->i_cowfp == NULL); + xfs_ifork_init_cow(ip); } + if (!XFS_DFORK_Q(dip)) return 0; @@ -208,7 +233,8 @@ xfs_iformat_fork( XFS_CORRUPTION_ERROR("xfs_iformat(8)", XFS_ERRLEVEL_LOW, ip->i_mount, dip); - return -EFSCORRUPTED; + error = -EFSCORRUPTED; + break; } error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); @@ -226,6 +252,9 @@ xfs_iformat_fork( if (error) { kmem_zone_free(xfs_ifork_zone, ip->i_afp); ip->i_afp = NULL; + if (ip->i_cowfp) + kmem_zone_free(xfs_ifork_zone, ip->i_cowfp); + ip->i_cowfp = NULL; xfs_idestroy_fork(ip, XFS_DATA_FORK); } return error; @@ -740,6 +769,9 @@ xfs_idestroy_fork( if (whichfork == XFS_ATTR_FORK) { kmem_zone_free(xfs_ifork_zone, ip->i_afp); ip->i_afp = NULL; + } else if (whichfork == XFS_COW_FORK) { + kmem_zone_free(xfs_ifork_zone, ip->i_cowfp); + ip->i_cowfp = NULL; } } @@ -927,6 +959,19 @@ xfs_iext_get_ext( } } +/* Convert bmap state flags to an inode fork. */ +struct xfs_ifork * +xfs_iext_state_to_fork( + struct xfs_inode *ip, + int state) +{ + if (state & BMAP_COWFORK) + return ip->i_cowfp; + else if (state & BMAP_ATTRFORK) + return ip->i_afp; + return &ip->i_df; +} + /* * Insert new item(s) into the extent records for incore inode * fork 'ifp'. 'count' new items are inserted at index 'idx'. @@ -939,7 +984,7 @@ xfs_iext_insert( xfs_bmbt_irec_t *new, /* items to insert */ int state) /* type of extent conversion */ { - xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; + xfs_ifork_t *ifp = xfs_iext_state_to_fork(ip, state); xfs_extnum_t i; /* extent record index */ trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_); @@ -1189,7 +1234,7 @@ xfs_iext_remove( int ext_diff, /* number of extents to remove */ int state) /* type of extent conversion */ { - xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; + xfs_ifork_t *ifp = xfs_iext_state_to_fork(ip, state); xfs_extnum_t nextents; /* number of extents in file */ int new_size; /* size of extents after removal */ @@ -1934,3 +1979,20 @@ xfs_iext_irec_update_extoffs( ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; } } + +/* + * Initialize an inode's copy-on-write fork. + */ +void +xfs_ifork_init_cow( + struct xfs_inode *ip) +{ + if (ip->i_cowfp) + return; + + ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone, + KM_SLEEP | KM_NOFS); + ip->i_cowfp->if_flags = XFS_IFEXTENTS; + ip->i_cformat = XFS_DINODE_FMT_EXTENTS; + ip->i_cnextents = 0; +} diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index f95e072..c9476f5 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -92,7 +92,9 @@ typedef struct xfs_ifork { #define XFS_IFORK_PTR(ip,w) \ ((w) == XFS_DATA_FORK ? \ &(ip)->i_df : \ - (ip)->i_afp) + ((w) == XFS_ATTR_FORK ? \ + (ip)->i_afp : \ + (ip)->i_cowfp)) #define XFS_IFORK_DSIZE(ip) \ (XFS_IFORK_Q(ip) ? \ XFS_IFORK_BOFF(ip) : \ @@ -105,26 +107,38 @@ typedef struct xfs_ifork { #define XFS_IFORK_SIZE(ip,w) \ ((w) == XFS_DATA_FORK ? \ XFS_IFORK_DSIZE(ip) : \ - XFS_IFORK_ASIZE(ip)) + ((w) == XFS_ATTR_FORK ? \ + XFS_IFORK_ASIZE(ip) : \ + 0)) #define XFS_IFORK_FORMAT(ip,w) \ ((w) == XFS_DATA_FORK ? \ (ip)->i_d.di_format : \ - (ip)->i_d.di_aformat) + ((w) == XFS_ATTR_FORK ? \ + (ip)->i_d.di_aformat : \ + (ip)->i_cformat)) #define XFS_IFORK_FMT_SET(ip,w,n) \ ((w) == XFS_DATA_FORK ? \ ((ip)->i_d.di_format = (n)) : \ - ((ip)->i_d.di_aformat = (n))) + ((w) == XFS_ATTR_FORK ? \ + ((ip)->i_d.di_aformat = (n)) : \ + ((ip)->i_cformat = (n)))) #define XFS_IFORK_NEXTENTS(ip,w) \ ((w) == XFS_DATA_FORK ? \ (ip)->i_d.di_nextents : \ - (ip)->i_d.di_anextents) + ((w) == XFS_ATTR_FORK ? \ + (ip)->i_d.di_anextents : \ + (ip)->i_cnextents)) #define XFS_IFORK_NEXT_SET(ip,w,n) \ ((w) == XFS_DATA_FORK ? \ ((ip)->i_d.di_nextents = (n)) : \ - ((ip)->i_d.di_anextents = (n))) + ((w) == XFS_ATTR_FORK ? \ + ((ip)->i_d.di_anextents = (n)) : \ + ((ip)->i_cnextents = (n)))) #define XFS_IFORK_MAXEXT(ip, w) \ (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) +struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); + int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *); void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, struct xfs_inode_log_item *, int); @@ -169,4 +183,6 @@ void xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int); extern struct kmem_zone *xfs_ifork_zone; +extern void xfs_ifork_init_cow(struct xfs_inode *ip); + #endif /* __XFS_INODE_FORK_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index a6eed43..083cdd6 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -112,7 +112,11 @@ static inline uint xlog_get_cycle(char *ptr) #define XLOG_REG_TYPE_ICREATE 20 #define XLOG_REG_TYPE_RUI_FORMAT 21 #define XLOG_REG_TYPE_RUD_FORMAT 22 -#define XLOG_REG_TYPE_MAX 22 +#define XLOG_REG_TYPE_CUI_FORMAT 23 +#define XLOG_REG_TYPE_CUD_FORMAT 24 +#define XLOG_REG_TYPE_BUI_FORMAT 25 +#define XLOG_REG_TYPE_BUD_FORMAT 26 +#define XLOG_REG_TYPE_MAX 26 /* * Flags to log operation header @@ -231,6 +235,10 @@ typedef struct xfs_trans_header { #define XFS_LI_ICREATE 0x123f #define XFS_LI_RUI 0x1240 /* rmap update intent */ #define XFS_LI_RUD 0x1241 +#define XFS_LI_CUI 0x1242 /* refcount update intent */ +#define XFS_LI_CUD 0x1243 +#define XFS_LI_BUI 0x1244 /* bmbt update intent */ +#define XFS_LI_BUD 0x1245 #define XFS_LI_TYPE_DESC \ { XFS_LI_EFI, "XFS_LI_EFI" }, \ @@ -242,7 +250,11 @@ typedef struct xfs_trans_header { { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }, \ { XFS_LI_ICREATE, "XFS_LI_ICREATE" }, \ { XFS_LI_RUI, "XFS_LI_RUI" }, \ - { XFS_LI_RUD, "XFS_LI_RUD" } + { XFS_LI_RUD, "XFS_LI_RUD" }, \ + { XFS_LI_CUI, "XFS_LI_CUI" }, \ + { XFS_LI_CUD, "XFS_LI_CUD" }, \ + { XFS_LI_BUI, "XFS_LI_BUI" }, \ + { XFS_LI_BUD, "XFS_LI_BUD" } /* * Inode Log Item Format definitions. @@ -411,7 +423,8 @@ struct xfs_log_dinode { __uint64_t di_changecount; /* number of attribute changes */ xfs_lsn_t di_lsn; /* flush sequence */ __uint64_t di_flags2; /* more random flags */ - __uint8_t di_pad2[16]; /* more padding for future expansion */ + __uint32_t di_cowextsize; /* basic cow extent size for file */ + __uint8_t di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ xfs_ictimestamp_t di_crtime; /* time created */ @@ -622,8 +635,11 @@ struct xfs_map_extent { /* rmap me_flags: upper bits are flags, lower byte is type code */ #define XFS_RMAP_EXTENT_MAP 1 +#define XFS_RMAP_EXTENT_MAP_SHARED 2 #define XFS_RMAP_EXTENT_UNMAP 3 +#define XFS_RMAP_EXTENT_UNMAP_SHARED 4 #define XFS_RMAP_EXTENT_CONVERT 5 +#define XFS_RMAP_EXTENT_CONVERT_SHARED 6 #define XFS_RMAP_EXTENT_ALLOC 7 #define XFS_RMAP_EXTENT_FREE 8 #define XFS_RMAP_EXTENT_TYPE_MASK 0xFF @@ -647,9 +663,17 @@ struct xfs_rui_log_format { __uint16_t rui_size; /* size of this item */ __uint32_t rui_nextents; /* # extents to free */ __uint64_t rui_id; /* rui identifier */ - struct xfs_map_extent rui_extents[1]; /* array of extents to rmap */ + struct xfs_map_extent rui_extents[]; /* array of extents to rmap */ }; +static inline size_t +xfs_rui_log_format_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_rui_log_format) + + nr * sizeof(struct xfs_map_extent); +} + /* * This is the structure used to lay out an rud log item in the * log. The rud_extents array is a variable size array whose @@ -663,6 +687,102 @@ struct xfs_rud_log_format { }; /* + * CUI/CUD (refcount update) log format definitions + */ +struct xfs_phys_extent { + __uint64_t pe_startblock; + __uint32_t pe_len; + __uint32_t pe_flags; +}; + +/* refcount pe_flags: upper bits are flags, lower byte is type code */ +/* Type codes are taken directly from enum xfs_refcount_intent_type. */ +#define XFS_REFCOUNT_EXTENT_TYPE_MASK 0xFF + +#define XFS_REFCOUNT_EXTENT_FLAGS (XFS_REFCOUNT_EXTENT_TYPE_MASK) + +/* + * This is the structure used to lay out a cui log item in the + * log. The cui_extents field is a variable size array whose + * size is given by cui_nextents. + */ +struct xfs_cui_log_format { + __uint16_t cui_type; /* cui log item type */ + __uint16_t cui_size; /* size of this item */ + __uint32_t cui_nextents; /* # extents to free */ + __uint64_t cui_id; /* cui identifier */ + struct xfs_phys_extent cui_extents[]; /* array of extents */ +}; + +static inline size_t +xfs_cui_log_format_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_cui_log_format) + + nr * sizeof(struct xfs_phys_extent); +} + +/* + * This is the structure used to lay out a cud log item in the + * log. The cud_extents array is a variable size array whose + * size is given by cud_nextents; + */ +struct xfs_cud_log_format { + __uint16_t cud_type; /* cud log item type */ + __uint16_t cud_size; /* size of this item */ + __uint32_t __pad; + __uint64_t cud_cui_id; /* id of corresponding cui */ +}; + +/* + * BUI/BUD (inode block mapping) log format definitions + */ + +/* bmbt me_flags: upper bits are flags, lower byte is type code */ +/* Type codes are taken directly from enum xfs_bmap_intent_type. */ +#define XFS_BMAP_EXTENT_TYPE_MASK 0xFF + +#define XFS_BMAP_EXTENT_ATTR_FORK (1U << 31) +#define XFS_BMAP_EXTENT_UNWRITTEN (1U << 30) + +#define XFS_BMAP_EXTENT_FLAGS (XFS_BMAP_EXTENT_TYPE_MASK | \ + XFS_BMAP_EXTENT_ATTR_FORK | \ + XFS_BMAP_EXTENT_UNWRITTEN) + +/* + * This is the structure used to lay out an bui log item in the + * log. The bui_extents field is a variable size array whose + * size is given by bui_nextents. + */ +struct xfs_bui_log_format { + __uint16_t bui_type; /* bui log item type */ + __uint16_t bui_size; /* size of this item */ + __uint32_t bui_nextents; /* # extents to free */ + __uint64_t bui_id; /* bui identifier */ + struct xfs_map_extent bui_extents[]; /* array of extents to bmap */ +}; + +static inline size_t +xfs_bui_log_format_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_bui_log_format) + + nr * sizeof(struct xfs_map_extent); +} + +/* + * This is the structure used to lay out an bud log item in the + * log. The bud_extents array is a variable size array whose + * size is given by bud_nextents; + */ +struct xfs_bud_log_format { + __uint16_t bud_type; /* bud log item type */ + __uint16_t bud_size; /* size of this item */ + __uint32_t __pad; + __uint64_t bud_bui_id; /* id of corresponding bui */ +}; + +/* * Dquot Log format definitions. * * The first two fields must be the type and size fitting into diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c new file mode 100644 index 0000000..b177ef3 --- /dev/null +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -0,0 +1,1698 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bmap.h" +#include "xfs_refcount_btree.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" +#include "xfs_bit.h" +#include "xfs_refcount.h" +#include "xfs_rmap.h" + +/* Allowable refcount adjustment amounts. */ +enum xfs_refc_adjust_op { + XFS_REFCOUNT_ADJUST_INCREASE = 1, + XFS_REFCOUNT_ADJUST_DECREASE = -1, + XFS_REFCOUNT_ADJUST_COW_ALLOC = 0, + XFS_REFCOUNT_ADJUST_COW_FREE = -1, +}; + +STATIC int __xfs_refcount_cow_alloc(struct xfs_btree_cur *rcur, + xfs_agblock_t agbno, xfs_extlen_t aglen, + struct xfs_defer_ops *dfops); +STATIC int __xfs_refcount_cow_free(struct xfs_btree_cur *rcur, + xfs_agblock_t agbno, xfs_extlen_t aglen, + struct xfs_defer_ops *dfops); + +/* + * Look up the first record less than or equal to [bno, len] in the btree + * given by cur. + */ +int +xfs_refcount_lookup_le( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + int *stat) +{ + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno, + XFS_LOOKUP_LE); + cur->bc_rec.rc.rc_startblock = bno; + cur->bc_rec.rc.rc_blockcount = 0; + return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); +} + +/* + * Look up the first record greater than or equal to [bno, len] in the btree + * given by cur. + */ +int +xfs_refcount_lookup_ge( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + int *stat) +{ + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno, + XFS_LOOKUP_GE); + cur->bc_rec.rc.rc_startblock = bno; + cur->bc_rec.rc.rc_blockcount = 0; + return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); +} + +/* Convert on-disk record to in-core format. */ +static inline void +xfs_refcount_btrec_to_irec( + union xfs_btree_rec *rec, + struct xfs_refcount_irec *irec) +{ + irec->rc_startblock = be32_to_cpu(rec->refc.rc_startblock); + irec->rc_blockcount = be32_to_cpu(rec->refc.rc_blockcount); + irec->rc_refcount = be32_to_cpu(rec->refc.rc_refcount); +} + +/* + * Get the data from the pointed-to record. + */ +int +xfs_refcount_get_rec( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *irec, + int *stat) +{ + union xfs_btree_rec *rec; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (!error && *stat == 1) { + xfs_refcount_btrec_to_irec(rec, irec); + trace_xfs_refcount_get(cur->bc_mp, cur->bc_private.a.agno, + irec); + } + return error; +} + +/* + * Update the record referred to by cur to the value given + * by [bno, len, refcount]. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int +xfs_refcount_update( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *irec) +{ + union xfs_btree_rec rec; + int error; + + trace_xfs_refcount_update(cur->bc_mp, cur->bc_private.a.agno, irec); + rec.refc.rc_startblock = cpu_to_be32(irec->rc_startblock); + rec.refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount); + rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount); + error = xfs_btree_update(cur, &rec); + if (error) + trace_xfs_refcount_update_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Insert the record referred to by cur to the value given + * by [bno, len, refcount]. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int +xfs_refcount_insert( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *irec, + int *i) +{ + int error; + + trace_xfs_refcount_insert(cur->bc_mp, cur->bc_private.a.agno, irec); + cur->bc_rec.rc.rc_startblock = irec->rc_startblock; + cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount; + cur->bc_rec.rc.rc_refcount = irec->rc_refcount; + error = xfs_btree_insert(cur, i); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error); +out_error: + if (error) + trace_xfs_refcount_insert_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Remove the record referred to by cur, then set the pointer to the spot + * where the record could be re-inserted, in case we want to increment or + * decrement the cursor. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int +xfs_refcount_delete( + struct xfs_btree_cur *cur, + int *i) +{ + struct xfs_refcount_irec irec; + int found_rec; + int error; + + error = xfs_refcount_get_rec(cur, &irec, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + trace_xfs_refcount_delete(cur->bc_mp, cur->bc_private.a.agno, &irec); + error = xfs_btree_delete(cur, i); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error); + if (error) + goto out_error; + error = xfs_refcount_lookup_ge(cur, irec.rc_startblock, &found_rec); +out_error: + if (error) + trace_xfs_refcount_delete_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Adjusting the Reference Count + * + * As stated elsewhere, the reference count btree (refcbt) stores + * >1 reference counts for extents of physical blocks. In this + * operation, we're either raising or lowering the reference count of + * some subrange stored in the tree: + * + * <------ adjustment range ------> + * ----+ +---+-----+ +--+--------+--------- + * 2 | | 3 | 4 | |17| 55 | 10 + * ----+ +---+-----+ +--+--------+--------- + * X axis is physical blocks number; + * reference counts are the numbers inside the rectangles + * + * The first thing we need to do is to ensure that there are no + * refcount extents crossing either boundary of the range to be + * adjusted. For any extent that does cross a boundary, split it into + * two extents so that we can increment the refcount of one of the + * pieces later: + * + * <------ adjustment range ------> + * ----+ +---+-----+ +--+--------+----+---- + * 2 | | 3 | 2 | |17| 55 | 10 | 10 + * ----+ +---+-----+ +--+--------+----+---- + * + * For this next step, let's assume that all the physical blocks in + * the adjustment range are mapped to a file and are therefore in use + * at least once. Therefore, we can infer that any gap in the + * refcount tree within the adjustment range represents a physical + * extent with refcount == 1: + * + * <------ adjustment range ------> + * ----+---+---+-----+-+--+--------+----+---- + * 2 |"1"| 3 | 2 |1|17| 55 | 10 | 10 + * ----+---+---+-----+-+--+--------+----+---- + * ^ + * + * For each extent that falls within the interval range, figure out + * which extent is to the left or the right of that extent. Now we + * have a left, current, and right extent. If the new reference count + * of the center extent enables us to merge left, center, and right + * into one record covering all three, do so. If the center extent is + * at the left end of the range, abuts the left extent, and its new + * reference count matches the left extent's record, then merge them. + * If the center extent is at the right end of the range, abuts the + * right extent, and the reference counts match, merge those. In the + * example, we can left merge (assuming an increment operation): + * + * <------ adjustment range ------> + * --------+---+-----+-+--+--------+----+---- + * 2 | 3 | 2 |1|17| 55 | 10 | 10 + * --------+---+-----+-+--+--------+----+---- + * ^ + * + * For all other extents within the range, adjust the reference count + * or delete it if the refcount falls below 2. If we were + * incrementing, the end result looks like this: + * + * <------ adjustment range ------> + * --------+---+-----+-+--+--------+----+---- + * 2 | 4 | 3 |2|18| 56 | 11 | 10 + * --------+---+-----+-+--+--------+----+---- + * + * The result of a decrement operation looks as such: + * + * <------ adjustment range ------> + * ----+ +---+ +--+--------+----+---- + * 2 | | 2 | |16| 54 | 9 | 10 + * ----+ +---+ +--+--------+----+---- + * DDDD 111111DD + * + * The blocks marked "D" are freed; the blocks marked "1" are only + * referenced once and therefore the record is removed from the + * refcount btree. + */ + +/* Next block after this extent. */ +static inline xfs_agblock_t +xfs_refc_next( + struct xfs_refcount_irec *rc) +{ + return rc->rc_startblock + rc->rc_blockcount; +} + +/* + * Split a refcount extent that crosses agbno. + */ +STATIC int +xfs_refcount_split_extent( + struct xfs_btree_cur *cur, + xfs_agblock_t agbno, + bool *shape_changed) +{ + struct xfs_refcount_irec rcext, tmp; + int found_rec; + int error; + + *shape_changed = false; + error = xfs_refcount_lookup_le(cur, agbno, &found_rec); + if (error) + goto out_error; + if (!found_rec) + return 0; + + error = xfs_refcount_get_rec(cur, &rcext, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + if (rcext.rc_startblock == agbno || xfs_refc_next(&rcext) <= agbno) + return 0; + + *shape_changed = true; + trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_private.a.agno, + &rcext, agbno); + + /* Establish the right extent. */ + tmp = rcext; + tmp.rc_startblock = agbno; + tmp.rc_blockcount -= (agbno - rcext.rc_startblock); + error = xfs_refcount_update(cur, &tmp); + if (error) + goto out_error; + + /* Insert the left extent. */ + tmp = rcext; + tmp.rc_blockcount = agbno - rcext.rc_startblock; + error = xfs_refcount_insert(cur, &tmp, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + return error; + +out_error: + trace_xfs_refcount_split_extent_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Merge the left, center, and right extents. + */ +STATIC int +xfs_refcount_merge_center_extents( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *left, + struct xfs_refcount_irec *center, + struct xfs_refcount_irec *right, + unsigned long long extlen, + xfs_agblock_t *agbno, + xfs_extlen_t *aglen) +{ + int error; + int found_rec; + + trace_xfs_refcount_merge_center_extents(cur->bc_mp, + cur->bc_private.a.agno, left, center, right); + + /* + * Make sure the center and right extents are not in the btree. + * If the center extent was synthesized, the first delete call + * removes the right extent and we skip the second deletion. + * If center and right were in the btree, then the first delete + * call removes the center and the second one removes the right + * extent. + */ + error = xfs_refcount_lookup_ge(cur, center->rc_startblock, + &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + + error = xfs_refcount_delete(cur, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + + if (center->rc_refcount > 1) { + error = xfs_refcount_delete(cur, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, + out_error); + } + + /* Enlarge the left extent. */ + error = xfs_refcount_lookup_le(cur, left->rc_startblock, + &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + + left->rc_blockcount = extlen; + error = xfs_refcount_update(cur, left); + if (error) + goto out_error; + + *aglen = 0; + return error; + +out_error: + trace_xfs_refcount_merge_center_extents_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Merge with the left extent. + */ +STATIC int +xfs_refcount_merge_left_extent( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *left, + struct xfs_refcount_irec *cleft, + xfs_agblock_t *agbno, + xfs_extlen_t *aglen) +{ + int error; + int found_rec; + + trace_xfs_refcount_merge_left_extent(cur->bc_mp, + cur->bc_private.a.agno, left, cleft); + + /* If the extent at agbno (cleft) wasn't synthesized, remove it. */ + if (cleft->rc_refcount > 1) { + error = xfs_refcount_lookup_le(cur, cleft->rc_startblock, + &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, + out_error); + + error = xfs_refcount_delete(cur, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, + out_error); + } + + /* Enlarge the left extent. */ + error = xfs_refcount_lookup_le(cur, left->rc_startblock, + &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + + left->rc_blockcount += cleft->rc_blockcount; + error = xfs_refcount_update(cur, left); + if (error) + goto out_error; + + *agbno += cleft->rc_blockcount; + *aglen -= cleft->rc_blockcount; + return error; + +out_error: + trace_xfs_refcount_merge_left_extent_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Merge with the right extent. + */ +STATIC int +xfs_refcount_merge_right_extent( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *right, + struct xfs_refcount_irec *cright, + xfs_agblock_t *agbno, + xfs_extlen_t *aglen) +{ + int error; + int found_rec; + + trace_xfs_refcount_merge_right_extent(cur->bc_mp, + cur->bc_private.a.agno, cright, right); + + /* + * If the extent ending at agbno+aglen (cright) wasn't synthesized, + * remove it. + */ + if (cright->rc_refcount > 1) { + error = xfs_refcount_lookup_le(cur, cright->rc_startblock, + &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, + out_error); + + error = xfs_refcount_delete(cur, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, + out_error); + } + + /* Enlarge the right extent. */ + error = xfs_refcount_lookup_le(cur, right->rc_startblock, + &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + + right->rc_startblock -= cright->rc_blockcount; + right->rc_blockcount += cright->rc_blockcount; + error = xfs_refcount_update(cur, right); + if (error) + goto out_error; + + *aglen -= cright->rc_blockcount; + return error; + +out_error: + trace_xfs_refcount_merge_right_extent_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +#define XFS_FIND_RCEXT_SHARED 1 +#define XFS_FIND_RCEXT_COW 2 +/* + * Find the left extent and the one after it (cleft). This function assumes + * that we've already split any extent crossing agbno. + */ +STATIC int +xfs_refcount_find_left_extents( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *left, + struct xfs_refcount_irec *cleft, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + int flags) +{ + struct xfs_refcount_irec tmp; + int error; + int found_rec; + + left->rc_startblock = cleft->rc_startblock = NULLAGBLOCK; + error = xfs_refcount_lookup_le(cur, agbno - 1, &found_rec); + if (error) + goto out_error; + if (!found_rec) + return 0; + + error = xfs_refcount_get_rec(cur, &tmp, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + + if (xfs_refc_next(&tmp) != agbno) + return 0; + if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2) + return 0; + if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1) + return 0; + /* We have a left extent; retrieve (or invent) the next right one */ + *left = tmp; + + error = xfs_btree_increment(cur, 0, &found_rec); + if (error) + goto out_error; + if (found_rec) { + error = xfs_refcount_get_rec(cur, &tmp, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, + out_error); + + /* if tmp starts at the end of our range, just use that */ + if (tmp.rc_startblock == agbno) + *cleft = tmp; + else { + /* + * There's a gap in the refcntbt at the start of the + * range we're interested in (refcount == 1) so + * synthesize the implied extent and pass it back. + * We assume here that the agbno/aglen range was + * passed in from a data fork extent mapping and + * therefore is allocated to exactly one owner. + */ + cleft->rc_startblock = agbno; + cleft->rc_blockcount = min(aglen, + tmp.rc_startblock - agbno); + cleft->rc_refcount = 1; + } + } else { + /* + * No extents, so pretend that there's one covering the whole + * range. + */ + cleft->rc_startblock = agbno; + cleft->rc_blockcount = aglen; + cleft->rc_refcount = 1; + } + trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_private.a.agno, + left, cleft, agbno); + return error; + +out_error: + trace_xfs_refcount_find_left_extent_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Find the right extent and the one before it (cright). This function + * assumes that we've already split any extents crossing agbno + aglen. + */ +STATIC int +xfs_refcount_find_right_extents( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *right, + struct xfs_refcount_irec *cright, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + int flags) +{ + struct xfs_refcount_irec tmp; + int error; + int found_rec; + + right->rc_startblock = cright->rc_startblock = NULLAGBLOCK; + error = xfs_refcount_lookup_ge(cur, agbno + aglen, &found_rec); + if (error) + goto out_error; + if (!found_rec) + return 0; + + error = xfs_refcount_get_rec(cur, &tmp, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + + if (tmp.rc_startblock != agbno + aglen) + return 0; + if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2) + return 0; + if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1) + return 0; + /* We have a right extent; retrieve (or invent) the next left one */ + *right = tmp; + + error = xfs_btree_decrement(cur, 0, &found_rec); + if (error) + goto out_error; + if (found_rec) { + error = xfs_refcount_get_rec(cur, &tmp, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, + out_error); + + /* if tmp ends at the end of our range, just use that */ + if (xfs_refc_next(&tmp) == agbno + aglen) + *cright = tmp; + else { + /* + * There's a gap in the refcntbt at the end of the + * range we're interested in (refcount == 1) so + * create the implied extent and pass it back. + * We assume here that the agbno/aglen range was + * passed in from a data fork extent mapping and + * therefore is allocated to exactly one owner. + */ + cright->rc_startblock = max(agbno, xfs_refc_next(&tmp)); + cright->rc_blockcount = right->rc_startblock - + cright->rc_startblock; + cright->rc_refcount = 1; + } + } else { + /* + * No extents, so pretend that there's one covering the whole + * range. + */ + cright->rc_startblock = agbno; + cright->rc_blockcount = aglen; + cright->rc_refcount = 1; + } + trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_private.a.agno, + cright, right, agbno + aglen); + return error; + +out_error: + trace_xfs_refcount_find_right_extent_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* Is this extent valid? */ +static inline bool +xfs_refc_valid( + struct xfs_refcount_irec *rc) +{ + return rc->rc_startblock != NULLAGBLOCK; +} + +/* + * Try to merge with any extents on the boundaries of the adjustment range. + */ +STATIC int +xfs_refcount_merge_extents( + struct xfs_btree_cur *cur, + xfs_agblock_t *agbno, + xfs_extlen_t *aglen, + enum xfs_refc_adjust_op adjust, + int flags, + bool *shape_changed) +{ + struct xfs_refcount_irec left = {0}, cleft = {0}; + struct xfs_refcount_irec cright = {0}, right = {0}; + int error; + unsigned long long ulen; + bool cequal; + + *shape_changed = false; + /* + * Find the extent just below agbno [left], just above agbno [cleft], + * just below (agbno + aglen) [cright], and just above (agbno + aglen) + * [right]. + */ + error = xfs_refcount_find_left_extents(cur, &left, &cleft, *agbno, + *aglen, flags); + if (error) + return error; + error = xfs_refcount_find_right_extents(cur, &right, &cright, *agbno, + *aglen, flags); + if (error) + return error; + + /* No left or right extent to merge; exit. */ + if (!xfs_refc_valid(&left) && !xfs_refc_valid(&right)) + return 0; + + cequal = (cleft.rc_startblock == cright.rc_startblock) && + (cleft.rc_blockcount == cright.rc_blockcount); + + /* Try to merge left, cleft, and right. cleft must == cright. */ + ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount + + right.rc_blockcount; + if (xfs_refc_valid(&left) && xfs_refc_valid(&right) && + xfs_refc_valid(&cleft) && xfs_refc_valid(&cright) && cequal && + left.rc_refcount == cleft.rc_refcount + adjust && + right.rc_refcount == cleft.rc_refcount + adjust && + ulen < MAXREFCEXTLEN) { + *shape_changed = true; + return xfs_refcount_merge_center_extents(cur, &left, &cleft, + &right, ulen, agbno, aglen); + } + + /* Try to merge left and cleft. */ + ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount; + if (xfs_refc_valid(&left) && xfs_refc_valid(&cleft) && + left.rc_refcount == cleft.rc_refcount + adjust && + ulen < MAXREFCEXTLEN) { + *shape_changed = true; + error = xfs_refcount_merge_left_extent(cur, &left, &cleft, + agbno, aglen); + if (error) + return error; + + /* + * If we just merged left + cleft and cleft == cright, + * we no longer have a cright to merge with right. We're done. + */ + if (cequal) + return 0; + } + + /* Try to merge cright and right. */ + ulen = (unsigned long long)right.rc_blockcount + cright.rc_blockcount; + if (xfs_refc_valid(&right) && xfs_refc_valid(&cright) && + right.rc_refcount == cright.rc_refcount + adjust && + ulen < MAXREFCEXTLEN) { + *shape_changed = true; + return xfs_refcount_merge_right_extent(cur, &right, &cright, + agbno, aglen); + } + + return error; +} + +/* + * While we're adjusting the refcounts records of an extent, we have + * to keep an eye on the number of extents we're dirtying -- run too + * many in a single transaction and we'll exceed the transaction's + * reservation and crash the fs. Each record adds 12 bytes to the + * log (plus any key updates) so we'll conservatively assume 24 bytes + * per record. We must also leave space for btree splits on both ends + * of the range and space for the CUD and a new CUI. + * + * XXX: This is a pretty hand-wavy estimate. The penalty for guessing + * true incorrectly is a shutdown FS; the penalty for guessing false + * incorrectly is more transaction rolls than might be necessary. + * Be conservative here. + */ +static bool +xfs_refcount_still_have_space( + struct xfs_btree_cur *cur) +{ + unsigned long overhead; + + overhead = cur->bc_private.a.priv.refc.shape_changes * + xfs_allocfree_log_count(cur->bc_mp, 1); + overhead *= cur->bc_mp->m_sb.sb_blocksize; + + /* + * Only allow 2 refcount extent updates per transaction if the + * refcount continue update "error" has been injected. + */ + if (cur->bc_private.a.priv.refc.nr_ops > 2 && + XFS_TEST_ERROR(false, cur->bc_mp, + XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE, + XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE)) + return false; + + if (cur->bc_private.a.priv.refc.nr_ops == 0) + return true; + else if (overhead > cur->bc_tp->t_log_res) + return false; + return cur->bc_tp->t_log_res - overhead > + cur->bc_private.a.priv.refc.nr_ops * 32; +} + +/* + * Adjust the refcounts of middle extents. At this point we should have + * split extents that crossed the adjustment range; merged with adjacent + * extents; and updated agbno/aglen to reflect the merges. Therefore, + * all we have to do is update the extents inside [agbno, agbno + aglen]. + */ +STATIC int +xfs_refcount_adjust_extents( + struct xfs_btree_cur *cur, + xfs_agblock_t *agbno, + xfs_extlen_t *aglen, + enum xfs_refc_adjust_op adj, + struct xfs_defer_ops *dfops, + struct xfs_owner_info *oinfo) +{ + struct xfs_refcount_irec ext, tmp; + int error; + int found_rec, found_tmp; + xfs_fsblock_t fsbno; + + /* Merging did all the work already. */ + if (*aglen == 0) + return 0; + + error = xfs_refcount_lookup_ge(cur, *agbno, &found_rec); + if (error) + goto out_error; + + while (*aglen > 0 && xfs_refcount_still_have_space(cur)) { + error = xfs_refcount_get_rec(cur, &ext, &found_rec); + if (error) + goto out_error; + if (!found_rec) { + ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks; + ext.rc_blockcount = 0; + ext.rc_refcount = 0; + } + + /* + * Deal with a hole in the refcount tree; if a file maps to + * these blocks and there's no refcountbt record, pretend that + * there is one with refcount == 1. + */ + if (ext.rc_startblock != *agbno) { + tmp.rc_startblock = *agbno; + tmp.rc_blockcount = min(*aglen, + ext.rc_startblock - *agbno); + tmp.rc_refcount = 1 + adj; + trace_xfs_refcount_modify_extent(cur->bc_mp, + cur->bc_private.a.agno, &tmp); + + /* + * Either cover the hole (increment) or + * delete the range (decrement). + */ + if (tmp.rc_refcount) { + error = xfs_refcount_insert(cur, &tmp, + &found_tmp); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + found_tmp == 1, out_error); + cur->bc_private.a.priv.refc.nr_ops++; + } else { + fsbno = XFS_AGB_TO_FSB(cur->bc_mp, + cur->bc_private.a.agno, + tmp.rc_startblock); + xfs_bmap_add_free(cur->bc_mp, dfops, fsbno, + tmp.rc_blockcount, oinfo); + } + + (*agbno) += tmp.rc_blockcount; + (*aglen) -= tmp.rc_blockcount; + + error = xfs_refcount_lookup_ge(cur, *agbno, + &found_rec); + if (error) + goto out_error; + } + + /* Stop if there's nothing left to modify */ + if (*aglen == 0 || !xfs_refcount_still_have_space(cur)) + break; + + /* + * Adjust the reference count and either update the tree + * (incr) or free the blocks (decr). + */ + if (ext.rc_refcount == MAXREFCOUNT) + goto skip; + ext.rc_refcount += adj; + trace_xfs_refcount_modify_extent(cur->bc_mp, + cur->bc_private.a.agno, &ext); + if (ext.rc_refcount > 1) { + error = xfs_refcount_update(cur, &ext); + if (error) + goto out_error; + cur->bc_private.a.priv.refc.nr_ops++; + } else if (ext.rc_refcount == 1) { + error = xfs_refcount_delete(cur, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + found_rec == 1, out_error); + cur->bc_private.a.priv.refc.nr_ops++; + goto advloop; + } else { + fsbno = XFS_AGB_TO_FSB(cur->bc_mp, + cur->bc_private.a.agno, + ext.rc_startblock); + xfs_bmap_add_free(cur->bc_mp, dfops, fsbno, + ext.rc_blockcount, oinfo); + } + +skip: + error = xfs_btree_increment(cur, 0, &found_rec); + if (error) + goto out_error; + +advloop: + (*agbno) += ext.rc_blockcount; + (*aglen) -= ext.rc_blockcount; + } + + return error; +out_error: + trace_xfs_refcount_modify_extent_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* Adjust the reference count of a range of AG blocks. */ +STATIC int +xfs_refcount_adjust( + struct xfs_btree_cur *cur, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + xfs_agblock_t *new_agbno, + xfs_extlen_t *new_aglen, + enum xfs_refc_adjust_op adj, + struct xfs_defer_ops *dfops, + struct xfs_owner_info *oinfo) +{ + bool shape_changed; + int shape_changes = 0; + int error; + + *new_agbno = agbno; + *new_aglen = aglen; + if (adj == XFS_REFCOUNT_ADJUST_INCREASE) + trace_xfs_refcount_increase(cur->bc_mp, cur->bc_private.a.agno, + agbno, aglen); + else + trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_private.a.agno, + agbno, aglen); + + /* + * Ensure that no rcextents cross the boundary of the adjustment range. + */ + error = xfs_refcount_split_extent(cur, agbno, &shape_changed); + if (error) + goto out_error; + if (shape_changed) + shape_changes++; + + error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed); + if (error) + goto out_error; + if (shape_changed) + shape_changes++; + + /* + * Try to merge with the left or right extents of the range. + */ + error = xfs_refcount_merge_extents(cur, new_agbno, new_aglen, adj, + XFS_FIND_RCEXT_SHARED, &shape_changed); + if (error) + goto out_error; + if (shape_changed) + shape_changes++; + if (shape_changes) + cur->bc_private.a.priv.refc.shape_changes++; + + /* Now that we've taken care of the ends, adjust the middle extents */ + error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, + adj, dfops, oinfo); + if (error) + goto out_error; + + return 0; + +out_error: + trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_private.a.agno, + error, _RET_IP_); + return error; +} + +/* Clean up after calling xfs_refcount_finish_one. */ +void +xfs_refcount_finish_one_cleanup( + struct xfs_trans *tp, + struct xfs_btree_cur *rcur, + int error) +{ + struct xfs_buf *agbp; + + if (rcur == NULL) + return; + agbp = rcur->bc_private.a.agbp; + xfs_btree_del_cursor(rcur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + if (error) + xfs_trans_brelse(tp, agbp); +} + +/* + * Process one of the deferred refcount operations. We pass back the + * btree cursor to maintain our lock on the btree between calls. + * This saves time and eliminates a buffer deadlock between the + * superblock and the AGF because we'll always grab them in the same + * order. + */ +int +xfs_refcount_finish_one( + struct xfs_trans *tp, + struct xfs_defer_ops *dfops, + enum xfs_refcount_intent_type type, + xfs_fsblock_t startblock, + xfs_extlen_t blockcount, + xfs_fsblock_t *new_fsb, + xfs_extlen_t *new_len, + struct xfs_btree_cur **pcur) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_btree_cur *rcur; + struct xfs_buf *agbp = NULL; + int error = 0; + xfs_agnumber_t agno; + xfs_agblock_t bno; + xfs_agblock_t new_agbno; + unsigned long nr_ops = 0; + int shape_changes = 0; + + agno = XFS_FSB_TO_AGNO(mp, startblock); + ASSERT(agno != NULLAGNUMBER); + bno = XFS_FSB_TO_AGBNO(mp, startblock); + + trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, startblock), + type, XFS_FSB_TO_AGBNO(mp, startblock), + blockcount); + + if (XFS_TEST_ERROR(false, mp, + XFS_ERRTAG_REFCOUNT_FINISH_ONE, + XFS_RANDOM_REFCOUNT_FINISH_ONE)) + return -EIO; + + /* + * If we haven't gotten a cursor or the cursor AG doesn't match + * the startblock, get one now. + */ + rcur = *pcur; + if (rcur != NULL && rcur->bc_private.a.agno != agno) { + nr_ops = rcur->bc_private.a.priv.refc.nr_ops; + shape_changes = rcur->bc_private.a.priv.refc.shape_changes; + xfs_refcount_finish_one_cleanup(tp, rcur, 0); + rcur = NULL; + *pcur = NULL; + } + if (rcur == NULL) { + error = xfs_alloc_read_agf(tp->t_mountp, tp, agno, + XFS_ALLOC_FLAG_FREEING, &agbp); + if (error) + return error; + if (!agbp) + return -EFSCORRUPTED; + + rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, dfops); + if (!rcur) { + error = -ENOMEM; + goto out_cur; + } + rcur->bc_private.a.priv.refc.nr_ops = nr_ops; + rcur->bc_private.a.priv.refc.shape_changes = shape_changes; + } + *pcur = rcur; + + switch (type) { + case XFS_REFCOUNT_INCREASE: + error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, + new_len, XFS_REFCOUNT_ADJUST_INCREASE, dfops, NULL); + *new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno); + break; + case XFS_REFCOUNT_DECREASE: + error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, + new_len, XFS_REFCOUNT_ADJUST_DECREASE, dfops, NULL); + *new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno); + break; + case XFS_REFCOUNT_ALLOC_COW: + *new_fsb = startblock + blockcount; + *new_len = 0; + error = __xfs_refcount_cow_alloc(rcur, bno, blockcount, dfops); + break; + case XFS_REFCOUNT_FREE_COW: + *new_fsb = startblock + blockcount; + *new_len = 0; + error = __xfs_refcount_cow_free(rcur, bno, blockcount, dfops); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + } + if (!error && *new_len > 0) + trace_xfs_refcount_finish_one_leftover(mp, agno, type, + bno, blockcount, new_agbno, *new_len); + return error; + +out_cur: + xfs_trans_brelse(tp, agbp); + + return error; +} + +/* + * Record a refcount intent for later processing. + */ +static int +__xfs_refcount_add( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + enum xfs_refcount_intent_type type, + xfs_fsblock_t startblock, + xfs_extlen_t blockcount) +{ + struct xfs_refcount_intent *ri; + + trace_xfs_refcount_defer(mp, XFS_FSB_TO_AGNO(mp, startblock), + type, XFS_FSB_TO_AGBNO(mp, startblock), + blockcount); + + ri = kmem_alloc(sizeof(struct xfs_refcount_intent), + KM_SLEEP | KM_NOFS); + INIT_LIST_HEAD(&ri->ri_list); + ri->ri_type = type; + ri->ri_startblock = startblock; + ri->ri_blockcount = blockcount; + + xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list); + return 0; +} + +/* + * Increase the reference count of the blocks backing a file's extent. + */ +int +xfs_refcount_increase_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + struct xfs_bmbt_irec *PREV) +{ + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + + return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_INCREASE, + PREV->br_startblock, PREV->br_blockcount); +} + +/* + * Decrease the reference count of the blocks backing a file's extent. + */ +int +xfs_refcount_decrease_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + struct xfs_bmbt_irec *PREV) +{ + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + + return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_DECREASE, + PREV->br_startblock, PREV->br_blockcount); +} + +/* + * Given an AG extent, find the lowest-numbered run of shared blocks + * within that range and return the range in fbno/flen. If + * find_end_of_shared is set, return the longest contiguous extent of + * shared blocks; if not, just return the first extent we find. If no + * shared blocks are found, fbno and flen will be set to NULLAGBLOCK + * and 0, respectively. + */ +int +xfs_refcount_find_shared( + struct xfs_btree_cur *cur, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + xfs_agblock_t *fbno, + xfs_extlen_t *flen, + bool find_end_of_shared) +{ + struct xfs_refcount_irec tmp; + int i; + int have; + int error; + + trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_private.a.agno, + agbno, aglen); + + /* By default, skip the whole range */ + *fbno = NULLAGBLOCK; + *flen = 0; + + /* Try to find a refcount extent that crosses the start */ + error = xfs_refcount_lookup_le(cur, agbno, &have); + if (error) + goto out_error; + if (!have) { + /* No left extent, look at the next one */ + error = xfs_btree_increment(cur, 0, &have); + if (error) + goto out_error; + if (!have) + goto done; + } + error = xfs_refcount_get_rec(cur, &tmp, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error); + + /* If the extent ends before the start, look at the next one */ + if (tmp.rc_startblock + tmp.rc_blockcount <= agbno) { + error = xfs_btree_increment(cur, 0, &have); + if (error) + goto out_error; + if (!have) + goto done; + error = xfs_refcount_get_rec(cur, &tmp, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error); + } + + /* If the extent starts after the range we want, bail out */ + if (tmp.rc_startblock >= agbno + aglen) + goto done; + + /* We found the start of a shared extent! */ + if (tmp.rc_startblock < agbno) { + tmp.rc_blockcount -= (agbno - tmp.rc_startblock); + tmp.rc_startblock = agbno; + } + + *fbno = tmp.rc_startblock; + *flen = min(tmp.rc_blockcount, agbno + aglen - *fbno); + if (!find_end_of_shared) + goto done; + + /* Otherwise, find the end of this shared extent */ + while (*fbno + *flen < agbno + aglen) { + error = xfs_btree_increment(cur, 0, &have); + if (error) + goto out_error; + if (!have) + break; + error = xfs_refcount_get_rec(cur, &tmp, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error); + if (tmp.rc_startblock >= agbno + aglen || + tmp.rc_startblock != *fbno + *flen) + break; + *flen = min(*flen + tmp.rc_blockcount, agbno + aglen - *fbno); + } + +done: + trace_xfs_refcount_find_shared_result(cur->bc_mp, + cur->bc_private.a.agno, *fbno, *flen); + +out_error: + if (error) + trace_xfs_refcount_find_shared_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Recovering CoW Blocks After a Crash + * + * Due to the way that the copy on write mechanism works, there's a window of + * opportunity in which we can lose track of allocated blocks during a crash. + * Because CoW uses delayed allocation in the in-core CoW fork, writeback + * causes blocks to be allocated and stored in the CoW fork. The blocks are + * no longer in the free space btree but are not otherwise recorded anywhere + * until the write completes and the blocks are mapped into the file. A crash + * in between allocation and remapping results in the replacement blocks being + * lost. This situation is exacerbated by the CoW extent size hint because + * allocations can hang around for long time. + * + * However, there is a place where we can record these allocations before they + * become mappings -- the reference count btree. The btree does not record + * extents with refcount == 1, so we can record allocations with a refcount of + * 1. Blocks being used for CoW writeout cannot be shared, so there should be + * no conflict with shared block records. These mappings should be created + * when we allocate blocks to the CoW fork and deleted when they're removed + * from the CoW fork. + * + * Minor nit: records for in-progress CoW allocations and records for shared + * extents must never be merged, to preserve the property that (except for CoW + * allocations) there are no refcount btree entries with refcount == 1. The + * only time this could potentially happen is when unsharing a block that's + * adjacent to CoW allocations, so we must be careful to avoid this. + * + * At mount time we recover lost CoW allocations by searching the refcount + * btree for these refcount == 1 mappings. These represent CoW allocations + * that were in progress at the time the filesystem went down, so we can free + * them to get the space back. + * + * This mechanism is superior to creating EFIs for unmapped CoW extents for + * several reasons -- first, EFIs pin the tail of the log and would have to be + * periodically relogged to avoid filling up the log. Second, CoW completions + * will have to file an EFD and create new EFIs for whatever remains in the + * CoW fork; this partially takes care of (1) but extent-size reservations + * will have to periodically relog even if there's no writeout in progress. + * This can happen if the CoW extent size hint is set, which you really want. + * Third, EFIs cannot currently be automatically relogged into newer + * transactions to advance the log tail. Fourth, stuffing the log full of + * EFIs places an upper bound on the number of CoW allocations that can be + * held filesystem-wide at any given time. Recording them in the refcount + * btree doesn't require us to maintain any state in memory and doesn't pin + * the log. + */ +/* + * Adjust the refcounts of CoW allocations. These allocations are "magic" + * in that they're not referenced anywhere else in the filesystem, so we + * stash them in the refcount btree with a refcount of 1 until either file + * remapping (or CoW cancellation) happens. + */ +STATIC int +xfs_refcount_adjust_cow_extents( + struct xfs_btree_cur *cur, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + enum xfs_refc_adjust_op adj, + struct xfs_defer_ops *dfops, + struct xfs_owner_info *oinfo) +{ + struct xfs_refcount_irec ext, tmp; + int error; + int found_rec, found_tmp; + + if (aglen == 0) + return 0; + + /* Find any overlapping refcount records */ + error = xfs_refcount_lookup_ge(cur, agbno, &found_rec); + if (error) + goto out_error; + error = xfs_refcount_get_rec(cur, &ext, &found_rec); + if (error) + goto out_error; + if (!found_rec) { + ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks + + XFS_REFC_COW_START; + ext.rc_blockcount = 0; + ext.rc_refcount = 0; + } + + switch (adj) { + case XFS_REFCOUNT_ADJUST_COW_ALLOC: + /* Adding a CoW reservation, there should be nothing here. */ + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + ext.rc_startblock >= agbno + aglen, out_error); + + tmp.rc_startblock = agbno; + tmp.rc_blockcount = aglen; + tmp.rc_refcount = 1; + trace_xfs_refcount_modify_extent(cur->bc_mp, + cur->bc_private.a.agno, &tmp); + + error = xfs_refcount_insert(cur, &tmp, + &found_tmp); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + found_tmp == 1, out_error); + break; + case XFS_REFCOUNT_ADJUST_COW_FREE: + /* Removing a CoW reservation, there should be one extent. */ + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + ext.rc_startblock == agbno, out_error); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + ext.rc_blockcount == aglen, out_error); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + ext.rc_refcount == 1, out_error); + + ext.rc_refcount = 0; + trace_xfs_refcount_modify_extent(cur->bc_mp, + cur->bc_private.a.agno, &ext); + error = xfs_refcount_delete(cur, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + found_rec == 1, out_error); + break; + default: + ASSERT(0); + } + + return error; +out_error: + trace_xfs_refcount_modify_extent_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Add or remove refcount btree entries for CoW reservations. + */ +STATIC int +xfs_refcount_adjust_cow( + struct xfs_btree_cur *cur, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + enum xfs_refc_adjust_op adj, + struct xfs_defer_ops *dfops) +{ + bool shape_changed; + int error; + + agbno += XFS_REFC_COW_START; + + /* + * Ensure that no rcextents cross the boundary of the adjustment range. + */ + error = xfs_refcount_split_extent(cur, agbno, &shape_changed); + if (error) + goto out_error; + + error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed); + if (error) + goto out_error; + + /* + * Try to merge with the left or right extents of the range. + */ + error = xfs_refcount_merge_extents(cur, &agbno, &aglen, adj, + XFS_FIND_RCEXT_COW, &shape_changed); + if (error) + goto out_error; + + /* Now that we've taken care of the ends, adjust the middle extents */ + error = xfs_refcount_adjust_cow_extents(cur, agbno, aglen, adj, + dfops, NULL); + if (error) + goto out_error; + + return 0; + +out_error: + trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_private.a.agno, + error, _RET_IP_); + return error; +} + +/* + * Record a CoW allocation in the refcount btree. + */ +STATIC int +__xfs_refcount_cow_alloc( + struct xfs_btree_cur *rcur, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + struct xfs_defer_ops *dfops) +{ + int error; + + trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_private.a.agno, + agbno, aglen); + + /* Add refcount btree reservation */ + error = xfs_refcount_adjust_cow(rcur, agbno, aglen, + XFS_REFCOUNT_ADJUST_COW_ALLOC, dfops); + if (error) + return error; + + /* Add rmap entry */ + if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) { + error = xfs_rmap_alloc_extent(rcur->bc_mp, dfops, + rcur->bc_private.a.agno, + agbno, aglen, XFS_RMAP_OWN_COW); + if (error) + return error; + } + + return error; +} + +/* + * Remove a CoW allocation from the refcount btree. + */ +STATIC int +__xfs_refcount_cow_free( + struct xfs_btree_cur *rcur, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + struct xfs_defer_ops *dfops) +{ + int error; + + trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_private.a.agno, + agbno, aglen); + + /* Remove refcount btree reservation */ + error = xfs_refcount_adjust_cow(rcur, agbno, aglen, + XFS_REFCOUNT_ADJUST_COW_FREE, dfops); + if (error) + return error; + + /* Remove rmap entry */ + if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) { + error = xfs_rmap_free_extent(rcur->bc_mp, dfops, + rcur->bc_private.a.agno, + agbno, aglen, XFS_RMAP_OWN_COW); + if (error) + return error; + } + + return error; +} + +/* Record a CoW staging extent in the refcount btree. */ +int +xfs_refcount_alloc_cow_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + xfs_fsblock_t fsb, + xfs_extlen_t len) +{ + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + + return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_ALLOC_COW, + fsb, len); +} + +/* Forget a CoW staging event in the refcount btree. */ +int +xfs_refcount_free_cow_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + xfs_fsblock_t fsb, + xfs_extlen_t len) +{ + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + + return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_FREE_COW, + fsb, len); +} + +struct xfs_refcount_recovery { + struct list_head rr_list; + struct xfs_refcount_irec rr_rrec; +}; + +/* Stuff an extent on the recovery list. */ +STATIC int +xfs_refcount_recover_extent( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + void *priv) +{ + struct list_head *debris = priv; + struct xfs_refcount_recovery *rr; + + if (be32_to_cpu(rec->refc.rc_refcount) != 1) + return -EFSCORRUPTED; + + rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), KM_SLEEP); + xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); + list_add_tail(&rr->rr_list, debris); + + return 0; +} + +/* Find and remove leftover CoW reservations. */ +int +xfs_refcount_recover_cow_leftovers( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + struct xfs_trans *tp; + struct xfs_btree_cur *cur; + struct xfs_buf *agbp; + struct xfs_refcount_recovery *rr, *n; + struct list_head debris; + union xfs_btree_irec low; + union xfs_btree_irec high; + struct xfs_defer_ops dfops; + xfs_fsblock_t fsb; + xfs_agblock_t agbno; + int error; + + if (mp->m_sb.sb_agblocks >= XFS_REFC_COW_START) + return -EOPNOTSUPP; + + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error) + return error; + cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); + + /* Find all the leftover CoW staging extents. */ + INIT_LIST_HEAD(&debris); + memset(&low, 0, sizeof(low)); + memset(&high, 0, sizeof(high)); + low.rc.rc_startblock = XFS_REFC_COW_START; + high.rc.rc_startblock = -1U; + error = xfs_btree_query_range(cur, &low, &high, + xfs_refcount_recover_extent, &debris); + if (error) + goto out_cursor; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + xfs_buf_relse(agbp); + + /* Now iterate the list to free the leftovers */ + list_for_each_entry(rr, &debris, rr_list) { + /* Set up transaction. */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); + if (error) + goto out_free; + + trace_xfs_refcount_recover_extent(mp, agno, &rr->rr_rrec); + + /* Free the orphan record */ + xfs_defer_init(&dfops, &fsb); + agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START; + fsb = XFS_AGB_TO_FSB(mp, agno, agbno); + error = xfs_refcount_free_cow_extent(mp, &dfops, fsb, + rr->rr_rrec.rc_blockcount); + if (error) + goto out_defer; + + /* Free the block. */ + xfs_bmap_add_free(mp, &dfops, fsb, + rr->rr_rrec.rc_blockcount, NULL); + + error = xfs_defer_finish(&tp, &dfops, NULL); + if (error) + goto out_defer; + + error = xfs_trans_commit(tp); + if (error) + goto out_free; + } + +out_free: + /* Free the leftover list */ + list_for_each_entry_safe(rr, n, &debris, rr_list) { + list_del(&rr->rr_list); + kmem_free(rr); + } + return error; + +out_cursor: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + xfs_buf_relse(agbp); + goto out_free; + +out_defer: + xfs_defer_cancel(&dfops); + xfs_trans_cancel(tp); + goto out_free; +} diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h new file mode 100644 index 0000000..098dc66 --- /dev/null +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_REFCOUNT_H__ +#define __XFS_REFCOUNT_H__ + +extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur, + xfs_agblock_t bno, int *stat); +extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur, + xfs_agblock_t bno, int *stat); +extern int xfs_refcount_get_rec(struct xfs_btree_cur *cur, + struct xfs_refcount_irec *irec, int *stat); + +enum xfs_refcount_intent_type { + XFS_REFCOUNT_INCREASE = 1, + XFS_REFCOUNT_DECREASE, + XFS_REFCOUNT_ALLOC_COW, + XFS_REFCOUNT_FREE_COW, +}; + +struct xfs_refcount_intent { + struct list_head ri_list; + enum xfs_refcount_intent_type ri_type; + xfs_fsblock_t ri_startblock; + xfs_extlen_t ri_blockcount; +}; + +extern int xfs_refcount_increase_extent(struct xfs_mount *mp, + struct xfs_defer_ops *dfops, struct xfs_bmbt_irec *irec); +extern int xfs_refcount_decrease_extent(struct xfs_mount *mp, + struct xfs_defer_ops *dfops, struct xfs_bmbt_irec *irec); + +extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp, + struct xfs_btree_cur *rcur, int error); +extern int xfs_refcount_finish_one(struct xfs_trans *tp, + struct xfs_defer_ops *dfops, enum xfs_refcount_intent_type type, + xfs_fsblock_t startblock, xfs_extlen_t blockcount, + xfs_fsblock_t *new_fsb, xfs_extlen_t *new_len, + struct xfs_btree_cur **pcur); + +extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur, + xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, + xfs_extlen_t *flen, bool find_end_of_shared); + +extern int xfs_refcount_alloc_cow_extent(struct xfs_mount *mp, + struct xfs_defer_ops *dfops, xfs_fsblock_t fsb, + xfs_extlen_t len); +extern int xfs_refcount_free_cow_extent(struct xfs_mount *mp, + struct xfs_defer_ops *dfops, xfs_fsblock_t fsb, + xfs_extlen_t len); +extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, + xfs_agnumber_t agno); + +#endif /* __XFS_REFCOUNT_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c new file mode 100644 index 0000000..453bb27 --- /dev/null +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -0,0 +1,451 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_bmap.h" +#include "xfs_refcount_btree.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" +#include "xfs_bit.h" +#include "xfs_rmap.h" + +static struct xfs_btree_cur * +xfs_refcountbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agbp, cur->bc_private.a.agno, + cur->bc_private.a.dfops); +} + +STATIC void +xfs_refcountbt_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int inc) +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); + struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); + + ASSERT(ptr->s != 0); + + agf->agf_refcount_root = ptr->s; + be32_add_cpu(&agf->agf_refcount_level, inc); + pag->pagf_refcount_level += inc; + xfs_perag_put(pag); + + xfs_alloc_log_agf(cur->bc_tp, agbp, + XFS_AGF_REFCOUNT_ROOT | XFS_AGF_REFCOUNT_LEVEL); +} + +STATIC int +xfs_refcountbt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_alloc_arg args; /* block allocation args */ + int error; /* error return value */ + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + memset(&args, 0, sizeof(args)); + args.tp = cur->bc_tp; + args.mp = cur->bc_mp; + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, + xfs_refc_block(args.mp)); + args.firstblock = args.fsbno; + xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_REFC); + args.minlen = args.maxlen = args.prod = 1; + args.resv = XFS_AG_RESV_METADATA; + + error = xfs_alloc_vextent(&args); + if (error) + goto out_error; + trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno, + args.agbno, 1); + if (args.fsbno == NULLFSBLOCK) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + ASSERT(args.agno == cur->bc_private.a.agno); + ASSERT(args.len == 1); + + new->s = cpu_to_be32(args.agbno); + be32_add_cpu(&agf->agf_refcount_blocks, 1); + xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +out_error: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +STATIC int +xfs_refcountbt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); + struct xfs_owner_info oinfo; + int error; + + trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_private.a.agno, + XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1); + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC); + be32_add_cpu(&agf->agf_refcount_blocks, -1); + xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); + error = xfs_free_extent(cur->bc_tp, fsbno, 1, &oinfo, + XFS_AG_RESV_METADATA); + if (error) + return error; + + return error; +} + +STATIC int +xfs_refcountbt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_refc_mnr[level != 0]; +} + +STATIC int +xfs_refcountbt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_refc_mxr[level != 0]; +} + +STATIC void +xfs_refcountbt_init_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + key->refc.rc_startblock = rec->refc.rc_startblock; +} + +STATIC void +xfs_refcountbt_init_high_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + __u32 x; + + x = be32_to_cpu(rec->refc.rc_startblock); + x += be32_to_cpu(rec->refc.rc_blockcount) - 1; + key->refc.rc_startblock = cpu_to_be32(x); +} + +STATIC void +xfs_refcountbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + rec->refc.rc_startblock = cpu_to_be32(cur->bc_rec.rc.rc_startblock); + rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount); + rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount); +} + +STATIC void +xfs_refcountbt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + + ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(agf->agf_refcount_root != 0); + + ptr->s = agf->agf_refcount_root; +} + +STATIC __int64_t +xfs_refcountbt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) +{ + struct xfs_refcount_irec *rec = &cur->bc_rec.rc; + struct xfs_refcount_key *kp = &key->refc; + + return (__int64_t)be32_to_cpu(kp->rc_startblock) - rec->rc_startblock; +} + +STATIC __int64_t +xfs_refcountbt_diff_two_keys( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return (__int64_t)be32_to_cpu(k1->refc.rc_startblock) - + be32_to_cpu(k2->refc.rc_startblock); +} + +STATIC bool +xfs_refcountbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; + unsigned int level; + + if (block->bb_magic != cpu_to_be32(XFS_REFC_CRC_MAGIC)) + return false; + + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return false; + if (!xfs_btree_sblock_v5hdr_verify(bp)) + return false; + + level = be16_to_cpu(block->bb_level); + if (pag && pag->pagf_init) { + if (level >= pag->pagf_refcount_level) + return false; + } else if (level >= mp->m_refc_maxlevels) + return false; + + return xfs_btree_sblock_verify(bp, mp->m_refc_mxr[level != 0]); +} + +STATIC void +xfs_refcountbt_read_verify( + struct xfs_buf *bp) +{ + if (!xfs_btree_sblock_verify_crc(bp)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (!xfs_refcountbt_verify(bp)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp); + } +} + +STATIC void +xfs_refcountbt_write_verify( + struct xfs_buf *bp) +{ + if (!xfs_refcountbt_verify(bp)) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + xfs_btree_sblock_calc_crc(bp); + +} + +const struct xfs_buf_ops xfs_refcountbt_buf_ops = { + .name = "xfs_refcountbt", + .verify_read = xfs_refcountbt_read_verify, + .verify_write = xfs_refcountbt_write_verify, +}; + +#if defined(DEBUG) || defined(XFS_WARN) +STATIC int +xfs_refcountbt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return be32_to_cpu(k1->refc.rc_startblock) < + be32_to_cpu(k2->refc.rc_startblock); +} + +STATIC int +xfs_refcountbt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) +{ + return be32_to_cpu(r1->refc.rc_startblock) + + be32_to_cpu(r1->refc.rc_blockcount) <= + be32_to_cpu(r2->refc.rc_startblock); +} +#endif + +static const struct xfs_btree_ops xfs_refcountbt_ops = { + .rec_len = sizeof(struct xfs_refcount_rec), + .key_len = sizeof(struct xfs_refcount_key), + + .dup_cursor = xfs_refcountbt_dup_cursor, + .set_root = xfs_refcountbt_set_root, + .alloc_block = xfs_refcountbt_alloc_block, + .free_block = xfs_refcountbt_free_block, + .get_minrecs = xfs_refcountbt_get_minrecs, + .get_maxrecs = xfs_refcountbt_get_maxrecs, + .init_key_from_rec = xfs_refcountbt_init_key_from_rec, + .init_high_key_from_rec = xfs_refcountbt_init_high_key_from_rec, + .init_rec_from_cur = xfs_refcountbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_refcountbt_init_ptr_from_cur, + .key_diff = xfs_refcountbt_key_diff, + .buf_ops = &xfs_refcountbt_buf_ops, + .diff_two_keys = xfs_refcountbt_diff_two_keys, +#if defined(DEBUG) || defined(XFS_WARN) + .keys_inorder = xfs_refcountbt_keys_inorder, + .recs_inorder = xfs_refcountbt_recs_inorder, +#endif +}; + +/* + * Allocate a new refcount btree cursor. + */ +struct xfs_btree_cur * +xfs_refcountbt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno, + struct xfs_defer_ops *dfops) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_btree_cur *cur; + + ASSERT(agno != NULLAGNUMBER); + ASSERT(agno < mp->m_sb.sb_agcount); + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); + + cur->bc_tp = tp; + cur->bc_mp = mp; + cur->bc_btnum = XFS_BTNUM_REFC; + cur->bc_blocklog = mp->m_sb.sb_blocklog; + cur->bc_ops = &xfs_refcountbt_ops; + + cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); + + cur->bc_private.a.agbp = agbp; + cur->bc_private.a.agno = agno; + cur->bc_private.a.dfops = dfops; + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; + + cur->bc_private.a.priv.refc.nr_ops = 0; + cur->bc_private.a.priv.refc.shape_changes = 0; + + return cur; +} + +/* + * Calculate the number of records in a refcount btree block. + */ +int +xfs_refcountbt_maxrecs( + struct xfs_mount *mp, + int blocklen, + bool leaf) +{ + blocklen -= XFS_REFCOUNT_BLOCK_LEN; + + if (leaf) + return blocklen / sizeof(struct xfs_refcount_rec); + return blocklen / (sizeof(struct xfs_refcount_key) + + sizeof(xfs_refcount_ptr_t)); +} + +/* Compute the maximum height of a refcount btree. */ +void +xfs_refcountbt_compute_maxlevels( + struct xfs_mount *mp) +{ + mp->m_refc_maxlevels = xfs_btree_compute_maxlevels(mp, + mp->m_refc_mnr, mp->m_sb.sb_agblocks); +} + +/* Calculate the refcount btree size for some records. */ +xfs_extlen_t +xfs_refcountbt_calc_size( + struct xfs_mount *mp, + unsigned long long len) +{ + return xfs_btree_calc_size(mp, mp->m_refc_mnr, len); +} + +/* + * Calculate the maximum refcount btree size. + */ +xfs_extlen_t +xfs_refcountbt_max_size( + struct xfs_mount *mp) +{ + /* Bail out if we're uninitialized, which can happen in mkfs. */ + if (mp->m_refc_mxr[0] == 0) + return 0; + + return xfs_refcountbt_calc_size(mp, mp->m_sb.sb_agblocks); +} + +/* + * Figure out how many blocks to reserve and how many are used by this btree. + */ +int +xfs_refcountbt_calc_reserves( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_extlen_t *ask, + xfs_extlen_t *used) +{ + struct xfs_buf *agbp; + struct xfs_agf *agf; + xfs_extlen_t tree_len; + int error; + + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + + *ask += xfs_refcountbt_max_size(mp); + + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error) + return error; + + agf = XFS_BUF_TO_AGF(agbp); + tree_len = be32_to_cpu(agf->agf_refcount_blocks); + xfs_buf_relse(agbp); + + *used += tree_len; + + return error; +} diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h new file mode 100644 index 0000000..3be7768 --- /dev/null +++ b/fs/xfs/libxfs/xfs_refcount_btree.h @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_REFCOUNT_BTREE_H__ +#define __XFS_REFCOUNT_BTREE_H__ + +/* + * Reference Count Btree on-disk structures + */ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; + +/* + * Btree block header size + */ +#define XFS_REFCOUNT_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN + +/* + * Record, key, and pointer address macros for btree blocks. + * + * (note that some of these may appear unused, but they are used in userspace) + */ +#define XFS_REFCOUNT_REC_ADDR(block, index) \ + ((struct xfs_refcount_rec *) \ + ((char *)(block) + \ + XFS_REFCOUNT_BLOCK_LEN + \ + (((index) - 1) * sizeof(struct xfs_refcount_rec)))) + +#define XFS_REFCOUNT_KEY_ADDR(block, index) \ + ((struct xfs_refcount_key *) \ + ((char *)(block) + \ + XFS_REFCOUNT_BLOCK_LEN + \ + ((index) - 1) * sizeof(struct xfs_refcount_key))) + +#define XFS_REFCOUNT_PTR_ADDR(block, index, maxrecs) \ + ((xfs_refcount_ptr_t *) \ + ((char *)(block) + \ + XFS_REFCOUNT_BLOCK_LEN + \ + (maxrecs) * sizeof(struct xfs_refcount_key) + \ + ((index) - 1) * sizeof(xfs_refcount_ptr_t))) + +extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp, + struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agnumber_t agno, + struct xfs_defer_ops *dfops); +extern int xfs_refcountbt_maxrecs(struct xfs_mount *mp, int blocklen, + bool leaf); +extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp); + +extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp, + unsigned long long len); +extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp); + +extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp, + xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); + +#endif /* __XFS_REFCOUNT_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 73d0540..3a8cc71 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -148,6 +148,37 @@ done: return error; } +STATIC int +xfs_rmap_delete( + struct xfs_btree_cur *rcur, + xfs_agblock_t agbno, + xfs_extlen_t len, + uint64_t owner, + uint64_t offset, + unsigned int flags) +{ + int i; + int error; + + trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_private.a.agno, agbno, + len, owner, offset, flags); + + error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done); + + error = xfs_btree_delete(rcur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done); +done: + if (error) + trace_xfs_rmap_delete_error(rcur->bc_mp, + rcur->bc_private.a.agno, error, _RET_IP_); + return error; +} + static int xfs_rmap_btrec_to_irec( union xfs_btree_rec *rec, @@ -180,6 +211,160 @@ xfs_rmap_get_rec( return xfs_rmap_btrec_to_irec(rec, irec); } +struct xfs_find_left_neighbor_info { + struct xfs_rmap_irec high; + struct xfs_rmap_irec *irec; + int *stat; +}; + +/* For each rmap given, figure out if it matches the key we want. */ +STATIC int +xfs_rmap_find_left_neighbor_helper( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_find_left_neighbor_info *info = priv; + + trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp, + cur->bc_private.a.agno, rec->rm_startblock, + rec->rm_blockcount, rec->rm_owner, rec->rm_offset, + rec->rm_flags); + + if (rec->rm_owner != info->high.rm_owner) + return XFS_BTREE_QUERY_RANGE_CONTINUE; + if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && + !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && + rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset) + return XFS_BTREE_QUERY_RANGE_CONTINUE; + + *info->irec = *rec; + *info->stat = 1; + return XFS_BTREE_QUERY_RANGE_ABORT; +} + +/* + * Find the record to the left of the given extent, being careful only to + * return a match with the same owner and adjacent physical and logical + * block ranges. + */ +int +xfs_rmap_find_left_neighbor( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + uint64_t owner, + uint64_t offset, + unsigned int flags, + struct xfs_rmap_irec *irec, + int *stat) +{ + struct xfs_find_left_neighbor_info info; + int error; + + *stat = 0; + if (bno == 0) + return 0; + info.high.rm_startblock = bno - 1; + info.high.rm_owner = owner; + if (!XFS_RMAP_NON_INODE_OWNER(owner) && + !(flags & XFS_RMAP_BMBT_BLOCK)) { + if (offset == 0) + return 0; + info.high.rm_offset = offset - 1; + } else + info.high.rm_offset = 0; + info.high.rm_flags = flags; + info.high.rm_blockcount = 0; + info.irec = irec; + info.stat = stat; + + trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp, + cur->bc_private.a.agno, bno, 0, owner, offset, flags); + + error = xfs_rmap_query_range(cur, &info.high, &info.high, + xfs_rmap_find_left_neighbor_helper, &info); + if (error == XFS_BTREE_QUERY_RANGE_ABORT) + error = 0; + if (*stat) + trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, + cur->bc_private.a.agno, irec->rm_startblock, + irec->rm_blockcount, irec->rm_owner, + irec->rm_offset, irec->rm_flags); + return error; +} + +/* For each rmap given, figure out if it matches the key we want. */ +STATIC int +xfs_rmap_lookup_le_range_helper( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_find_left_neighbor_info *info = priv; + + trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp, + cur->bc_private.a.agno, rec->rm_startblock, + rec->rm_blockcount, rec->rm_owner, rec->rm_offset, + rec->rm_flags); + + if (rec->rm_owner != info->high.rm_owner) + return XFS_BTREE_QUERY_RANGE_CONTINUE; + if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && + !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && + (rec->rm_offset > info->high.rm_offset || + rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset)) + return XFS_BTREE_QUERY_RANGE_CONTINUE; + + *info->irec = *rec; + *info->stat = 1; + return XFS_BTREE_QUERY_RANGE_ABORT; +} + +/* + * Find the record to the left of the given extent, being careful only to + * return a match with the same owner and overlapping physical and logical + * block ranges. This is the overlapping-interval version of + * xfs_rmap_lookup_le. + */ +int +xfs_rmap_lookup_le_range( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + uint64_t owner, + uint64_t offset, + unsigned int flags, + struct xfs_rmap_irec *irec, + int *stat) +{ + struct xfs_find_left_neighbor_info info; + int error; + + info.high.rm_startblock = bno; + info.high.rm_owner = owner; + if (!XFS_RMAP_NON_INODE_OWNER(owner) && !(flags & XFS_RMAP_BMBT_BLOCK)) + info.high.rm_offset = offset; + else + info.high.rm_offset = 0; + info.high.rm_flags = flags; + info.high.rm_blockcount = 0; + *stat = 0; + info.irec = irec; + info.stat = stat; + + trace_xfs_rmap_lookup_le_range(cur->bc_mp, + cur->bc_private.a.agno, bno, 0, owner, offset, flags); + error = xfs_rmap_query_range(cur, &info.high, &info.high, + xfs_rmap_lookup_le_range_helper, &info); + if (error == XFS_BTREE_QUERY_RANGE_ABORT) + error = 0; + if (*stat) + trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, + cur->bc_private.a.agno, irec->rm_startblock, + irec->rm_blockcount, irec->rm_owner, + irec->rm_offset, irec->rm_flags); + return error; +} + /* * Find the extent in the rmap btree and remove it. * @@ -1093,11 +1278,704 @@ done: return error; } +/* + * Convert an unwritten extent to a real extent or vice versa. If there is no + * possibility of overlapping extents, delegate to the simpler convert + * function. + */ +STATIC int +xfs_rmap_convert_shared( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + struct xfs_owner_info *oinfo) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec r[4]; /* neighbor extent entries */ + /* left is 0, right is 1, prev is 2 */ + /* new is 3 */ + uint64_t owner; + uint64_t offset; + uint64_t new_endoff; + unsigned int oldext; + unsigned int newext; + unsigned int flags = 0; + int i; + int state = 0; + int error; + + xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); + ASSERT(!(XFS_RMAP_NON_INODE_OWNER(owner) || + (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))); + oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0; + new_endoff = offset + len; + trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); + + /* + * For the initial lookup, look for and exact match or the left-adjacent + * record for our insertion point. This will also give us the record for + * start block contiguity tests. + */ + error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags, + &PREV, &i); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + + ASSERT(PREV.rm_offset <= offset); + ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff); + ASSERT((PREV.rm_flags & XFS_RMAP_UNWRITTEN) == oldext); + newext = ~oldext & XFS_RMAP_UNWRITTEN; + + /* + * Set flags determining what part of the previous oldext allocation + * extent is being replaced by a newext allocation. + */ + if (PREV.rm_offset == offset) + state |= RMAP_LEFT_FILLING; + if (PREV.rm_offset + PREV.rm_blockcount == new_endoff) + state |= RMAP_RIGHT_FILLING; + + /* Is there a left record that abuts our range? */ + error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, newext, + &LEFT, &i); + if (error) + goto done; + if (i) { + state |= RMAP_LEFT_VALID; + XFS_WANT_CORRUPTED_GOTO(mp, + LEFT.rm_startblock + LEFT.rm_blockcount <= bno, + done); + if (xfs_rmap_is_mergeable(&LEFT, owner, newext)) + state |= RMAP_LEFT_CONTIG; + } + + /* Is there a right record that abuts our range? */ + error = xfs_rmap_lookup_eq(cur, bno + len, len, owner, offset + len, + newext, &i); + if (error) + goto done; + if (i) { + state |= RMAP_RIGHT_VALID; + error = xfs_rmap_get_rec(cur, &RIGHT, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= RIGHT.rm_startblock, + done); + trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, + cur->bc_private.a.agno, RIGHT.rm_startblock, + RIGHT.rm_blockcount, RIGHT.rm_owner, + RIGHT.rm_offset, RIGHT.rm_flags); + if (xfs_rmap_is_mergeable(&RIGHT, owner, newext)) + state |= RMAP_RIGHT_CONTIG; + } + + /* check that left + prev + right is not too long */ + if ((state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | + RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) == + (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | + RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG) && + (unsigned long)LEFT.rm_blockcount + len + + RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX) + state &= ~RMAP_RIGHT_CONTIG; + + trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state, + _RET_IP_); + /* + * Switch out based on the FILLING and CONTIG state bits. + */ + switch (state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | + RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) { + case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | + RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The left and right neighbors are both contiguous with new. + */ + error = xfs_rmap_delete(cur, RIGHT.rm_startblock, + RIGHT.rm_blockcount, RIGHT.rm_owner, + RIGHT.rm_offset, RIGHT.rm_flags); + if (error) + goto done; + error = xfs_rmap_delete(cur, PREV.rm_startblock, + PREV.rm_blockcount, PREV.rm_owner, + PREV.rm_offset, PREV.rm_flags); + if (error) + goto done; + NEW = LEFT; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount += PREV.rm_blockcount + RIGHT.rm_blockcount; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The left neighbor is contiguous, the right is not. + */ + error = xfs_rmap_delete(cur, PREV.rm_startblock, + PREV.rm_blockcount, PREV.rm_owner, + PREV.rm_offset, PREV.rm_flags); + if (error) + goto done; + NEW = LEFT; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount += PREV.rm_blockcount; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The right neighbor is contiguous, the left is not. + */ + error = xfs_rmap_delete(cur, RIGHT.rm_startblock, + RIGHT.rm_blockcount, RIGHT.rm_owner, + RIGHT.rm_offset, RIGHT.rm_flags); + if (error) + goto done; + NEW = PREV; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount += RIGHT.rm_blockcount; + NEW.rm_flags = RIGHT.rm_flags; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING: + /* + * Setting all of a previous oldext extent to newext. + * Neither the left nor right neighbors are contiguous with + * the new one. + */ + NEW = PREV; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_flags = newext; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG: + /* + * Setting the first part of a previous oldext extent to newext. + * The left neighbor is contiguous. + */ + NEW = PREV; + error = xfs_rmap_delete(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags); + if (error) + goto done; + NEW.rm_offset += len; + NEW.rm_startblock += len; + NEW.rm_blockcount -= len; + error = xfs_rmap_insert(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags); + if (error) + goto done; + NEW = LEFT; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount += len; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING: + /* + * Setting the first part of a previous oldext extent to newext. + * The left neighbor is not contiguous. + */ + NEW = PREV; + error = xfs_rmap_delete(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags); + if (error) + goto done; + NEW.rm_offset += len; + NEW.rm_startblock += len; + NEW.rm_blockcount -= len; + error = xfs_rmap_insert(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags); + if (error) + goto done; + error = xfs_rmap_insert(cur, bno, len, owner, offset, newext); + if (error) + goto done; + break; + + case RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG: + /* + * Setting the last part of a previous oldext extent to newext. + * The right neighbor is contiguous with the new allocation. + */ + NEW = PREV; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount = offset - NEW.rm_offset; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + NEW = RIGHT; + error = xfs_rmap_delete(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags); + if (error) + goto done; + NEW.rm_offset = offset; + NEW.rm_startblock = bno; + NEW.rm_blockcount += len; + error = xfs_rmap_insert(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags); + if (error) + goto done; + break; + + case RMAP_RIGHT_FILLING: + /* + * Setting the last part of a previous oldext extent to newext. + * The right neighbor is not contiguous. + */ + NEW = PREV; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount -= len; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + error = xfs_rmap_insert(cur, bno, len, owner, offset, newext); + if (error) + goto done; + break; + + case 0: + /* + * Setting the middle part of a previous oldext extent to + * newext. Contiguity is impossible here. + * One extent becomes three extents. + */ + /* new right extent - oldext */ + NEW.rm_startblock = bno + len; + NEW.rm_owner = owner; + NEW.rm_offset = new_endoff; + NEW.rm_blockcount = PREV.rm_offset + PREV.rm_blockcount - + new_endoff; + NEW.rm_flags = PREV.rm_flags; + error = xfs_rmap_insert(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset, + NEW.rm_flags); + if (error) + goto done; + /* new left extent - oldext */ + NEW = PREV; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount = offset - NEW.rm_offset; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + /* new middle extent - newext */ + NEW.rm_startblock = bno; + NEW.rm_blockcount = len; + NEW.rm_owner = owner; + NEW.rm_offset = offset; + NEW.rm_flags = newext; + error = xfs_rmap_insert(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset, + NEW.rm_flags); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG: + case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG: + case RMAP_LEFT_FILLING | RMAP_RIGHT_CONTIG: + case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG: + case RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG: + case RMAP_LEFT_CONTIG: + case RMAP_RIGHT_CONTIG: + /* + * These cases are all impossible. + */ + ASSERT(0); + } + + trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); +done: + if (error) + trace_xfs_rmap_convert_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + #undef NEW #undef LEFT #undef RIGHT #undef PREV +/* + * Find an extent in the rmap btree and unmap it. For rmap extent types that + * can overlap (data fork rmaps on reflink filesystems) we must be careful + * that the prev/next records in the btree might belong to another owner. + * Therefore we must use delete+insert to alter any of the key fields. + * + * For every other situation there can only be one owner for a given extent, + * so we can call the regular _free function. + */ +STATIC int +xfs_rmap_unmap_shared( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + struct xfs_owner_info *oinfo) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec ltrec; + uint64_t ltoff; + int error = 0; + int i; + uint64_t owner; + uint64_t offset; + unsigned int flags; + + xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); + if (unwritten) + flags |= XFS_RMAP_UNWRITTEN; + trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); + + /* + * We should always have a left record because there's a static record + * for the AG headers at rm_startblock == 0 created by mkfs/growfs that + * will not ever be removed from the tree. + */ + error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags, + <rec, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + ltoff = ltrec.rm_offset; + + /* Make sure the extent we found covers the entire freeing range. */ + XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno && + ltrec.rm_startblock + ltrec.rm_blockcount >= + bno + len, out_error); + + /* Make sure the owner matches what we expect to find in the tree. */ + XFS_WANT_CORRUPTED_GOTO(mp, owner == ltrec.rm_owner, out_error); + + /* Make sure the unwritten flag matches. */ + XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) == + (ltrec.rm_flags & XFS_RMAP_UNWRITTEN), out_error); + + /* Check the offset. */ + XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_offset <= offset, out_error); + XFS_WANT_CORRUPTED_GOTO(mp, offset <= ltoff + ltrec.rm_blockcount, + out_error); + + if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) { + /* Exact match, simply remove the record from rmap tree. */ + error = xfs_rmap_delete(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags); + if (error) + goto out_error; + } else if (ltrec.rm_startblock == bno) { + /* + * Overlap left hand side of extent: move the start, trim the + * length and update the current record. + * + * ltbno ltlen + * Orig: |oooooooooooooooooooo| + * Freeing: |fffffffff| + * Result: |rrrrrrrrrr| + * bno len + */ + + /* Delete prev rmap. */ + error = xfs_rmap_delete(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags); + if (error) + goto out_error; + + /* Add an rmap at the new offset. */ + ltrec.rm_startblock += len; + ltrec.rm_blockcount -= len; + ltrec.rm_offset += len; + error = xfs_rmap_insert(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags); + if (error) + goto out_error; + } else if (ltrec.rm_startblock + ltrec.rm_blockcount == bno + len) { + /* + * Overlap right hand side of extent: trim the length and + * update the current record. + * + * ltbno ltlen + * Orig: |oooooooooooooooooooo| + * Freeing: |fffffffff| + * Result: |rrrrrrrrrr| + * bno len + */ + error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + ltrec.rm_blockcount -= len; + error = xfs_rmap_update(cur, <rec); + if (error) + goto out_error; + } else { + /* + * Overlap middle of extent: trim the length of the existing + * record to the length of the new left-extent size, increment + * the insertion position so we can insert a new record + * containing the remaining right-extent space. + * + * ltbno ltlen + * Orig: |oooooooooooooooooooo| + * Freeing: |fffffffff| + * Result: |rrrrr| |rrrr| + * bno len + */ + xfs_extlen_t orig_len = ltrec.rm_blockcount; + + /* Shrink the left side of the rmap */ + error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + ltrec.rm_blockcount = bno - ltrec.rm_startblock; + error = xfs_rmap_update(cur, <rec); + if (error) + goto out_error; + + /* Add an rmap at the new offset */ + error = xfs_rmap_insert(cur, bno + len, + orig_len - len - ltrec.rm_blockcount, + ltrec.rm_owner, offset + len, + ltrec.rm_flags); + if (error) + goto out_error; + } + + trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); +out_error: + if (error) + trace_xfs_rmap_unmap_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Find an extent in the rmap btree and map it. For rmap extent types that + * can overlap (data fork rmaps on reflink filesystems) we must be careful + * that the prev/next records in the btree might belong to another owner. + * Therefore we must use delete+insert to alter any of the key fields. + * + * For every other situation there can only be one owner for a given extent, + * so we can call the regular _alloc function. + */ +STATIC int +xfs_rmap_map_shared( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + struct xfs_owner_info *oinfo) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec ltrec; + struct xfs_rmap_irec gtrec; + int have_gt; + int have_lt; + int error = 0; + int i; + uint64_t owner; + uint64_t offset; + unsigned int flags = 0; + + xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); + if (unwritten) + flags |= XFS_RMAP_UNWRITTEN; + trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); + + /* Is there a left record that abuts our range? */ + error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, flags, + <rec, &have_lt); + if (error) + goto out_error; + if (have_lt && + !xfs_rmap_is_mergeable(<rec, owner, flags)) + have_lt = 0; + + /* Is there a right record that abuts our range? */ + error = xfs_rmap_lookup_eq(cur, bno + len, len, owner, offset + len, + flags, &have_gt); + if (error) + goto out_error; + if (have_gt) { + error = xfs_rmap_get_rec(cur, >rec, &have_gt); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, have_gt == 1, out_error); + trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, + cur->bc_private.a.agno, gtrec.rm_startblock, + gtrec.rm_blockcount, gtrec.rm_owner, + gtrec.rm_offset, gtrec.rm_flags); + + if (!xfs_rmap_is_mergeable(>rec, owner, flags)) + have_gt = 0; + } + + if (have_lt && + ltrec.rm_startblock + ltrec.rm_blockcount == bno && + ltrec.rm_offset + ltrec.rm_blockcount == offset) { + /* + * Left edge contiguous, merge into left record. + * + * ltbno ltlen + * orig: |ooooooooo| + * adding: |aaaaaaaaa| + * result: |rrrrrrrrrrrrrrrrrrr| + * bno len + */ + ltrec.rm_blockcount += len; + if (have_gt && + bno + len == gtrec.rm_startblock && + offset + len == gtrec.rm_offset) { + /* + * Right edge also contiguous, delete right record + * and merge into left record. + * + * ltbno ltlen gtbno gtlen + * orig: |ooooooooo| |ooooooooo| + * adding: |aaaaaaaaa| + * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr| + */ + ltrec.rm_blockcount += gtrec.rm_blockcount; + error = xfs_rmap_delete(cur, gtrec.rm_startblock, + gtrec.rm_blockcount, gtrec.rm_owner, + gtrec.rm_offset, gtrec.rm_flags); + if (error) + goto out_error; + } + + /* Point the cursor back to the left record and update. */ + error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + + error = xfs_rmap_update(cur, <rec); + if (error) + goto out_error; + } else if (have_gt && + bno + len == gtrec.rm_startblock && + offset + len == gtrec.rm_offset) { + /* + * Right edge contiguous, merge into right record. + * + * gtbno gtlen + * Orig: |ooooooooo| + * adding: |aaaaaaaaa| + * Result: |rrrrrrrrrrrrrrrrrrr| + * bno len + */ + /* Delete the old record. */ + error = xfs_rmap_delete(cur, gtrec.rm_startblock, + gtrec.rm_blockcount, gtrec.rm_owner, + gtrec.rm_offset, gtrec.rm_flags); + if (error) + goto out_error; + + /* Move the start and re-add it. */ + gtrec.rm_startblock = bno; + gtrec.rm_blockcount += len; + gtrec.rm_offset = offset; + error = xfs_rmap_insert(cur, gtrec.rm_startblock, + gtrec.rm_blockcount, gtrec.rm_owner, + gtrec.rm_offset, gtrec.rm_flags); + if (error) + goto out_error; + } else { + /* + * No contiguous edge with identical owner, insert + * new record at current cursor position. + */ + error = xfs_rmap_insert(cur, bno, len, owner, offset, flags); + if (error) + goto out_error; + } + + trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); +out_error: + if (error) + trace_xfs_rmap_map_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + struct xfs_rmap_query_range_info { xfs_rmap_query_range_fn fn; void *priv; @@ -1237,15 +2115,27 @@ xfs_rmap_finish_one( case XFS_RMAP_MAP: error = xfs_rmap_map(rcur, bno, blockcount, unwritten, &oinfo); break; + case XFS_RMAP_MAP_SHARED: + error = xfs_rmap_map_shared(rcur, bno, blockcount, unwritten, + &oinfo); + break; case XFS_RMAP_FREE: case XFS_RMAP_UNMAP: error = xfs_rmap_unmap(rcur, bno, blockcount, unwritten, &oinfo); break; + case XFS_RMAP_UNMAP_SHARED: + error = xfs_rmap_unmap_shared(rcur, bno, blockcount, unwritten, + &oinfo); + break; case XFS_RMAP_CONVERT: error = xfs_rmap_convert(rcur, bno, blockcount, !unwritten, &oinfo); break; + case XFS_RMAP_CONVERT_SHARED: + error = xfs_rmap_convert_shared(rcur, bno, blockcount, + !unwritten, &oinfo); + break; default: ASSERT(0); error = -EFSCORRUPTED; @@ -1263,9 +2153,10 @@ out_cur: */ static bool xfs_rmap_update_is_needed( - struct xfs_mount *mp) + struct xfs_mount *mp, + int whichfork) { - return xfs_sb_version_hasrmapbt(&mp->m_sb); + return xfs_sb_version_hasrmapbt(&mp->m_sb) && whichfork != XFS_COW_FORK; } /* @@ -1311,10 +2202,11 @@ xfs_rmap_map_extent( int whichfork, struct xfs_bmbt_irec *PREV) { - if (!xfs_rmap_update_is_needed(mp)) + if (!xfs_rmap_update_is_needed(mp, whichfork)) return 0; - return __xfs_rmap_add(mp, dfops, XFS_RMAP_MAP, ip->i_ino, + return __xfs_rmap_add(mp, dfops, xfs_is_reflink_inode(ip) ? + XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino, whichfork, PREV); } @@ -1327,10 +2219,11 @@ xfs_rmap_unmap_extent( int whichfork, struct xfs_bmbt_irec *PREV) { - if (!xfs_rmap_update_is_needed(mp)) + if (!xfs_rmap_update_is_needed(mp, whichfork)) return 0; - return __xfs_rmap_add(mp, dfops, XFS_RMAP_UNMAP, ip->i_ino, + return __xfs_rmap_add(mp, dfops, xfs_is_reflink_inode(ip) ? + XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino, whichfork, PREV); } @@ -1343,10 +2236,11 @@ xfs_rmap_convert_extent( int whichfork, struct xfs_bmbt_irec *PREV) { - if (!xfs_rmap_update_is_needed(mp)) + if (!xfs_rmap_update_is_needed(mp, whichfork)) return 0; - return __xfs_rmap_add(mp, dfops, XFS_RMAP_CONVERT, ip->i_ino, + return __xfs_rmap_add(mp, dfops, xfs_is_reflink_inode(ip) ? + XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino, whichfork, PREV); } @@ -1362,7 +2256,7 @@ xfs_rmap_alloc_extent( { struct xfs_bmbt_irec bmap; - if (!xfs_rmap_update_is_needed(mp)) + if (!xfs_rmap_update_is_needed(mp, XFS_DATA_FORK)) return 0; bmap.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno); @@ -1386,7 +2280,7 @@ xfs_rmap_free_extent( { struct xfs_bmbt_irec bmap; - if (!xfs_rmap_update_is_needed(mp)) + if (!xfs_rmap_update_is_needed(mp, XFS_DATA_FORK)) return 0; bmap.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno); diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 71cf99a..7899305 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -206,4 +206,11 @@ int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type, xfs_fsblock_t startblock, xfs_filblks_t blockcount, xfs_exntst_t state, struct xfs_btree_cur **pcur); +int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno, + uint64_t owner, uint64_t offset, unsigned int flags, + struct xfs_rmap_irec *irec, int *stat); +int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, + uint64_t owner, uint64_t offset, unsigned int flags, + struct xfs_rmap_irec *irec, int *stat); + #endif /* __XFS_RMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 17b8eeb..83e672f 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -35,6 +35,7 @@ #include "xfs_cksum.h" #include "xfs_error.h" #include "xfs_extent_busy.h" +#include "xfs_ag_resv.h" /* * Reverse map btree. @@ -512,6 +513,83 @@ void xfs_rmapbt_compute_maxlevels( struct xfs_mount *mp) { - mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp, - mp->m_rmap_mnr, mp->m_sb.sb_agblocks); + /* + * On a non-reflink filesystem, the maximum number of rmap + * records is the number of blocks in the AG, hence the max + * rmapbt height is log_$maxrecs($agblocks). However, with + * reflink each AG block can have up to 2^32 (per the refcount + * record format) owners, which means that theoretically we + * could face up to 2^64 rmap records. + * + * That effectively means that the max rmapbt height must be + * XFS_BTREE_MAXLEVELS. "Fortunately" we'll run out of AG + * blocks to feed the rmapbt long before the rmapbt reaches + * maximum height. The reflink code uses ag_resv_critical to + * disallow reflinking when less than 10% of the per-AG metadata + * block reservation since the fallback is a regular file copy. + */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) + mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS; + else + mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp, + mp->m_rmap_mnr, mp->m_sb.sb_agblocks); +} + +/* Calculate the refcount btree size for some records. */ +xfs_extlen_t +xfs_rmapbt_calc_size( + struct xfs_mount *mp, + unsigned long long len) +{ + return xfs_btree_calc_size(mp, mp->m_rmap_mnr, len); +} + +/* + * Calculate the maximum refcount btree size. + */ +xfs_extlen_t +xfs_rmapbt_max_size( + struct xfs_mount *mp) +{ + /* Bail out if we're uninitialized, which can happen in mkfs. */ + if (mp->m_rmap_mxr[0] == 0) + return 0; + + return xfs_rmapbt_calc_size(mp, mp->m_sb.sb_agblocks); +} + +/* + * Figure out how many blocks to reserve and how many are used by this btree. + */ +int +xfs_rmapbt_calc_reserves( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_extlen_t *ask, + xfs_extlen_t *used) +{ + struct xfs_buf *agbp; + struct xfs_agf *agf; + xfs_extlen_t pool_len; + xfs_extlen_t tree_len; + int error; + + if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + return 0; + + /* Reserve 1% of the AG or enough for 1 block per record. */ + pool_len = max(mp->m_sb.sb_agblocks / 100, xfs_rmapbt_max_size(mp)); + *ask += pool_len; + + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error) + return error; + + agf = XFS_BUF_TO_AGF(agbp); + tree_len = be32_to_cpu(agf->agf_rmap_blocks); + xfs_buf_relse(agbp); + + *used += tree_len; + + return error; } diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index e73a553..2a9ac47 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -58,4 +58,11 @@ struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp, int xfs_rmapbt_maxrecs(struct xfs_mount *mp, int blocklen, int leaf); extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp); +extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp, + unsigned long long len); +extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp); + +extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp, + xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); + #endif /* __XFS_RMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 4aecc5f..a70aec9 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -38,6 +38,8 @@ #include "xfs_ialloc_btree.h" #include "xfs_log.h" #include "xfs_rmap_btree.h" +#include "xfs_bmap.h" +#include "xfs_refcount_btree.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. @@ -737,6 +739,13 @@ xfs_sb_mount_common( mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2; mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2; + mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, + true); + mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, + false); + mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2; + mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2; + mp->m_bsize = XFS_FSB_TO_BB(mp, 1); mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, sbp->sb_inopblock); diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 0c5b30b..c6f4eb4 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -39,6 +39,7 @@ extern const struct xfs_buf_ops xfs_agf_buf_ops; extern const struct xfs_buf_ops xfs_agfl_buf_ops; extern const struct xfs_buf_ops xfs_allocbt_buf_ops; extern const struct xfs_buf_ops xfs_rmapbt_buf_ops; +extern const struct xfs_buf_ops xfs_refcountbt_buf_ops; extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops; extern const struct xfs_buf_ops xfs_bmbt_buf_ops; @@ -122,6 +123,7 @@ int xfs_log_calc_minimum_size(struct xfs_mount *); #define XFS_INO_REF 2 #define XFS_ATTR_BTREE_REF 1 #define XFS_DQUOT_REF 1 +#define XFS_REFC_BTREE_REF 1 /* * Flags for xfs_trans_ichgtime(). diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 301ef2f..b456cca 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -67,13 +67,14 @@ xfs_calc_buf_res( * Per-extent log reservation for the btree changes involved in freeing or * allocating an extent. In classic XFS there were two trees that will be * modified (bnobt + cntbt). With rmap enabled, there are three trees - * (rmapbt). The number of blocks reserved is based on the formula: + * (rmapbt). With reflink, there are four trees (refcountbt). The number of + * blocks reserved is based on the formula: * * num trees * ((2 blocks/level * max depth) - 1) * * Keep in mind that max depth is calculated separately for each type of tree. */ -static uint +uint xfs_allocfree_log_count( struct xfs_mount *mp, uint num_ops) @@ -83,6 +84,8 @@ xfs_allocfree_log_count( blocks = num_ops * 2 * (2 * mp->m_ag_maxlevels - 1); if (xfs_sb_version_hasrmapbt(&mp->m_sb)) blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1); + if (xfs_sb_version_hasreflink(&mp->m_sb)) + blocks += num_ops * (2 * mp->m_refc_maxlevels - 1); return blocks; } @@ -809,11 +812,18 @@ xfs_trans_resv_calc( * require a permanent reservation on space. */ resp->tr_write.tr_logres = xfs_calc_write_reservation(mp); - resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; + if (xfs_sb_version_hasreflink(&mp->m_sb)) + resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; + else + resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp); - resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; + if (xfs_sb_version_hasreflink(&mp->m_sb)) + resp->tr_itruncate.tr_logcount = + XFS_ITRUNCATE_LOG_COUNT_REFLINK; + else + resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp); @@ -870,7 +880,10 @@ xfs_trans_resv_calc( resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp); - resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; + if (xfs_sb_version_hasreflink(&mp->m_sb)) + resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; + else + resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; /* diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index 0eb46ed..b7e5357 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -87,6 +87,7 @@ struct xfs_trans_resv { #define XFS_DEFAULT_LOG_COUNT 1 #define XFS_DEFAULT_PERM_LOG_COUNT 2 #define XFS_ITRUNCATE_LOG_COUNT 2 +#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8 #define XFS_INACTIVE_LOG_COUNT 2 #define XFS_CREATE_LOG_COUNT 2 #define XFS_CREATE_TMPFILE_LOG_COUNT 2 @@ -96,11 +97,13 @@ struct xfs_trans_resv { #define XFS_LINK_LOG_COUNT 2 #define XFS_RENAME_LOG_COUNT 2 #define XFS_WRITE_LOG_COUNT 2 +#define XFS_WRITE_LOG_COUNT_REFLINK 8 #define XFS_ADDAFORK_LOG_COUNT 2 #define XFS_ATTRINVAL_LOG_COUNT 1 #define XFS_ATTRSET_LOG_COUNT 3 #define XFS_ATTRRM_LOG_COUNT 3 void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp); +uint xfs_allocfree_log_count(struct xfs_mount *mp, uint num_ops); #endif /* __XFS_TRANS_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index 41e0428..7917f6e 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -21,6 +21,8 @@ /* * Components of space reservations. */ +#define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) \ + (((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0])) #define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) \ (((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0])) #define XFS_EXTENTADD_SPACE_RES(mp,w) (XFS_BM_MAXLEVELS(mp,w) - 1) @@ -28,6 +30,13 @@ (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ XFS_EXTENTADD_SPACE_RES(mp,w)) +#define XFS_SWAP_RMAP_SPACE_RES(mp,b,w)\ + (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ + XFS_EXTENTADD_SPACE_RES(mp,w) + \ + ((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \ + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \ + (mp)->m_rmap_maxlevels) #define XFS_DAENTER_1B(mp,w) \ ((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1) #define XFS_DAENTER_DBS(mp,w) \ diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 3d50364..8d74870 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -90,6 +90,7 @@ typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ */ #define XFS_DATA_FORK 0 #define XFS_ATTR_FORK 1 +#define XFS_COW_FORK 2 /* * Min numbers of data/attr fork btree root pointers. @@ -109,7 +110,7 @@ typedef enum { typedef enum { XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi, - XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_MAX + XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_MAX } xfs_btnum_t; struct xfs_name { diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index b6e527b..b468e04 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -236,7 +236,7 @@ xfs_set_mode(struct inode *inode, umode_t mode) iattr.ia_valid = ATTR_MODE | ATTR_CTIME; iattr.ia_mode = mode; - iattr.ia_ctime = current_fs_time(inode->i_sb); + iattr.ia_ctime = current_time(inode); error = xfs_setattr_nonsize(XFS_I(inode), &iattr, XFS_ATTR_NOACL); } @@ -257,16 +257,11 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) return error; if (type == ACL_TYPE_ACCESS) { - umode_t mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); - - if (error <= 0) { - acl = NULL; - - if (error < 0) - return error; - } + umode_t mode; + error = posix_acl_update_mode(inode, &mode, &acl); + if (error) + return error; error = xfs_set_mode(inode, mode); if (error) return error; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 7575cfc..3e57a56 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -31,6 +31,7 @@ #include "xfs_bmap.h" #include "xfs_bmap_util.h" #include "xfs_bmap_btree.h" +#include "xfs_reflink.h" #include <linux/gfp.h> #include <linux/mpage.h> #include <linux/pagevec.h> @@ -39,6 +40,7 @@ /* flags for direct write completions */ #define XFS_DIO_FLAG_UNWRITTEN (1 << 0) #define XFS_DIO_FLAG_APPEND (1 << 1) +#define XFS_DIO_FLAG_COW (1 << 2) /* * structure owned by writepages passed to individual writepage calls @@ -200,7 +202,7 @@ xfs_setfilesize_trans_alloc( * Update on-disk file size now that data has been written to disk. */ STATIC int -xfs_setfilesize( +__xfs_setfilesize( struct xfs_inode *ip, struct xfs_trans *tp, xfs_off_t offset, @@ -225,6 +227,23 @@ xfs_setfilesize( return xfs_trans_commit(tp); } +int +xfs_setfilesize( + struct xfs_inode *ip, + xfs_off_t offset, + size_t size) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp); + if (error) + return error; + + return __xfs_setfilesize(ip, tp, offset, size); +} + STATIC int xfs_setfilesize_ioend( struct xfs_ioend *ioend, @@ -247,7 +266,7 @@ xfs_setfilesize_ioend( return error; } - return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size); + return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size); } /* @@ -270,6 +289,25 @@ xfs_end_io( error = -EIO; /* + * For a CoW extent, we need to move the mapping from the CoW fork + * to the data fork. If instead an error happened, just dump the + * new blocks. + */ + if (ioend->io_type == XFS_IO_COW) { + if (error) + goto done; + if (ioend->io_bio->bi_error) { + error = xfs_reflink_cancel_cow_range(ip, + ioend->io_offset, ioend->io_size); + goto done; + } + error = xfs_reflink_end_cow(ip, ioend->io_offset, + ioend->io_size); + if (error) + goto done; + } + + /* * For unwritten extents we need to issue transactions to convert a * range to normal written extens after the data I/O has finished. * Detecting and handling completion IO errors is done individually @@ -284,7 +322,8 @@ xfs_end_io( } else if (ioend->io_append_trans) { error = xfs_setfilesize_ioend(ioend, error); } else { - ASSERT(!xfs_ioend_is_append(ioend)); + ASSERT(!xfs_ioend_is_append(ioend) || + ioend->io_type == XFS_IO_COW); } done: @@ -298,7 +337,7 @@ xfs_end_bio( struct xfs_ioend *ioend = bio->bi_private; struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; - if (ioend->io_type == XFS_IO_UNWRITTEN) + if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW) queue_work(mp->m_unwritten_workqueue, &ioend->io_work); else if (ioend->io_append_trans) queue_work(mp->m_data_workqueue, &ioend->io_work); @@ -324,6 +363,7 @@ xfs_map_blocks( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; + ASSERT(type != XFS_IO_COW); if (type == XFS_IO_UNWRITTEN) bmapi_flags |= XFS_BMAPI_IGSTATE; @@ -338,6 +378,13 @@ xfs_map_blocks( offset_fsb = XFS_B_TO_FSBT(mp, offset); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, imap, &nimaps, bmapi_flags); + /* + * Truncate an overwrite extent if there's a pending CoW + * reservation before the end of this extent. This forces us + * to come back to writepage to take care of the CoW. + */ + if (nimaps && type == XFS_IO_OVERWRITE) + xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, imap); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (error) @@ -345,7 +392,8 @@ xfs_map_blocks( if (type == XFS_IO_DELALLOC && (!nimaps || isnullstartblock(imap->br_startblock))) { - error = xfs_iomap_write_allocate(ip, offset, imap); + error = xfs_iomap_write_allocate(ip, XFS_DATA_FORK, offset, + imap); if (!error) trace_xfs_map_blocks_alloc(ip, offset, count, type, imap); return error; @@ -720,6 +768,56 @@ out_invalidate: return; } +static int +xfs_map_cow( + struct xfs_writepage_ctx *wpc, + struct inode *inode, + loff_t offset, + unsigned int *new_type) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_bmbt_irec imap; + bool is_cow = false, need_alloc = false; + int error; + + /* + * If we already have a valid COW mapping keep using it. + */ + if (wpc->io_type == XFS_IO_COW) { + wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset); + if (wpc->imap_valid) { + *new_type = XFS_IO_COW; + return 0; + } + } + + /* + * Else we need to check if there is a COW mapping at this offset. + */ + xfs_ilock(ip, XFS_ILOCK_SHARED); + is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, &need_alloc); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!is_cow) + return 0; + + /* + * And if the COW mapping has a delayed extent here we need to + * allocate real space for it now. + */ + if (need_alloc) { + error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset, + &imap); + if (error) + return error; + } + + wpc->io_type = *new_type = XFS_IO_COW; + wpc->imap_valid = true; + wpc->imap = imap; + return 0; +} + /* * We implement an immediate ioend submission policy here to avoid needing to * chain multiple ioends and hence nest mempool allocations which can violate @@ -752,6 +850,7 @@ xfs_writepage_map( int error = 0; int count = 0; int uptodate = 1; + unsigned int new_type; bh = head = page_buffers(page); offset = page_offset(page); @@ -772,22 +871,13 @@ xfs_writepage_map( continue; } - if (buffer_unwritten(bh)) { - if (wpc->io_type != XFS_IO_UNWRITTEN) { - wpc->io_type = XFS_IO_UNWRITTEN; - wpc->imap_valid = false; - } - } else if (buffer_delay(bh)) { - if (wpc->io_type != XFS_IO_DELALLOC) { - wpc->io_type = XFS_IO_DELALLOC; - wpc->imap_valid = false; - } - } else if (buffer_uptodate(bh)) { - if (wpc->io_type != XFS_IO_OVERWRITE) { - wpc->io_type = XFS_IO_OVERWRITE; - wpc->imap_valid = false; - } - } else { + if (buffer_unwritten(bh)) + new_type = XFS_IO_UNWRITTEN; + else if (buffer_delay(bh)) + new_type = XFS_IO_DELALLOC; + else if (buffer_uptodate(bh)) + new_type = XFS_IO_OVERWRITE; + else { if (PageUptodate(page)) ASSERT(buffer_mapped(bh)); /* @@ -800,6 +890,17 @@ xfs_writepage_map( continue; } + if (xfs_is_reflink_inode(XFS_I(inode))) { + error = xfs_map_cow(wpc, inode, offset, &new_type); + if (error) + goto out; + } + + if (wpc->io_type != new_type) { + wpc->io_type = new_type; + wpc->imap_valid = false; + } + if (wpc->imap_valid) wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset); @@ -1090,18 +1191,24 @@ xfs_map_direct( struct inode *inode, struct buffer_head *bh_result, struct xfs_bmbt_irec *imap, - xfs_off_t offset) + xfs_off_t offset, + bool is_cow) { uintptr_t *flags = (uintptr_t *)&bh_result->b_private; xfs_off_t size = bh_result->b_size; trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size, - ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, imap); + ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : is_cow ? XFS_IO_COW : + XFS_IO_OVERWRITE, imap); if (ISUNWRITTEN(imap)) { *flags |= XFS_DIO_FLAG_UNWRITTEN; set_buffer_defer_completion(bh_result); - } else if (offset + size > i_size_read(inode) || offset + size < 0) { + } else if (is_cow) { + *flags |= XFS_DIO_FLAG_COW; + set_buffer_defer_completion(bh_result); + } + if (offset + size > i_size_read(inode) || offset + size < 0) { *flags |= XFS_DIO_FLAG_APPEND; set_buffer_defer_completion(bh_result); } @@ -1147,6 +1254,44 @@ xfs_map_trim_size( bh_result->b_size = mapping_size; } +/* Bounce unaligned directio writes to the page cache. */ +static int +xfs_bounce_unaligned_dio_write( + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, + struct xfs_bmbt_irec *imap) +{ + struct xfs_bmbt_irec irec; + xfs_fileoff_t delta; + bool shared; + bool x; + int error; + + irec = *imap; + if (offset_fsb > irec.br_startoff) { + delta = offset_fsb - irec.br_startoff; + irec.br_blockcount -= delta; + irec.br_startblock += delta; + irec.br_startoff = offset_fsb; + } + error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x); + if (error) + return error; + + /* + * We're here because we're trying to do a directio write to a + * region that isn't aligned to a filesystem block. If any part + * of the extent is shared, fall back to buffered mode to handle + * the RMW. This is done by returning -EREMCHG ("remote addr + * changed"), which is caught further up the call stack. + */ + if (shared) { + trace_xfs_reflink_bounce_dio_write(ip, imap); + return -EREMCHG; + } + return 0; +} + STATIC int __xfs_get_blocks( struct inode *inode, @@ -1166,6 +1311,8 @@ __xfs_get_blocks( xfs_off_t offset; ssize_t size; int new = 0; + bool is_cow = false; + bool need_alloc = false; BUG_ON(create && !direct); @@ -1191,8 +1338,26 @@ __xfs_get_blocks( end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); offset_fsb = XFS_B_TO_FSBT(mp, offset); - error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, - &imap, &nimaps, XFS_BMAPI_ENTIRE); + if (create && direct && xfs_is_reflink_inode(ip)) + is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, + &need_alloc); + if (!is_cow) { + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + &imap, &nimaps, XFS_BMAPI_ENTIRE); + /* + * Truncate an overwrite extent if there's a pending CoW + * reservation before the end of this extent. This + * forces us to come back to get_blocks to take care of + * the CoW. + */ + if (create && direct && nimaps && + imap.br_startblock != HOLESTARTBLOCK && + imap.br_startblock != DELAYSTARTBLOCK && + !ISUNWRITTEN(&imap)) + xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, + &imap); + } + ASSERT(!need_alloc); if (error) goto out_unlock; @@ -1244,6 +1409,13 @@ __xfs_get_blocks( if (imap.br_startblock != HOLESTARTBLOCK && imap.br_startblock != DELAYSTARTBLOCK && (create || !ISUNWRITTEN(&imap))) { + if (create && direct && !is_cow) { + error = xfs_bounce_unaligned_dio_write(ip, offset_fsb, + &imap); + if (error) + return error; + } + xfs_map_buffer(inode, bh_result, &imap, offset); if (ISUNWRITTEN(&imap)) set_buffer_unwritten(bh_result); @@ -1252,7 +1424,8 @@ __xfs_get_blocks( if (dax_fault) ASSERT(!ISUNWRITTEN(&imap)); else - xfs_map_direct(inode, bh_result, &imap, offset); + xfs_map_direct(inode, bh_result, &imap, offset, + is_cow); } } @@ -1336,13 +1509,12 @@ xfs_end_io_direct_write( { struct inode *inode = file_inode(iocb->ki_filp); struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; uintptr_t flags = (uintptr_t)private; int error = 0; trace_xfs_end_io_direct_write(ip, offset, size); - if (XFS_FORCED_SHUTDOWN(mp)) + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; if (size <= 0) @@ -1375,19 +1547,17 @@ xfs_end_io_direct_write( i_size_write(inode, offset + size); spin_unlock(&ip->i_flags_lock); + if (flags & XFS_DIO_FLAG_COW) + error = xfs_reflink_end_cow(ip, offset, size); if (flags & XFS_DIO_FLAG_UNWRITTEN) { trace_xfs_end_io_direct_write_unwritten(ip, offset, size); error = xfs_iomap_write_unwritten(ip, offset, size); - } else if (flags & XFS_DIO_FLAG_APPEND) { - struct xfs_trans *tp; - + } + if (flags & XFS_DIO_FLAG_APPEND) { trace_xfs_end_io_direct_write_append(ip, offset, size); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, - &tp); - if (!error) - error = xfs_setfilesize(ip, tp, offset, size); + error = xfs_setfilesize(ip, offset, size); } return error; @@ -1414,6 +1584,17 @@ xfs_vm_bmap( trace_xfs_vm_bmap(XFS_I(inode)); xfs_ilock(ip, XFS_IOLOCK_SHARED); + + /* + * The swap code (ab-)uses ->bmap to get a block mapping and then + * bypasseѕ the file system for actual I/O. We really can't allow + * that on reflinks inodes, so we have to skip out here. And yes, + * 0 is the magic code for a bmap error.. + */ + if (xfs_is_reflink_inode(ip)) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return 0; + } filemap_write_and_wait(mapping); xfs_iunlock(ip, XFS_IOLOCK_SHARED); return generic_block_bmap(mapping, block, xfs_get_blocks); diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index bf2d9a1..b3c6634 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -28,13 +28,15 @@ enum { XFS_IO_DELALLOC, /* covers delalloc region */ XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ XFS_IO_OVERWRITE, /* covers already allocated extent */ + XFS_IO_COW, /* covers copy-on-write extent */ }; #define XFS_IO_TYPES \ { XFS_IO_INVALID, "invalid" }, \ { XFS_IO_DELALLOC, "delalloc" }, \ { XFS_IO_UNWRITTEN, "unwritten" }, \ - { XFS_IO_OVERWRITE, "overwrite" } + { XFS_IO_OVERWRITE, "overwrite" }, \ + { XFS_IO_COW, "CoW" } /* * Structure for buffered I/O completions. @@ -62,6 +64,7 @@ int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset, int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset, ssize_t size, void *private); +int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); extern void xfs_count_page_state(struct page *, int *, int *); extern struct block_device *xfs_find_bdev_for_inode(struct inode *); diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c new file mode 100644 index 0000000..9bf57c7 --- /dev/null +++ b/fs/xfs/xfs_bmap_item.c @@ -0,0 +1,508 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_buf_item.h" +#include "xfs_bmap_item.h" +#include "xfs_log.h" +#include "xfs_bmap.h" +#include "xfs_icache.h" +#include "xfs_trace.h" + + +kmem_zone_t *xfs_bui_zone; +kmem_zone_t *xfs_bud_zone; + +static inline struct xfs_bui_log_item *BUI_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_bui_log_item, bui_item); +} + +void +xfs_bui_item_free( + struct xfs_bui_log_item *buip) +{ + kmem_zone_free(xfs_bui_zone, buip); +} + +STATIC void +xfs_bui_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + struct xfs_bui_log_item *buip = BUI_ITEM(lip); + + *nvecs += 1; + *nbytes += xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given bui log item. We use only 1 iovec, and we point that + * at the bui_log_format structure embedded in the bui item. + * It is at this point that we assert that all of the extent + * slots in the bui item have been filled. + */ +STATIC void +xfs_bui_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_bui_log_item *buip = BUI_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + ASSERT(atomic_read(&buip->bui_next_extent) == + buip->bui_format.bui_nextents); + + buip->bui_format.bui_type = XFS_LI_BUI; + buip->bui_format.bui_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_BUI_FORMAT, &buip->bui_format, + xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents)); +} + +/* + * Pinning has no meaning for an bui item, so just return. + */ +STATIC void +xfs_bui_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * The unpin operation is the last place an BUI is manipulated in the log. It is + * either inserted in the AIL or aborted in the event of a log I/O error. In + * either case, the BUI transaction has been successfully committed to make it + * this far. Therefore, we expect whoever committed the BUI to either construct + * and commit the BUD or drop the BUD's reference in the event of error. Simply + * drop the log's BUI reference now that the log is done with it. + */ +STATIC void +xfs_bui_item_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_bui_log_item *buip = BUI_ITEM(lip); + + xfs_bui_release(buip); +} + +/* + * BUI items have no locking or pushing. However, since BUIs are pulled from + * the AIL when their corresponding BUDs are committed to disk, their situation + * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller + * will eventually flush the log. This should help in getting the BUI out of + * the AIL. + */ +STATIC uint +xfs_bui_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + return XFS_ITEM_PINNED; +} + +/* + * The BUI has been either committed or aborted if the transaction has been + * cancelled. If the transaction was cancelled, an BUD isn't going to be + * constructed and thus we free the BUI here directly. + */ +STATIC void +xfs_bui_item_unlock( + struct xfs_log_item *lip) +{ + if (lip->li_flags & XFS_LI_ABORTED) + xfs_bui_item_free(BUI_ITEM(lip)); +} + +/* + * The BUI is logged only once and cannot be moved in the log, so simply return + * the lsn at which it's been logged. + */ +STATIC xfs_lsn_t +xfs_bui_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + return lsn; +} + +/* + * The BUI dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_bui_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all bui log items. + */ +static const struct xfs_item_ops xfs_bui_item_ops = { + .iop_size = xfs_bui_item_size, + .iop_format = xfs_bui_item_format, + .iop_pin = xfs_bui_item_pin, + .iop_unpin = xfs_bui_item_unpin, + .iop_unlock = xfs_bui_item_unlock, + .iop_committed = xfs_bui_item_committed, + .iop_push = xfs_bui_item_push, + .iop_committing = xfs_bui_item_committing, +}; + +/* + * Allocate and initialize an bui item with the given number of extents. + */ +struct xfs_bui_log_item * +xfs_bui_init( + struct xfs_mount *mp) + +{ + struct xfs_bui_log_item *buip; + + buip = kmem_zone_zalloc(xfs_bui_zone, KM_SLEEP); + + xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops); + buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS; + buip->bui_format.bui_id = (uintptr_t)(void *)buip; + atomic_set(&buip->bui_next_extent, 0); + atomic_set(&buip->bui_refcount, 2); + + return buip; +} + +/* + * Freeing the BUI requires that we remove it from the AIL if it has already + * been placed there. However, the BUI may not yet have been placed in the AIL + * when called by xfs_bui_release() from BUD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the BUI. + */ +void +xfs_bui_release( + struct xfs_bui_log_item *buip) +{ + if (atomic_dec_and_test(&buip->bui_refcount)) { + xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_bui_item_free(buip); + } +} + +static inline struct xfs_bud_log_item *BUD_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_bud_log_item, bud_item); +} + +STATIC void +xfs_bud_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += sizeof(struct xfs_bud_log_format); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given bud log item. We use only 1 iovec, and we point that + * at the bud_log_format structure embedded in the bud item. + * It is at this point that we assert that all of the extent + * slots in the bud item have been filled. + */ +STATIC void +xfs_bud_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_bud_log_item *budp = BUD_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + budp->bud_format.bud_type = XFS_LI_BUD; + budp->bud_format.bud_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_BUD_FORMAT, &budp->bud_format, + sizeof(struct xfs_bud_log_format)); +} + +/* + * Pinning has no meaning for an bud item, so just return. + */ +STATIC void +xfs_bud_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * Since pinning has no meaning for an bud item, unpinning does + * not either. + */ +STATIC void +xfs_bud_item_unpin( + struct xfs_log_item *lip, + int remove) +{ +} + +/* + * There isn't much you can do to push on an bud item. It is simply stuck + * waiting for the log to be flushed to disk. + */ +STATIC uint +xfs_bud_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + return XFS_ITEM_PINNED; +} + +/* + * The BUD is either committed or aborted if the transaction is cancelled. If + * the transaction is cancelled, drop our reference to the BUI and free the + * BUD. + */ +STATIC void +xfs_bud_item_unlock( + struct xfs_log_item *lip) +{ + struct xfs_bud_log_item *budp = BUD_ITEM(lip); + + if (lip->li_flags & XFS_LI_ABORTED) { + xfs_bui_release(budp->bud_buip); + kmem_zone_free(xfs_bud_zone, budp); + } +} + +/* + * When the bud item is committed to disk, all we need to do is delete our + * reference to our partner bui item and then free ourselves. Since we're + * freeing ourselves we must return -1 to keep the transaction code from + * further referencing this item. + */ +STATIC xfs_lsn_t +xfs_bud_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_bud_log_item *budp = BUD_ITEM(lip); + + /* + * Drop the BUI reference regardless of whether the BUD has been + * aborted. Once the BUD transaction is constructed, it is the sole + * responsibility of the BUD to release the BUI (even if the BUI is + * aborted due to log I/O error). + */ + xfs_bui_release(budp->bud_buip); + kmem_zone_free(xfs_bud_zone, budp); + + return (xfs_lsn_t)-1; +} + +/* + * The BUD dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_bud_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all bud log items. + */ +static const struct xfs_item_ops xfs_bud_item_ops = { + .iop_size = xfs_bud_item_size, + .iop_format = xfs_bud_item_format, + .iop_pin = xfs_bud_item_pin, + .iop_unpin = xfs_bud_item_unpin, + .iop_unlock = xfs_bud_item_unlock, + .iop_committed = xfs_bud_item_committed, + .iop_push = xfs_bud_item_push, + .iop_committing = xfs_bud_item_committing, +}; + +/* + * Allocate and initialize an bud item with the given number of extents. + */ +struct xfs_bud_log_item * +xfs_bud_init( + struct xfs_mount *mp, + struct xfs_bui_log_item *buip) + +{ + struct xfs_bud_log_item *budp; + + budp = kmem_zone_zalloc(xfs_bud_zone, KM_SLEEP); + xfs_log_item_init(mp, &budp->bud_item, XFS_LI_BUD, &xfs_bud_item_ops); + budp->bud_buip = buip; + budp->bud_format.bud_bui_id = buip->bui_format.bui_id; + + return budp; +} + +/* + * Process a bmap update intent item that was recovered from the log. + * We need to update some inode's bmbt. + */ +int +xfs_bui_recover( + struct xfs_mount *mp, + struct xfs_bui_log_item *buip) +{ + int error = 0; + unsigned int bui_type; + struct xfs_map_extent *bmap; + xfs_fsblock_t startblock_fsb; + xfs_fsblock_t inode_fsb; + bool op_ok; + struct xfs_bud_log_item *budp; + enum xfs_bmap_intent_type type; + int whichfork; + xfs_exntst_t state; + struct xfs_trans *tp; + struct xfs_inode *ip = NULL; + struct xfs_defer_ops dfops; + xfs_fsblock_t firstfsb; + + ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)); + + /* Only one mapping operation per BUI... */ + if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { + set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); + xfs_bui_release(buip); + return -EIO; + } + + /* + * First check the validity of the extent described by the + * BUI. If anything is bad, then toss the BUI. + */ + bmap = &buip->bui_format.bui_extents[0]; + startblock_fsb = XFS_BB_TO_FSB(mp, + XFS_FSB_TO_DADDR(mp, bmap->me_startblock)); + inode_fsb = XFS_BB_TO_FSB(mp, XFS_FSB_TO_DADDR(mp, + XFS_INO_TO_FSB(mp, bmap->me_owner))); + switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) { + case XFS_BMAP_MAP: + case XFS_BMAP_UNMAP: + op_ok = true; + break; + default: + op_ok = false; + break; + } + if (!op_ok || startblock_fsb == 0 || + bmap->me_len == 0 || + inode_fsb == 0 || + startblock_fsb >= mp->m_sb.sb_dblocks || + bmap->me_len >= mp->m_sb.sb_agblocks || + inode_fsb >= mp->m_sb.sb_dblocks || + (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS)) { + /* + * This will pull the BUI from the AIL and + * free the memory associated with it. + */ + set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); + xfs_bui_release(buip); + return -EIO; + } + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + if (error) + return error; + budp = xfs_trans_get_bud(tp, buip); + + /* Grab the inode. */ + error = xfs_iget(mp, tp, bmap->me_owner, 0, XFS_ILOCK_EXCL, &ip); + if (error) + goto err_inode; + + if (VFS_I(ip)->i_nlink == 0) + xfs_iflags_set(ip, XFS_IRECOVERY); + xfs_defer_init(&dfops, &firstfsb); + + /* Process deferred bmap item. */ + state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? + XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; + switch (bui_type) { + case XFS_BMAP_MAP: + case XFS_BMAP_UNMAP: + type = bui_type; + break; + default: + error = -EFSCORRUPTED; + goto err_dfops; + } + xfs_trans_ijoin(tp, ip, 0); + + error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type, + ip, whichfork, bmap->me_startoff, + bmap->me_startblock, bmap->me_len, + state); + if (error) + goto err_dfops; + + /* Finish transaction, free inodes. */ + error = xfs_defer_finish(&tp, &dfops, NULL); + if (error) + goto err_dfops; + + set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + IRELE(ip); + + return error; + +err_dfops: + xfs_defer_cancel(&dfops); +err_inode: + xfs_trans_cancel(tp); + if (ip) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + IRELE(ip); + } + return error; +} diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h new file mode 100644 index 0000000..c867daa --- /dev/null +++ b/fs/xfs/xfs_bmap_item.h @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_BMAP_ITEM_H__ +#define __XFS_BMAP_ITEM_H__ + +/* + * There are (currently) two pairs of bmap btree redo item types: map & unmap. + * The common abbreviations for these are BUI (bmap update intent) and BUD + * (bmap update done). The redo item type is encoded in the flags field of + * each xfs_map_extent. + * + * *I items should be recorded in the *first* of a series of rolled + * transactions, and the *D items should be recorded in the same transaction + * that records the associated bmbt updates. + * + * Should the system crash after the commit of the first transaction but + * before the commit of the final transaction in a series, log recovery will + * use the redo information recorded by the intent items to replay the + * bmbt metadata updates in the non-first transaction. + */ + +/* kernel only BUI/BUD definitions */ + +struct xfs_mount; +struct kmem_zone; + +/* + * Max number of extents in fast allocation path. + */ +#define XFS_BUI_MAX_FAST_EXTENTS 1 + +/* + * Define BUI flag bits. Manipulated by set/clear/test_bit operators. + */ +#define XFS_BUI_RECOVERED 1 + +/* + * This is the "bmap update intent" log item. It is used to log the fact that + * some reverse mappings need to change. It is used in conjunction with the + * "bmap update done" log item described below. + * + * These log items follow the same rules as struct xfs_efi_log_item; see the + * comments about that structure (in xfs_extfree_item.h) for more details. + */ +struct xfs_bui_log_item { + struct xfs_log_item bui_item; + atomic_t bui_refcount; + atomic_t bui_next_extent; + unsigned long bui_flags; /* misc flags */ + struct xfs_bui_log_format bui_format; +}; + +static inline size_t +xfs_bui_log_item_sizeof( + unsigned int nr) +{ + return offsetof(struct xfs_bui_log_item, bui_format) + + xfs_bui_log_format_sizeof(nr); +} + +/* + * This is the "bmap update done" log item. It is used to log the fact that + * some bmbt updates mentioned in an earlier bui item have been performed. + */ +struct xfs_bud_log_item { + struct xfs_log_item bud_item; + struct xfs_bui_log_item *bud_buip; + struct xfs_bud_log_format bud_format; +}; + +extern struct kmem_zone *xfs_bui_zone; +extern struct kmem_zone *xfs_bud_zone; + +struct xfs_bui_log_item *xfs_bui_init(struct xfs_mount *); +struct xfs_bud_log_item *xfs_bud_init(struct xfs_mount *, + struct xfs_bui_log_item *); +void xfs_bui_item_free(struct xfs_bui_log_item *); +void xfs_bui_release(struct xfs_bui_log_item *); +int xfs_bui_recover(struct xfs_mount *mp, struct xfs_bui_log_item *buip); + +#endif /* __XFS_BMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 4ece4f2..552465e 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -42,6 +42,9 @@ #include "xfs_icache.h" #include "xfs_log.h" #include "xfs_rmap_btree.h" +#include "xfs_iomap.h" +#include "xfs_reflink.h" +#include "xfs_refcount.h" /* Kernel only BMAP related definitions and functions */ @@ -182,7 +185,7 @@ xfs_bmap_rtalloc( XFS_TRANS_DQ_RTBCOUNT, (long) ralen); /* Zero the extent if we were asked to do so */ - if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) { + if (ap->datatype & XFS_ALLOC_USERDATA_ZERO) { error = xfs_zero_extent(ap->ip, ap->blkno, ap->length); if (error) return error; @@ -389,11 +392,13 @@ xfs_bmap_count_blocks( STATIC int xfs_getbmapx_fix_eof_hole( xfs_inode_t *ip, /* xfs incore inode pointer */ + int whichfork, struct getbmapx *out, /* output structure */ int prealloced, /* this is a file with * preallocated data space */ __int64_t end, /* last block requested */ - xfs_fsblock_t startblock) + xfs_fsblock_t startblock, + bool moretocome) { __int64_t fixlen; xfs_mount_t *mp; /* file system mount point */ @@ -418,8 +423,9 @@ xfs_getbmapx_fix_eof_hole( else out->bmv_block = xfs_fsb_to_db(ip, startblock); fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset); - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!moretocome && + xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1)) out->bmv_oflags |= BMV_OF_LAST; } @@ -427,6 +433,81 @@ xfs_getbmapx_fix_eof_hole( return 1; } +/* Adjust the reported bmap around shared/unshared extent transitions. */ +STATIC int +xfs_getbmap_adjust_shared( + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *map, + struct getbmapx *out, + struct xfs_bmbt_irec *next_map) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_agblock_t ebno; + xfs_extlen_t elen; + xfs_extlen_t nlen; + int error; + + next_map->br_startblock = NULLFSBLOCK; + next_map->br_startoff = NULLFILEOFF; + next_map->br_blockcount = 0; + + /* Only written data blocks can be shared. */ + if (!xfs_is_reflink_inode(ip) || whichfork != XFS_DATA_FORK || + map->br_startblock == DELAYSTARTBLOCK || + map->br_startblock == HOLESTARTBLOCK || + ISUNWRITTEN(map)) + return 0; + + agno = XFS_FSB_TO_AGNO(mp, map->br_startblock); + agbno = XFS_FSB_TO_AGBNO(mp, map->br_startblock); + error = xfs_reflink_find_shared(mp, agno, agbno, map->br_blockcount, + &ebno, &elen, true); + if (error) + return error; + + if (ebno == NULLAGBLOCK) { + /* No shared blocks at all. */ + return 0; + } else if (agbno == ebno) { + /* + * Shared extent at (agbno, elen). Shrink the reported + * extent length and prepare to move the start of map[i] + * to agbno+elen, with the aim of (re)formatting the new + * map[i] the next time through the inner loop. + */ + out->bmv_length = XFS_FSB_TO_BB(mp, elen); + out->bmv_oflags |= BMV_OF_SHARED; + if (elen != map->br_blockcount) { + *next_map = *map; + next_map->br_startblock += elen; + next_map->br_startoff += elen; + next_map->br_blockcount -= elen; + } + map->br_blockcount -= elen; + } else { + /* + * There's an unshared extent (agbno, ebno - agbno) + * followed by shared extent at (ebno, elen). Shrink + * the reported extent length to cover only the unshared + * extent and prepare to move up the start of map[i] to + * ebno, with the aim of (re)formatting the new map[i] + * the next time through the inner loop. + */ + *next_map = *map; + nlen = ebno - agbno; + out->bmv_length = XFS_FSB_TO_BB(mp, nlen); + next_map->br_startblock += nlen; + next_map->br_startoff += nlen; + next_map->br_blockcount -= nlen; + map->br_blockcount -= nlen; + } + + return 0; +} + /* * Get inode's extents as described in bmv, and format for output. * Calls formatter to fill the user's buffer until all extents @@ -459,12 +540,28 @@ xfs_getbmap( int iflags; /* interface flags */ int bmapi_flags; /* flags for xfs_bmapi */ int cur_ext = 0; + struct xfs_bmbt_irec inject_map; mp = ip->i_mount; iflags = bmv->bmv_iflags; - whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK; - if (whichfork == XFS_ATTR_FORK) { +#ifndef DEBUG + /* Only allow CoW fork queries if we're debugging. */ + if (iflags & BMV_IF_COWFORK) + return -EINVAL; +#endif + if ((iflags & BMV_IF_ATTRFORK) && (iflags & BMV_IF_COWFORK)) + return -EINVAL; + + if (iflags & BMV_IF_ATTRFORK) + whichfork = XFS_ATTR_FORK; + else if (iflags & BMV_IF_COWFORK) + whichfork = XFS_COW_FORK; + else + whichfork = XFS_DATA_FORK; + + switch (whichfork) { + case XFS_ATTR_FORK: if (XFS_IFORK_Q(ip)) { if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS && ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE && @@ -480,7 +577,20 @@ xfs_getbmap( prealloced = 0; fixlen = 1LL << 32; - } else { + break; + case XFS_COW_FORK: + if (ip->i_cformat != XFS_DINODE_FMT_EXTENTS) + return -EINVAL; + + if (xfs_get_cowextsz_hint(ip)) { + prealloced = 1; + fixlen = mp->m_super->s_maxbytes; + } else { + prealloced = 0; + fixlen = XFS_ISIZE(ip); + } + break; + default: if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && ip->i_d.di_format != XFS_DINODE_FMT_BTREE && ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) @@ -494,6 +604,7 @@ xfs_getbmap( prealloced = 0; fixlen = XFS_ISIZE(ip); } + break; } if (bmv->bmv_length == -1) { @@ -520,7 +631,8 @@ xfs_getbmap( return -ENOMEM; xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (whichfork == XFS_DATA_FORK) { + switch (whichfork) { + case XFS_DATA_FORK: if (!(iflags & BMV_IF_DELALLOC) && (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) { error = filemap_write_and_wait(VFS_I(ip)->i_mapping); @@ -538,8 +650,14 @@ xfs_getbmap( } lock = xfs_ilock_data_map_shared(ip); - } else { + break; + case XFS_COW_FORK: + lock = XFS_ILOCK_SHARED; + xfs_ilock(ip, lock); + break; + case XFS_ATTR_FORK: lock = xfs_ilock_attr_map_shared(ip); + break; } /* @@ -581,7 +699,8 @@ xfs_getbmap( goto out_free_map; ASSERT(nmap <= subnex); - for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) { + for (i = 0; i < nmap && nexleft && bmv->bmv_length && + cur_ext < bmv->bmv_count; i++) { out[cur_ext].bmv_oflags = 0; if (map[i].br_state == XFS_EXT_UNWRITTEN) out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC; @@ -614,9 +733,16 @@ xfs_getbmap( goto out_free_map; } - if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext], - prealloced, bmvend, - map[i].br_startblock)) + /* Is this a shared block? */ + error = xfs_getbmap_adjust_shared(ip, whichfork, + &map[i], &out[cur_ext], &inject_map); + if (error) + goto out_free_map; + + if (!xfs_getbmapx_fix_eof_hole(ip, whichfork, + &out[cur_ext], prealloced, bmvend, + map[i].br_startblock, + inject_map.br_startblock != NULLFSBLOCK)) goto out_free_map; bmv->bmv_offset = @@ -636,11 +762,16 @@ xfs_getbmap( continue; } - nexleft--; + if (inject_map.br_startblock != NULLFSBLOCK) { + map[i] = inject_map; + i--; + } else + nexleft--; bmv->bmv_entries++; cur_ext++; } - } while (nmap && nexleft && bmv->bmv_length); + } while (nmap && nexleft && bmv->bmv_length && + cur_ext < bmv->bmv_count); out_free_map: kmem_free(map); @@ -1433,8 +1564,8 @@ xfs_insert_file_space( */ static int xfs_swap_extents_check_format( - xfs_inode_t *ip, /* target inode */ - xfs_inode_t *tip) /* tmp inode */ + struct xfs_inode *ip, /* target inode */ + struct xfs_inode *tip) /* tmp inode */ { /* Should never get a local format */ @@ -1450,6 +1581,13 @@ xfs_swap_extents_check_format( return -EINVAL; /* + * If we have to use the (expensive) rmap swap method, we can + * handle any number of extents and any format. + */ + if (xfs_sb_version_hasrmapbt(&ip->i_mount->m_sb)) + return 0; + + /* * if the target inode is in extent form and the temp inode is in btree * form then we will end up with the target inode in the wrong format * as we already know there are less extents in the temp inode. @@ -1518,125 +1656,161 @@ xfs_swap_extent_flush( return 0; } -int -xfs_swap_extents( - xfs_inode_t *ip, /* target inode */ - xfs_inode_t *tip, /* tmp inode */ - xfs_swapext_t *sxp) +/* + * Move extents from one file to another, when rmap is enabled. + */ +STATIC int +xfs_swap_extent_rmap( + struct xfs_trans **tpp, + struct xfs_inode *ip, + struct xfs_inode *tip) { - xfs_mount_t *mp = ip->i_mount; - xfs_trans_t *tp; - xfs_bstat_t *sbp = &sxp->sx_stat; - xfs_ifork_t *tempifp, *ifp, *tifp; - int src_log_flags, target_log_flags; - int error = 0; - int aforkblks = 0; - int taforkblks = 0; - __uint64_t tmp; - int lock_flags; - - /* XXX: we can't do this with rmap, will fix later */ - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) - return -EOPNOTSUPP; - - tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); - if (!tempifp) { - error = -ENOMEM; - goto out; - } + struct xfs_bmbt_irec irec; + struct xfs_bmbt_irec uirec; + struct xfs_bmbt_irec tirec; + xfs_fileoff_t offset_fsb; + xfs_fileoff_t end_fsb; + xfs_filblks_t count_fsb; + xfs_fsblock_t firstfsb; + struct xfs_defer_ops dfops; + int error; + xfs_filblks_t ilen; + xfs_filblks_t rlen; + int nimaps; + __uint64_t tip_flags2; /* - * Lock the inodes against other IO, page faults and truncate to - * begin with. Then we can ensure the inodes are flushed and have no - * page cache safely. Once we have done this we can take the ilocks and - * do the rest of the checks. + * If the source file has shared blocks, we must flag the donor + * file as having shared blocks so that we get the shared-block + * rmap functions when we go to fix up the rmaps. The flags + * will be switch for reals later. */ - lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); - xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL); - - /* Verify that both files have the same format */ - if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) { - error = -EINVAL; - goto out_unlock; - } + tip_flags2 = tip->i_d.di_flags2; + if (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK) + tip->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK; + + offset_fsb = 0; + end_fsb = XFS_B_TO_FSB(ip->i_mount, i_size_read(VFS_I(ip))); + count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb); + + while (count_fsb) { + /* Read extent from the donor file */ + nimaps = 1; + error = xfs_bmapi_read(tip, offset_fsb, count_fsb, &tirec, + &nimaps, 0); + if (error) + goto out; + ASSERT(nimaps == 1); + ASSERT(tirec.br_startblock != DELAYSTARTBLOCK); + + trace_xfs_swap_extent_rmap_remap(tip, &tirec); + ilen = tirec.br_blockcount; + + /* Unmap the old blocks in the source file. */ + while (tirec.br_blockcount) { + xfs_defer_init(&dfops, &firstfsb); + trace_xfs_swap_extent_rmap_remap_piece(tip, &tirec); + + /* Read extent from the source file */ + nimaps = 1; + error = xfs_bmapi_read(ip, tirec.br_startoff, + tirec.br_blockcount, &irec, + &nimaps, 0); + if (error) + goto out_defer; + ASSERT(nimaps == 1); + ASSERT(tirec.br_startoff == irec.br_startoff); + trace_xfs_swap_extent_rmap_remap_piece(ip, &irec); + + /* Trim the extent. */ + uirec = tirec; + uirec.br_blockcount = rlen = min_t(xfs_filblks_t, + tirec.br_blockcount, + irec.br_blockcount); + trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec); + + /* Remove the mapping from the donor file. */ + error = xfs_bmap_unmap_extent((*tpp)->t_mountp, &dfops, + tip, &uirec); + if (error) + goto out_defer; - /* Verify both files are either real-time or non-realtime */ - if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { - error = -EINVAL; - goto out_unlock; - } + /* Remove the mapping from the source file. */ + error = xfs_bmap_unmap_extent((*tpp)->t_mountp, &dfops, + ip, &irec); + if (error) + goto out_defer; - error = xfs_swap_extent_flush(ip); - if (error) - goto out_unlock; - error = xfs_swap_extent_flush(tip); - if (error) - goto out_unlock; + /* Map the donor file's blocks into the source file. */ + error = xfs_bmap_map_extent((*tpp)->t_mountp, &dfops, + ip, &uirec); + if (error) + goto out_defer; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); - if (error) - goto out_unlock; + /* Map the source file's blocks into the donor file. */ + error = xfs_bmap_map_extent((*tpp)->t_mountp, &dfops, + tip, &irec); + if (error) + goto out_defer; - /* - * Lock and join the inodes to the tansaction so that transaction commit - * or cancel will unlock the inodes from this point onwards. - */ - xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); - lock_flags |= XFS_ILOCK_EXCL; - xfs_trans_ijoin(tp, ip, lock_flags); - xfs_trans_ijoin(tp, tip, lock_flags); + error = xfs_defer_finish(tpp, &dfops, ip); + if (error) + goto out_defer; + tirec.br_startoff += rlen; + if (tirec.br_startblock != HOLESTARTBLOCK && + tirec.br_startblock != DELAYSTARTBLOCK) + tirec.br_startblock += rlen; + tirec.br_blockcount -= rlen; + } - /* Verify all data are being swapped */ - if (sxp->sx_offset != 0 || - sxp->sx_length != ip->i_d.di_size || - sxp->sx_length != tip->i_d.di_size) { - error = -EFAULT; - goto out_trans_cancel; + /* Roll on... */ + count_fsb -= ilen; + offset_fsb += ilen; } - trace_xfs_swap_extent_before(ip, 0); - trace_xfs_swap_extent_before(tip, 1); + tip->i_d.di_flags2 = tip_flags2; + return 0; - /* check inode formats now that data is flushed */ - error = xfs_swap_extents_check_format(ip, tip); - if (error) { - xfs_notice(mp, - "%s: inode 0x%llx format is incompatible for exchanging.", - __func__, ip->i_ino); - goto out_trans_cancel; - } +out_defer: + xfs_defer_cancel(&dfops); +out: + trace_xfs_swap_extent_rmap_error(ip, error, _RET_IP_); + tip->i_d.di_flags2 = tip_flags2; + return error; +} + +/* Swap the extents of two files by swapping data forks. */ +STATIC int +xfs_swap_extent_forks( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_inode *tip, + int *src_log_flags, + int *target_log_flags) +{ + struct xfs_ifork tempifp, *ifp, *tifp; + int aforkblks = 0; + int taforkblks = 0; + __uint64_t tmp; + int error; - /* - * Compare the current change & modify times with that - * passed in. If they differ, we abort this swap. - * This is the mechanism used to ensure the calling - * process that the file was not changed out from - * under it. - */ - if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || - (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || - (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || - (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { - error = -EBUSY; - goto out_trans_cancel; - } /* * Count the number of extended attribute blocks */ if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { - error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks); + error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, + &aforkblks); if (error) - goto out_trans_cancel; + return error; } if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, - &taforkblks); + &taforkblks); if (error) - goto out_trans_cancel; + return error; } /* @@ -1645,31 +1819,23 @@ xfs_swap_extents( * buffers, and so the validation done on read will expect the owner * field to be correctly set. Once we change the owners, we can swap the * inode forks. - * - * Note the trickiness in setting the log flags - we set the owner log - * flag on the opposite inode (i.e. the inode we are setting the new - * owner to be) because once we swap the forks and log that, log - * recovery is going to see the fork as owned by the swapped inode, - * not the pre-swapped inodes. */ - src_log_flags = XFS_ILOG_CORE; - target_log_flags = XFS_ILOG_CORE; if (ip->i_d.di_version == 3 && ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { - target_log_flags |= XFS_ILOG_DOWNER; + (*target_log_flags) |= XFS_ILOG_DOWNER; error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, tip->i_ino, NULL); if (error) - goto out_trans_cancel; + return error; } if (tip->i_d.di_version == 3 && tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { - src_log_flags |= XFS_ILOG_DOWNER; + (*src_log_flags) |= XFS_ILOG_DOWNER; error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, ip->i_ino, NULL); if (error) - goto out_trans_cancel; + return error; } /* @@ -1677,9 +1843,9 @@ xfs_swap_extents( */ ifp = &ip->i_df; tifp = &tip->i_df; - *tempifp = *ifp; /* struct copy */ + tempifp = *ifp; /* struct copy */ *ifp = *tifp; /* struct copy */ - *tifp = *tempifp; /* struct copy */ + *tifp = tempifp; /* struct copy */ /* * Fix the on-disk inode values @@ -1719,12 +1885,12 @@ xfs_swap_extents( ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; } - src_log_flags |= XFS_ILOG_DEXT; + (*src_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: ASSERT(ip->i_d.di_version < 3 || - (src_log_flags & XFS_ILOG_DOWNER)); - src_log_flags |= XFS_ILOG_DBROOT; + (*src_log_flags & XFS_ILOG_DOWNER)); + (*src_log_flags) |= XFS_ILOG_DBROOT; break; } @@ -1738,15 +1904,166 @@ xfs_swap_extents( tifp->if_u1.if_extents = tifp->if_u2.if_inline_ext; } - target_log_flags |= XFS_ILOG_DEXT; + (*target_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: - target_log_flags |= XFS_ILOG_DBROOT; + (*target_log_flags) |= XFS_ILOG_DBROOT; ASSERT(tip->i_d.di_version < 3 || - (target_log_flags & XFS_ILOG_DOWNER)); + (*target_log_flags & XFS_ILOG_DOWNER)); break; } + return 0; +} + +int +xfs_swap_extents( + struct xfs_inode *ip, /* target inode */ + struct xfs_inode *tip, /* tmp inode */ + struct xfs_swapext *sxp) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + struct xfs_bstat *sbp = &sxp->sx_stat; + int src_log_flags, target_log_flags; + int error = 0; + int lock_flags; + struct xfs_ifork *cowfp; + __uint64_t f; + int resblks; + + /* + * Lock the inodes against other IO, page faults and truncate to + * begin with. Then we can ensure the inodes are flushed and have no + * page cache safely. Once we have done this we can take the ilocks and + * do the rest of the checks. + */ + lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); + xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL); + + /* Verify that both files have the same format */ + if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) { + error = -EINVAL; + goto out_unlock; + } + + /* Verify both files are either real-time or non-realtime */ + if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { + error = -EINVAL; + goto out_unlock; + } + + error = xfs_swap_extent_flush(ip); + if (error) + goto out_unlock; + error = xfs_swap_extent_flush(tip); + if (error) + goto out_unlock; + + /* + * Extent "swapping" with rmap requires a permanent reservation and + * a block reservation because it's really just a remap operation + * performed with log redo items! + */ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + /* + * Conceptually this shouldn't affect the shape of either + * bmbt, but since we atomically move extents one by one, + * we reserve enough space to rebuild both trees. + */ + resblks = XFS_SWAP_RMAP_SPACE_RES(mp, + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK), + XFS_DATA_FORK) + + XFS_SWAP_RMAP_SPACE_RES(mp, + XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK), + XFS_DATA_FORK); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, + 0, 0, &tp); + } else + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, + 0, 0, &tp); + if (error) + goto out_unlock; + + /* + * Lock and join the inodes to the tansaction so that transaction commit + * or cancel will unlock the inodes from this point onwards. + */ + xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); + lock_flags |= XFS_ILOCK_EXCL; + xfs_trans_ijoin(tp, ip, 0); + xfs_trans_ijoin(tp, tip, 0); + + + /* Verify all data are being swapped */ + if (sxp->sx_offset != 0 || + sxp->sx_length != ip->i_d.di_size || + sxp->sx_length != tip->i_d.di_size) { + error = -EFAULT; + goto out_trans_cancel; + } + + trace_xfs_swap_extent_before(ip, 0); + trace_xfs_swap_extent_before(tip, 1); + + /* check inode formats now that data is flushed */ + error = xfs_swap_extents_check_format(ip, tip); + if (error) { + xfs_notice(mp, + "%s: inode 0x%llx format is incompatible for exchanging.", + __func__, ip->i_ino); + goto out_trans_cancel; + } + + /* + * Compare the current change & modify times with that + * passed in. If they differ, we abort this swap. + * This is the mechanism used to ensure the calling + * process that the file was not changed out from + * under it. + */ + if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || + (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || + (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || + (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { + error = -EBUSY; + goto out_trans_cancel; + } + + /* + * Note the trickiness in setting the log flags - we set the owner log + * flag on the opposite inode (i.e. the inode we are setting the new + * owner to be) because once we swap the forks and log that, log + * recovery is going to see the fork as owned by the swapped inode, + * not the pre-swapped inodes. + */ + src_log_flags = XFS_ILOG_CORE; + target_log_flags = XFS_ILOG_CORE; + + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + error = xfs_swap_extent_rmap(&tp, ip, tip); + else + error = xfs_swap_extent_forks(tp, ip, tip, &src_log_flags, + &target_log_flags); + if (error) + goto out_trans_cancel; + + /* Do we have to swap reflink flags? */ + if ((ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK) ^ + (tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK)) { + f = ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK; + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + ip->i_d.di_flags2 |= tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK; + tip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + tip->i_d.di_flags2 |= f & XFS_DIFLAG2_REFLINK; + cowfp = ip->i_cowfp; + ip->i_cowfp = tip->i_cowfp; + tip->i_cowfp = cowfp; + xfs_inode_set_cowblocks_tag(ip); + xfs_inode_set_cowblocks_tag(tip); + } + xfs_trans_log_inode(tp, ip, src_log_flags); xfs_trans_log_inode(tp, tip, target_log_flags); @@ -1761,16 +2078,16 @@ xfs_swap_extents( trace_xfs_swap_extent_after(ip, 0); trace_xfs_swap_extent_after(tip, 1); -out: - kmem_free(tempifp); - return error; -out_unlock: xfs_iunlock(ip, lock_flags); xfs_iunlock(tip, lock_flags); - goto out; + return error; out_trans_cancel: xfs_trans_cancel(tp); - goto out; + +out_unlock: + xfs_iunlock(ip, lock_flags); + xfs_iunlock(tip, lock_flags); + return error; } diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index e455f90..2975cb2 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -865,7 +865,7 @@ xfs_buf_item_log_segment( */ if (bit) { end_bit = MIN(bit + bits_to_set, (uint)NBWORD); - mask = ((1 << (end_bit - bit)) - 1) << bit; + mask = ((1U << (end_bit - bit)) - 1) << bit; *wordp |= mask; wordp++; bits_set = end_bit - bit; @@ -888,7 +888,7 @@ xfs_buf_item_log_segment( */ end_bit = bits_to_set - bits_set; if (end_bit) { - mask = (1 << end_bit) - 1; + mask = (1U << end_bit) - 1; *wordp |= mask; } } @@ -1095,7 +1095,8 @@ xfs_buf_iodone_callback_error( bp->b_last_error != bp->b_error) { bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL); bp->b_last_error = bp->b_error; - if (cfg->retry_timeout && !bp->b_first_retry_time) + if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && + !bp->b_first_retry_time) bp->b_first_retry_time = jiffies; xfs_buf_ioerror(bp, 0); @@ -1111,7 +1112,7 @@ xfs_buf_iodone_callback_error( if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && ++bp->b_retries > cfg->max_retries) goto permanent_error; - if (cfg->retry_timeout && + if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time)) goto permanent_error; diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index f44f799..2981698 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -84,7 +84,8 @@ xfs_dir2_sf_getdents( sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); + if (dp->i_d.di_size < xfs_dir2_sf_hdr_size(sfp->i8count)) + return -EFSCORRUPTED; /* * If the block number in the offset is out of range, we're done. diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 3d22470..05f8666 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -92,7 +92,11 @@ extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_ERRTAG_BMAPIFORMAT 21 #define XFS_ERRTAG_FREE_EXTENT 22 #define XFS_ERRTAG_RMAP_FINISH_ONE 23 -#define XFS_ERRTAG_MAX 24 +#define XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE 24 +#define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25 +#define XFS_ERRTAG_BMAP_FINISH_ONE 26 +#define XFS_ERRTAG_AG_RESV_CRITICAL 27 +#define XFS_ERRTAG_MAX 28 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -121,6 +125,10 @@ extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT #define XFS_RANDOM_FREE_EXTENT 1 #define XFS_RANDOM_RMAP_FINISH_ONE 1 +#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE 1 +#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1 +#define XFS_RANDOM_BMAP_FINISH_ONE 1 +#define XFS_RANDOM_AG_RESV_CRITICAL 4 #ifdef DEBUG extern int xfs_error_test_active; diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index c263e07..162dc18 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -384,7 +384,7 @@ restart: * If this is a metadata allocation, try to reuse the busy * extent instead of trimming the allocation. */ - if (!args->userdata && + if (!xfs_alloc_is_userdata(args->datatype) && !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) { if (!xfs_extent_busy_update_extent(args->mp, args->pag, busyp, fbno, flen, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e612a02..6e4f7f9 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -38,6 +38,7 @@ #include "xfs_icache.h" #include "xfs_pnfs.h" #include "xfs_iomap.h" +#include "xfs_reflink.h" #include <linux/dcache.h> #include <linux/falloc.h> @@ -248,6 +249,7 @@ xfs_file_dio_aio_read( struct xfs_inode *ip = XFS_I(inode); loff_t isize = i_size_read(inode); size_t count = iov_iter_count(to); + loff_t end = iocb->ki_pos + count - 1; struct iov_iter data; struct xfs_buftarg *target; ssize_t ret = 0; @@ -269,61 +271,35 @@ xfs_file_dio_aio_read( return -EINVAL; } - /* - * Locking is a bit tricky here. If we take an exclusive lock for direct - * IO, we effectively serialise all new concurrent read IO to this file - * and block it behind IO that is currently in progress because IO in - * progress holds the IO lock shared. We only need to hold the lock - * exclusive to blow away the page cache, so only take lock exclusively - * if the page cache needs invalidation. This allows the normal direct - * IO case of no page cache pages to proceeed concurrently without - * serialisation. - */ + file_accessed(iocb->ki_filp); + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); if (mapping->nrpages) { - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); - xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); + ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); + if (ret) + goto out_unlock; /* - * The generic dio code only flushes the range of the particular - * I/O. Because we take an exclusive lock here, this whole - * sequence is considerably more expensive for us. This has a - * noticeable performance impact for any file with cached pages, - * even when outside of the range of the particular I/O. - * - * Hence, amortize the cost of the lock against a full file - * flush and reduce the chances of repeated iolock cycles going - * forward. + * Invalidate whole pages. This can return an error if we fail + * to invalidate a page, but this should never happen on XFS. + * Warn if it does fail. */ - if (mapping->nrpages) { - ret = filemap_write_and_wait(mapping); - if (ret) { - xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); - return ret; - } - - /* - * Invalidate whole pages. This can return an error if - * we fail to invalidate a page, but this should never - * happen on XFS. Warn if it does fail. - */ - ret = invalidate_inode_pages2(mapping); - WARN_ON_ONCE(ret); - ret = 0; - } - xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + ret = invalidate_inode_pages2_range(mapping, + iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); + WARN_ON_ONCE(ret); + ret = 0; } data = *to; ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, xfs_get_blocks_direct, NULL, NULL, 0); - if (ret > 0) { + if (ret >= 0) { iocb->ki_pos += ret; iov_iter_advance(to, ret); } - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); - file_accessed(iocb->ki_filp); +out_unlock: + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -332,10 +308,7 @@ xfs_file_dax_read( struct kiocb *iocb, struct iov_iter *to) { - struct address_space *mapping = iocb->ki_filp->f_mapping; - struct inode *inode = mapping->host; - struct xfs_inode *ip = XFS_I(inode); - struct iov_iter data = *to; + struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); size_t count = iov_iter_count(to); ssize_t ret = 0; @@ -345,11 +318,7 @@ xfs_file_dax_read( return 0; /* skip atime */ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0); - if (ret > 0) { - iocb->ki_pos += ret; - iov_iter_advance(to, ret); - } + ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops); xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); file_accessed(iocb->ki_filp); @@ -399,45 +368,6 @@ xfs_file_read_iter( return ret; } -STATIC ssize_t -xfs_file_splice_read( - struct file *infilp, - loff_t *ppos, - struct pipe_inode_info *pipe, - size_t count, - unsigned int flags) -{ - struct xfs_inode *ip = XFS_I(infilp->f_mapping->host); - ssize_t ret; - - XFS_STATS_INC(ip->i_mount, xs_read_calls); - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EIO; - - trace_xfs_file_splice_read(ip, count, *ppos); - - /* - * DAX inodes cannot ues the page cache for splice, so we have to push - * them through the VFS IO path. This means it goes through - * ->read_iter, which for us takes the XFS_IOLOCK_SHARED. Hence we - * cannot lock the splice operation at this level for DAX inodes. - */ - if (IS_DAX(VFS_I(ip))) { - ret = default_file_splice_read(infilp, ppos, pipe, count, - flags); - goto out; - } - - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); -out: - if (ret > 0) - XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret); - return ret; -} - /* * Zero any on disk space between the current EOF and the new, larger EOF. * @@ -614,61 +544,49 @@ xfs_file_dio_aio_write( if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; - /* "unaligned" here means not aligned to a filesystem block */ - if ((iocb->ki_pos & mp->m_blockmask) || - ((iocb->ki_pos + count) & mp->m_blockmask)) - unaligned_io = 1; - /* - * We don't need to take an exclusive lock unless there page cache needs - * to be invalidated or unaligned IO is being executed. We don't need to - * consider the EOF extension case here because - * xfs_file_aio_write_checks() will relock the inode as necessary for - * EOF zeroing cases and fill out the new inode size as appropriate. + * Don't take the exclusive iolock here unless the I/O is unaligned to + * the file system block size. We don't need to consider the EOF + * extension case here because xfs_file_aio_write_checks() will relock + * the inode as necessary for EOF zeroing cases and fill out the new + * inode size as appropriate. */ - if (unaligned_io || mapping->nrpages) + if ((iocb->ki_pos & mp->m_blockmask) || + ((iocb->ki_pos + count) & mp->m_blockmask)) { + unaligned_io = 1; iolock = XFS_IOLOCK_EXCL; - else + } else { iolock = XFS_IOLOCK_SHARED; - xfs_rw_ilock(ip, iolock); - - /* - * Recheck if there are cached pages that need invalidate after we got - * the iolock to protect against other threads adding new pages while - * we were waiting for the iolock. - */ - if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) { - xfs_rw_iunlock(ip, iolock); - iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, iolock); } + xfs_rw_ilock(ip, iolock); + ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; count = iov_iter_count(from); end = iocb->ki_pos + count - 1; - /* - * See xfs_file_dio_aio_read() for why we do a full-file flush here. - */ if (mapping->nrpages) { - ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); + ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); if (ret) goto out; + /* * Invalidate whole pages. This can return an error if we fail * to invalidate a page, but this should never happen on XFS. * Warn if it does fail. */ - ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping); + ret = invalidate_inode_pages2_range(mapping, + iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); WARN_ON_ONCE(ret); ret = 0; } /* * If we are doing unaligned IO, wait for all other IO to drain, - * otherwise demote the lock if we had to flush cached pages + * otherwise demote the lock if we had to take the exclusive lock + * for other reasons in xfs_file_aio_write_checks. */ if (unaligned_io) inode_dio_wait(inode); @@ -679,6 +597,13 @@ xfs_file_dio_aio_write( trace_xfs_file_direct_write(ip, count, iocb->ki_pos); + /* If this is a block-aligned directio CoW, remap immediately. */ + if (xfs_is_reflink_inode(ip) && !unaligned_io) { + ret = xfs_reflink_allocate_cow_range(ip, iocb->ki_pos, count); + if (ret) + goto out; + } + data = *from; ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, xfs_get_blocks_direct, xfs_end_io_direct_write, @@ -711,70 +636,32 @@ xfs_file_dax_write( struct kiocb *iocb, struct iov_iter *from) { - struct address_space *mapping = iocb->ki_filp->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = iocb->ki_filp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - ssize_t ret = 0; - int unaligned_io = 0; - int iolock; - struct iov_iter data; + int iolock = XFS_IOLOCK_EXCL; + ssize_t ret, error = 0; + size_t count; + loff_t pos; - /* "unaligned" here means not aligned to a filesystem block */ - if ((iocb->ki_pos & mp->m_blockmask) || - ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) { - unaligned_io = 1; - iolock = XFS_IOLOCK_EXCL; - } else if (mapping->nrpages) { - iolock = XFS_IOLOCK_EXCL; - } else { - iolock = XFS_IOLOCK_SHARED; - } xfs_rw_ilock(ip, iolock); - ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; - /* - * Yes, even DAX files can have page cache attached to them: A zeroed - * page is inserted into the pagecache when we have to serve a write - * fault on a hole. It should never be dirtied and can simply be - * dropped from the pagecache once we get real data for the page. - * - * XXX: This is racy against mmap, and there's nothing we can do about - * it. dax_do_io() should really do this invalidation internally as - * it will know if we've allocated over a holei for this specific IO and - * if so it needs to update the mapping tree and invalidate existing - * PTEs over the newly allocated range. Remove this invalidation when - * dax_do_io() is fixed up. - */ - if (mapping->nrpages) { - loff_t end = iocb->ki_pos + iov_iter_count(from) - 1; + pos = iocb->ki_pos; + count = iov_iter_count(from); - ret = invalidate_inode_pages2_range(mapping, - iocb->ki_pos >> PAGE_SHIFT, - end >> PAGE_SHIFT); - WARN_ON_ONCE(ret); - } + trace_xfs_file_dax_write(ip, count, pos); - if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) { - xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); - iolock = XFS_IOLOCK_SHARED; + ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops); + if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { + i_size_write(inode, iocb->ki_pos); + error = xfs_setfilesize(ip, pos, ret); } - trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos); - - data = *from; - ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, - xfs_end_io_direct_write, 0); - if (ret > 0) { - iocb->ki_pos += ret; - iov_iter_advance(from, ret); - } out: xfs_rw_iunlock(ip, iolock); - return ret; + return error ? error : ret; } STATIC ssize_t @@ -818,6 +705,9 @@ write_retry: enospc = xfs_inode_free_quota_eofblocks(ip); if (enospc) goto write_retry; + enospc = xfs_inode_free_quota_cowblocks(ip); + if (enospc) + goto write_retry; } else if (ret == -ENOSPC && !enospc) { struct xfs_eofblocks eofb = {0}; @@ -857,10 +747,20 @@ xfs_file_write_iter( if (IS_DAX(inode)) ret = xfs_file_dax_write(iocb, from); - else if (iocb->ki_flags & IOCB_DIRECT) + else if (iocb->ki_flags & IOCB_DIRECT) { + /* + * Allow a directio write to fall back to a buffered + * write *only* in the case that we're doing a reflink + * CoW. In all other directio scenarios we do not + * allow an operation to fall back to buffered mode. + */ ret = xfs_file_dio_aio_write(iocb, from); - else + if (ret == -EREMCHG) + goto buffered; + } else { +buffered: ret = xfs_file_buffered_aio_write(iocb, from); + } if (ret > 0) { XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); @@ -874,7 +774,7 @@ xfs_file_write_iter( #define XFS_FALLOC_FL_SUPPORTED \ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \ - FALLOC_FL_INSERT_RANGE) + FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE) STATIC long xfs_file_fallocate( @@ -964,9 +864,15 @@ xfs_file_fallocate( if (mode & FALLOC_FL_ZERO_RANGE) error = xfs_zero_file_space(ip, offset, len); - else + else { + if (mode & FALLOC_FL_UNSHARE_RANGE) { + error = xfs_reflink_unshare(ip, offset, len); + if (error) + goto out_unlock; + } error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC); + } if (error) goto out_unlock; } @@ -984,7 +890,7 @@ xfs_file_fallocate( iattr.ia_valid = ATTR_SIZE; iattr.ia_size = new_size; - error = xfs_setattr_size(ip, &iattr); + error = xfs_vn_setattr_size(file_dentry(file), &iattr); if (error) goto out_unlock; } @@ -1003,6 +909,61 @@ out_unlock: return error; } +STATIC ssize_t +xfs_file_copy_range( + struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + size_t len, + unsigned int flags) +{ + int error; + + error = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, + len, false); + if (error) + return error; + return len; +} + +STATIC int +xfs_file_clone_range( + struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len) +{ + return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, + len, false); +} + +#define XFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) +STATIC ssize_t +xfs_file_dedupe_range( + struct file *src_file, + u64 loff, + u64 len, + struct file *dst_file, + u64 dst_loff) +{ + int error; + + /* + * Limit the total length we will dedupe for each operation. + * This is intended to bound the total time spent in this + * ioctl to something sane. + */ + if (len > XFS_MAX_DEDUPE_LEN) + len = XFS_MAX_DEDUPE_LEN; + + error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff, + len, true); + if (error) + return error; + return len; +} STATIC int xfs_file_open( @@ -1513,7 +1474,7 @@ xfs_filemap_page_mkwrite( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault); + ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops); } else { ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops); ret = block_page_mkwrite_return(ret); @@ -1547,7 +1508,7 @@ xfs_filemap_fault( * changes to xfs_get_blocks_direct() to map unwritten extent * ioend for conversion on read-only mappings. */ - ret = dax_fault(vma, vmf, xfs_get_blocks_dax_fault); + ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops); } else ret = filemap_fault(vma, vmf); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); @@ -1652,7 +1613,7 @@ const struct file_operations xfs_file_operations = { .llseek = xfs_file_llseek, .read_iter = xfs_file_read_iter, .write_iter = xfs_file_write_iter, - .splice_read = xfs_file_splice_read, + .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .unlocked_ioctl = xfs_file_ioctl, #ifdef CONFIG_COMPAT @@ -1662,7 +1623,11 @@ const struct file_operations xfs_file_operations = { .open = xfs_file_open, .release = xfs_file_release, .fsync = xfs_file_fsync, + .get_unmapped_area = thp_get_unmapped_area, .fallocate = xfs_file_fallocate, + .copy_file_range = xfs_file_copy_range, + .clone_file_range = xfs_file_clone_range, + .dedupe_file_range = xfs_file_dedupe_range, }; const struct file_operations xfs_dir_file_operations = { diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 4a33a33..043ca380 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -30,6 +30,7 @@ #include "xfs_mru_cache.h" #include "xfs_filestream.h" #include "xfs_trace.h" +#include "xfs_ag_resv.h" struct xfs_fstrm_item { struct xfs_mru_cache_elem mru; @@ -198,7 +199,8 @@ xfs_filestream_pick_ag( } longest = xfs_alloc_longest_free_extent(mp, pag, - xfs_alloc_min_freelist(mp, pag)); + xfs_alloc_min_freelist(mp, pag), + xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE)); if (((minlen && longest >= minlen) || (!minlen && pag->pagf_freeblks >= minfree)) && (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) || @@ -369,7 +371,8 @@ xfs_filestream_new_ag( struct xfs_mount *mp = ip->i_mount; xfs_extlen_t minlen = ap->length; xfs_agnumber_t startag = 0; - int flags, err = 0; + int flags = 0; + int err = 0; struct xfs_mru_cache_elem *mru; *agp = NULLAGNUMBER; @@ -385,8 +388,10 @@ xfs_filestream_new_ag( startag = (item->ag + 1) % mp->m_sb.sb_agcount; } - flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | - (ap->dfops->dop_low ? XFS_PICK_LOWSPACE : 0); + if (xfs_alloc_is_userdata(ap->datatype)) + flags |= XFS_PICK_USERDATA; + if (ap->dfops->dop_low) + flags |= XFS_PICK_LOWSPACE; err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen); diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 0b7f986..93d12fa 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -43,6 +43,7 @@ #include "xfs_log.h" #include "xfs_filestream.h" #include "xfs_rmap.h" +#include "xfs_ag_resv.h" /* * File system operations @@ -108,7 +109,9 @@ xfs_fs_geometry( (xfs_sb_version_hassparseinodes(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_SPINODES : 0) | (xfs_sb_version_hasrmapbt(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_RMAPBT : 0); + XFS_FSOP_GEOM_FLAGS_RMAPBT : 0) | + (xfs_sb_version_hasreflink(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_REFLINK : 0); geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? mp->m_sb.sb_logsectsize : BBSIZE; geo->rtsectsize = mp->m_sb.sb_blocksize; @@ -259,6 +262,12 @@ xfs_growfs_data_private( agf->agf_longest = cpu_to_be32(tmpsize); if (xfs_sb_version_hascrc(&mp->m_sb)) uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid); + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + agf->agf_refcount_root = cpu_to_be32( + xfs_refc_block(mp)); + agf->agf_refcount_level = cpu_to_be32(1); + agf->agf_refcount_blocks = cpu_to_be32(1); + } error = xfs_bwrite(bp); xfs_buf_relse(bp); @@ -450,6 +459,17 @@ xfs_growfs_data_private( rrec->rm_offset = 0; be16_add_cpu(&block->bb_numrecs, 1); + /* account for refc btree root */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + rrec = XFS_RMAP_REC_ADDR(block, 5); + rrec->rm_startblock = cpu_to_be32( + xfs_refc_block(mp)); + rrec->rm_blockcount = cpu_to_be32(1); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC); + rrec->rm_offset = 0; + be16_add_cpu(&block->bb_numrecs, 1); + } + error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) @@ -507,6 +527,28 @@ xfs_growfs_data_private( goto error0; } + /* + * refcount btree root block + */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, xfs_refc_block(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_refcountbt_buf_ops); + if (!bp) { + error = -ENOMEM; + goto error0; + } + + xfs_btree_init_block(mp, bp, XFS_REFC_CRC_MAGIC, + 0, 0, agno, + XFS_BTREE_CRC_BLOCKS); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + goto error0; + } } xfs_trans_agblocks_delta(tp, nfree); /* @@ -553,7 +595,7 @@ xfs_growfs_data_private( error = xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, agno, be32_to_cpu(agf->agf_length) - new), - new, &oinfo); + new, &oinfo, XFS_AG_RESV_NONE); if (error) goto error0; } @@ -589,6 +631,11 @@ xfs_growfs_data_private( xfs_set_low_space_thresholds(mp); mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); + /* Reserve AG metadata blocks. */ + error = xfs_fs_reserve_ag_blocks(mp); + if (error && error != -ENOSPC) + goto out; + /* update secondary superblocks. */ for (agno = 1; agno < nagcount; agno++) { error = 0; @@ -639,6 +686,8 @@ xfs_growfs_data_private( continue; } } + + out: return saved_error ? saved_error : error; error0: @@ -948,3 +997,59 @@ xfs_do_force_shutdown( "Please umount the filesystem and rectify the problem(s)"); } } + +/* + * Reserve free space for per-AG metadata. + */ +int +xfs_fs_reserve_ag_blocks( + struct xfs_mount *mp) +{ + xfs_agnumber_t agno; + struct xfs_perag *pag; + int error = 0; + int err2; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + pag = xfs_perag_get(mp, agno); + err2 = xfs_ag_resv_init(pag); + xfs_perag_put(pag); + if (err2 && !error) + error = err2; + } + + if (error && error != -ENOSPC) { + xfs_warn(mp, + "Error %d reserving per-AG metadata reserve pool.", error); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + } + + return error; +} + +/* + * Free space reserved for per-AG metadata. + */ +int +xfs_fs_unreserve_ag_blocks( + struct xfs_mount *mp) +{ + xfs_agnumber_t agno; + struct xfs_perag *pag; + int error = 0; + int err2; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + pag = xfs_perag_get(mp, agno); + err2 = xfs_ag_resv_free(pag); + xfs_perag_put(pag); + if (err2 && !error) + error = err2; + } + + if (error) + xfs_warn(mp, + "Error %d freeing per-AG metadata reserve pool.", error); + + return error; +} diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index f32713f..f349158 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -26,4 +26,7 @@ extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, xfs_fsop_resblks_t *outval); extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); +extern int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp); +extern int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp); + #endif /* __XFS_FSOPS_H__ */ diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 4d41b24..687a4b0 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -21,8 +21,8 @@ /* * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n, * other XFS code uses these values. Times are measured in centisecs (i.e. - * 100ths of a second) with the exception of eofb_timer, which is measured in - * seconds. + * 100ths of a second) with the exception of eofb_timer and cowb_timer, which + * are measured in seconds. */ xfs_param_t xfs_params = { /* MIN DFLT MAX */ @@ -42,6 +42,7 @@ xfs_param_t xfs_params = { .inherit_nodfrg = { 0, 1, 1 }, .fstrm_timer = { 1, 30*100, 3600*100}, .eofb_timer = { 1, 300, 3600*24}, + .cowb_timer = { 1, 1800, 3600*24}, }; struct xfs_globals xfs_globals = { diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index fb39a66..f295049 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -33,6 +33,7 @@ #include "xfs_bmap_util.h" #include "xfs_dquot_item.h" #include "xfs_dquot.h" +#include "xfs_reflink.h" #include <linux/kthread.h> #include <linux/freezer.h> @@ -76,6 +77,9 @@ xfs_inode_alloc( ip->i_mount = mp; memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); ip->i_afp = NULL; + ip->i_cowfp = NULL; + ip->i_cnextents = 0; + ip->i_cformat = XFS_DINODE_FMT_EXTENTS; memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); ip->i_flags = 0; ip->i_delayed_blks = 0; @@ -101,6 +105,8 @@ xfs_inode_free_callback( if (ip->i_afp) xfs_idestroy_fork(ip, XFS_ATTR_FORK); + if (ip->i_cowfp) + xfs_idestroy_fork(ip, XFS_COW_FORK); if (ip->i_itemp) { ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); @@ -787,6 +793,33 @@ xfs_eofblocks_worker( xfs_queue_eofblocks(mp); } +/* + * Background scanning to trim preallocated CoW space. This is queued + * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default). + * (We'll just piggyback on the post-EOF prealloc space workqueue.) + */ +STATIC void +xfs_queue_cowblocks( + struct xfs_mount *mp) +{ + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG)) + queue_delayed_work(mp->m_eofblocks_workqueue, + &mp->m_cowblocks_work, + msecs_to_jiffies(xfs_cowb_secs * 1000)); + rcu_read_unlock(); +} + +void +xfs_cowblocks_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_cowblocks_work); + xfs_icache_free_cowblocks(mp, NULL); + xfs_queue_cowblocks(mp); +} + int xfs_inode_ag_iterator( struct xfs_mount *mp, @@ -1343,18 +1376,30 @@ xfs_inode_free_eofblocks( return ret; } -int -xfs_icache_free_eofblocks( +static int +__xfs_icache_free_eofblocks( struct xfs_mount *mp, - struct xfs_eofblocks *eofb) + struct xfs_eofblocks *eofb, + int (*execute)(struct xfs_inode *ip, int flags, + void *args), + int tag) { int flags = SYNC_TRYLOCK; if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC)) flags = SYNC_WAIT; - return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags, - eofb, XFS_ICI_EOFBLOCKS_TAG); + return xfs_inode_ag_iterator_tag(mp, execute, flags, + eofb, tag); +} + +int +xfs_icache_free_eofblocks( + struct xfs_mount *mp, + struct xfs_eofblocks *eofb) +{ + return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_eofblocks, + XFS_ICI_EOFBLOCKS_TAG); } /* @@ -1363,9 +1408,11 @@ xfs_icache_free_eofblocks( * failure. We make a best effort by including each quota under low free space * conditions (less than 1% free space) in the scan. */ -int -xfs_inode_free_quota_eofblocks( - struct xfs_inode *ip) +static int +__xfs_inode_free_quota_eofblocks( + struct xfs_inode *ip, + int (*execute)(struct xfs_mount *mp, + struct xfs_eofblocks *eofb)) { int scan = 0; struct xfs_eofblocks eofb = {0}; @@ -1401,41 +1448,58 @@ xfs_inode_free_quota_eofblocks( } if (scan) - xfs_icache_free_eofblocks(ip->i_mount, &eofb); + execute(ip->i_mount, &eofb); return scan; } -void -xfs_inode_set_eofblocks_tag( - xfs_inode_t *ip) +int +xfs_inode_free_quota_eofblocks( + struct xfs_inode *ip) +{ + return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks); +} + +static void +__xfs_inode_set_eofblocks_tag( + xfs_inode_t *ip, + void (*execute)(struct xfs_mount *mp), + void (*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, + int error, unsigned long caller_ip), + int tag) { struct xfs_mount *mp = ip->i_mount; struct xfs_perag *pag; int tagged; + /* + * Don't bother locking the AG and looking up in the radix trees + * if we already know that we have the tag set. + */ + if (ip->i_flags & XFS_IEOFBLOCKS) + return; + spin_lock(&ip->i_flags_lock); + ip->i_flags |= XFS_IEOFBLOCKS; + spin_unlock(&ip->i_flags_lock); + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock); - trace_xfs_inode_set_eofblocks_tag(ip); - tagged = radix_tree_tagged(&pag->pag_ici_root, - XFS_ICI_EOFBLOCKS_TAG); + tagged = radix_tree_tagged(&pag->pag_ici_root, tag); radix_tree_tag_set(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), - XFS_ICI_EOFBLOCKS_TAG); + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag); if (!tagged) { /* propagate the eofblocks tag up into the perag radix tree */ spin_lock(&ip->i_mount->m_perag_lock); radix_tree_tag_set(&ip->i_mount->m_perag_tree, XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - XFS_ICI_EOFBLOCKS_TAG); + tag); spin_unlock(&ip->i_mount->m_perag_lock); /* kick off background trimming */ - xfs_queue_eofblocks(ip->i_mount); + execute(ip->i_mount); - trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno, - -1, _RET_IP_); + set_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_); } spin_unlock(&pag->pag_ici_lock); @@ -1443,31 +1507,166 @@ xfs_inode_set_eofblocks_tag( } void -xfs_inode_clear_eofblocks_tag( +xfs_inode_set_eofblocks_tag( xfs_inode_t *ip) { + trace_xfs_inode_set_eofblocks_tag(ip); + return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_eofblocks, + trace_xfs_perag_set_eofblocks, + XFS_ICI_EOFBLOCKS_TAG); +} + +static void +__xfs_inode_clear_eofblocks_tag( + xfs_inode_t *ip, + void (*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, + int error, unsigned long caller_ip), + int tag) +{ struct xfs_mount *mp = ip->i_mount; struct xfs_perag *pag; + spin_lock(&ip->i_flags_lock); + ip->i_flags &= ~XFS_IEOFBLOCKS; + spin_unlock(&ip->i_flags_lock); + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock); - trace_xfs_inode_clear_eofblocks_tag(ip); radix_tree_tag_clear(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), - XFS_ICI_EOFBLOCKS_TAG); - if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) { + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag); + if (!radix_tree_tagged(&pag->pag_ici_root, tag)) { /* clear the eofblocks tag from the perag radix tree */ spin_lock(&ip->i_mount->m_perag_lock); radix_tree_tag_clear(&ip->i_mount->m_perag_tree, XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - XFS_ICI_EOFBLOCKS_TAG); + tag); spin_unlock(&ip->i_mount->m_perag_lock); - trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno, - -1, _RET_IP_); + clear_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_); } spin_unlock(&pag->pag_ici_lock); xfs_perag_put(pag); } +void +xfs_inode_clear_eofblocks_tag( + xfs_inode_t *ip) +{ + trace_xfs_inode_clear_eofblocks_tag(ip); + return __xfs_inode_clear_eofblocks_tag(ip, + trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG); +} + +/* + * Automatic CoW Reservation Freeing + * + * These functions automatically garbage collect leftover CoW reservations + * that were made on behalf of a cowextsize hint when we start to run out + * of quota or when the reservations sit around for too long. If the file + * has dirty pages or is undergoing writeback, its CoW reservations will + * be retained. + * + * The actual garbage collection piggybacks off the same code that runs + * the speculative EOF preallocation garbage collector. + */ +STATIC int +xfs_inode_free_cowblocks( + struct xfs_inode *ip, + int flags, + void *args) +{ + int ret; + struct xfs_eofblocks *eofb = args; + bool need_iolock = true; + int match; + + ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); + + if (!xfs_reflink_has_real_cow_blocks(ip)) { + trace_xfs_inode_free_cowblocks_invalid(ip); + xfs_inode_clear_cowblocks_tag(ip); + return 0; + } + + /* + * If the mapping is dirty or under writeback we cannot touch the + * CoW fork. Leave it alone if we're in the midst of a directio. + */ + if (mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) || + mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) || + atomic_read(&VFS_I(ip)->i_dio_count)) + return 0; + + if (eofb) { + if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) + match = xfs_inode_match_id_union(ip, eofb); + else + match = xfs_inode_match_id(ip, eofb); + if (!match) + return 0; + + /* skip the inode if the file size is too small */ + if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && + XFS_ISIZE(ip) < eofb->eof_min_file_size) + return 0; + + /* + * A scan owner implies we already hold the iolock. Skip it in + * xfs_free_eofblocks() to avoid deadlock. This also eliminates + * the possibility of EAGAIN being returned. + */ + if (eofb->eof_scan_owner == ip->i_ino) + need_iolock = false; + } + + /* Free the CoW blocks */ + if (need_iolock) { + xfs_ilock(ip, XFS_IOLOCK_EXCL); + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + } + + ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); + + if (need_iolock) { + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + } + + return ret; +} + +int +xfs_icache_free_cowblocks( + struct xfs_mount *mp, + struct xfs_eofblocks *eofb) +{ + return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_cowblocks, + XFS_ICI_COWBLOCKS_TAG); +} + +int +xfs_inode_free_quota_cowblocks( + struct xfs_inode *ip) +{ + return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_cowblocks); +} + +void +xfs_inode_set_cowblocks_tag( + xfs_inode_t *ip) +{ + trace_xfs_inode_set_cowblocks_tag(ip); + return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_cowblocks, + trace_xfs_perag_set_cowblocks, + XFS_ICI_COWBLOCKS_TAG); +} + +void +xfs_inode_clear_cowblocks_tag( + xfs_inode_t *ip) +{ + trace_xfs_inode_clear_cowblocks_tag(ip); + return __xfs_inode_clear_eofblocks_tag(ip, + trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG); +} diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 05bac99..a1e02f4 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -40,6 +40,7 @@ struct xfs_eofblocks { in xfs_inode_ag_iterator */ #define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ #define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */ +#define XFS_ICI_COWBLOCKS_TAG 2 /* inode can have cow blocks to gc */ /* * Flags for xfs_iget() @@ -70,6 +71,12 @@ int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip); void xfs_eofblocks_worker(struct work_struct *); void xfs_queue_eofblocks(struct xfs_mount *); +void xfs_inode_set_cowblocks_tag(struct xfs_inode *ip); +void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip); +int xfs_icache_free_cowblocks(struct xfs_mount *, struct xfs_eofblocks *); +int xfs_inode_free_quota_cowblocks(struct xfs_inode *ip); +void xfs_cowblocks_worker(struct work_struct *); + int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index e08eaea..4e560e6 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -49,6 +49,7 @@ #include "xfs_trans_priv.h" #include "xfs_log.h" #include "xfs_bmap_btree.h" +#include "xfs_reflink.h" kmem_zone_t *xfs_inode_zone; @@ -77,6 +78,29 @@ xfs_get_extsz_hint( } /* + * Helper function to extract CoW extent size hint from inode. + * Between the extent size hint and the CoW extent size hint, we + * return the greater of the two. If the value is zero (automatic), + * use the default size. + */ +xfs_extlen_t +xfs_get_cowextsz_hint( + struct xfs_inode *ip) +{ + xfs_extlen_t a, b; + + a = 0; + if (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) + a = ip->i_d.di_cowextsize; + b = xfs_get_extsz_hint(ip); + + a = max(a, b); + if (a == 0) + return XFS_DEFAULT_COWEXTSZ_HINT; + return a; +} + +/* * These two are wrapper routines around the xfs_ilock() routine used to * centralize some grungy code. They are used in places that wish to lock the * inode solely for reading the extents. The reason these places can't just @@ -651,6 +675,8 @@ _xfs_dic2xflags( if (di_flags2 & XFS_DIFLAG2_ANY) { if (di_flags2 & XFS_DIFLAG2_DAX) flags |= FS_XFLAG_DAX; + if (di_flags2 & XFS_DIFLAG2_COWEXTSIZE) + flags |= FS_XFLAG_COWEXTSIZE; } if (has_attr) @@ -821,7 +847,7 @@ xfs_ialloc( ip->i_d.di_nextents = 0; ASSERT(ip->i_d.di_nblocks == 0); - tv = current_fs_time(mp->m_super); + tv = current_time(inode); inode->i_mtime = tv; inode->i_atime = tv; inode->i_ctime = tv; @@ -834,6 +860,7 @@ xfs_ialloc( if (ip->i_d.di_version == 3) { inode->i_version = 1; ip->i_d.di_flags2 = 0; + ip->i_d.di_cowextsize = 0; ip->i_d.di_crtime.t_sec = (__int32_t)tv.tv_sec; ip->i_d.di_crtime.t_nsec = (__int32_t)tv.tv_nsec; } @@ -896,6 +923,15 @@ xfs_ialloc( ip->i_d.di_flags |= di_flags; ip->i_d.di_flags2 |= di_flags2; } + if (pip && + (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) && + pip->i_d.di_version == 3 && + ip->i_d.di_version == 3) { + if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { + ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; + } + } /* FALLTHROUGH */ case S_IFLNK: ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; @@ -1586,6 +1622,20 @@ xfs_itruncate_extents( goto out; } + /* Remove all pending CoW reservations. */ + error = xfs_reflink_cancel_cow_blocks(ip, &tp, first_unmap_block, + last_block); + if (error) + goto out; + + /* + * Clear the reflink flag if we truncated everything. + */ + if (ip->i_d.di_nblocks == 0 && xfs_is_reflink_inode(ip)) { + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + xfs_inode_clear_cowblocks_tag(ip); + } + /* * Always re-log the inode so that our permanent transaction can keep * on rolling it forward in the log. @@ -1710,7 +1760,7 @@ xfs_inactive_truncate( /* * Log the inode size first to prevent stale data exposure in the event * of a system crash before the truncate completes. See the related - * comment in xfs_setattr_size() for details. + * comment in xfs_vn_setattr_size() for details. */ ip->i_d.di_size = 0; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -1850,6 +1900,7 @@ xfs_inactive( } mp = ip->i_mount; + ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY)); /* If this is a read-only mount, don't do this (would generate I/O) */ if (mp->m_flags & XFS_MOUNT_RDONLY) diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index e1a411e..f14c1de 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -47,6 +47,7 @@ typedef struct xfs_inode { /* Extent information. */ xfs_ifork_t *i_afp; /* attribute fork pointer */ + xfs_ifork_t *i_cowfp; /* copy on write extents */ xfs_ifork_t i_df; /* data fork */ /* operations vectors */ @@ -65,6 +66,9 @@ typedef struct xfs_inode { struct xfs_icdinode i_d; /* most of ondisk inode */ + xfs_extnum_t i_cnextents; /* # of extents in cow fork */ + unsigned int i_cformat; /* format of cow fork */ + /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ } xfs_inode_t; @@ -202,6 +206,11 @@ xfs_get_initial_prid(struct xfs_inode *dp) return XFS_PROJID_DEFAULT; } +static inline bool xfs_is_reflink_inode(struct xfs_inode *ip) +{ + return ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK; +} + /* * In-core inode flags. */ @@ -216,6 +225,13 @@ xfs_get_initial_prid(struct xfs_inode *dp) #define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ #define XFS_IPINNED (1 << __XFS_IPINNED_BIT) #define XFS_IDONTCACHE (1 << 9) /* don't cache the inode long term */ +#define XFS_IEOFBLOCKS (1 << 10)/* has the preallocblocks tag set */ +/* + * If this unlinked inode is in the middle of recovery, don't let drop_inode + * truncate and free the inode. This can happen if we iget the inode during + * log recovery to replay a bmap operation on the inode. + */ +#define XFS_IRECOVERY (1 << 11) /* * Per-lifetime flags need to be reset when re-using a reclaimable inode during @@ -410,6 +426,7 @@ int xfs_iflush(struct xfs_inode *, struct xfs_buf **); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); +xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, @@ -473,4 +490,7 @@ do { \ extern struct kmem_zone *xfs_inode_zone; +/* The default CoW extent size hint. */ +#define XFS_DEFAULT_COWEXTSZ_HINT 32 + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 892c2ac..9610e9c 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -368,7 +368,7 @@ xfs_inode_to_log_dinode( to->di_crtime.t_sec = from->di_crtime.t_sec; to->di_crtime.t_nsec = from->di_crtime.t_nsec; to->di_flags2 = from->di_flags2; - + to->di_cowextsize = from->di_cowextsize; to->di_ino = ip->i_ino; to->di_lsn = lsn; memset(to->di_pad2, 0, sizeof(to->di_pad2)); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 96a70fd..c245bed 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -720,7 +720,7 @@ xfs_ioc_space( iattr.ia_valid = ATTR_SIZE; iattr.ia_size = bf->l_start; - error = xfs_setattr_size(ip, &iattr); + error = xfs_vn_setattr_size(file_dentry(filp), &iattr); break; default: ASSERT(0); @@ -903,6 +903,8 @@ xfs_ioc_fsgetxattr( xfs_ilock(ip, XFS_ILOCK_SHARED); fa.fsx_xflags = xfs_ip2xflags(ip); fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; + fa.fsx_cowextsize = ip->i_d.di_cowextsize << + ip->i_mount->m_sb.sb_blocklog; fa.fsx_projid = xfs_get_projid(ip); if (attr) { @@ -973,12 +975,13 @@ xfs_set_diflags( if (ip->i_d.di_version < 3) return; - di_flags2 = 0; + di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); if (xflags & FS_XFLAG_DAX) di_flags2 |= XFS_DIFLAG2_DAX; + if (xflags & FS_XFLAG_COWEXTSIZE) + di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; ip->i_d.di_flags2 = di_flags2; - } STATIC void @@ -1031,6 +1034,14 @@ xfs_ioctl_setattr_xflags( return -EINVAL; } + /* Clear reflink if we are actually able to set the rt flag. */ + if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip)) + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + + /* Don't allow us to set DAX mode for a reflinked file for now. */ + if ((fa->fsx_xflags & FS_XFLAG_DAX) && xfs_is_reflink_inode(ip)) + return -EINVAL; + /* * Can't modify an immutable/append-only file unless * we have appropriate permission. @@ -1219,6 +1230,56 @@ xfs_ioctl_setattr_check_extsize( return 0; } +/* + * CoW extent size hint validation rules are: + * + * 1. CoW extent size hint can only be set if reflink is enabled on the fs. + * The inode does not have to have any shared blocks, but it must be a v3. + * 2. FS_XFLAG_COWEXTSIZE is only valid for directories and regular files; + * for a directory, the hint is propagated to new files. + * 3. Can be changed on files & directories at any time. + * 4. CoW extsize hint of 0 turns off hints, clears inode flags. + * 5. Extent size must be a multiple of the appropriate block size. + * 6. The extent size hint must be limited to half the AG size to avoid + * alignment extending the extent beyond the limits of the AG. + */ +static int +xfs_ioctl_setattr_check_cowextsize( + struct xfs_inode *ip, + struct fsxattr *fa) +{ + struct xfs_mount *mp = ip->i_mount; + + if (!(fa->fsx_xflags & FS_XFLAG_COWEXTSIZE)) + return 0; + + if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb) || + ip->i_d.di_version != 3) + return -EINVAL; + + if (!S_ISREG(VFS_I(ip)->i_mode) && !S_ISDIR(VFS_I(ip)->i_mode)) + return -EINVAL; + + if (fa->fsx_cowextsize != 0) { + xfs_extlen_t size; + xfs_fsblock_t cowextsize_fsb; + + cowextsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_cowextsize); + if (cowextsize_fsb > MAXEXTLEN) + return -EINVAL; + + size = mp->m_sb.sb_blocksize; + if (cowextsize_fsb > mp->m_sb.sb_agblocks / 2) + return -EINVAL; + + if (fa->fsx_cowextsize % size) + return -EINVAL; + } else + fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; + + return 0; +} + static int xfs_ioctl_setattr_check_projid( struct xfs_inode *ip, @@ -1311,6 +1372,10 @@ xfs_ioctl_setattr( if (code) goto error_trans_cancel; + code = xfs_ioctl_setattr_check_cowextsize(ip, fa); + if (code) + goto error_trans_cancel; + code = xfs_ioctl_setattr_xflags(tp, ip, fa); if (code) goto error_trans_cancel; @@ -1346,6 +1411,12 @@ xfs_ioctl_setattr( ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog; else ip->i_d.di_extsize = 0; + if (ip->i_d.di_version == 3 && + (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) + ip->i_d.di_cowextsize = fa->fsx_cowextsize >> + mp->m_sb.sb_blocklog; + else + ip->i_d.di_cowextsize = 0; code = xfs_trans_commit(tp); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 2af0dda..436e109 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * Copyright (c) 2016 Christoph Hellwig. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -38,21 +39,45 @@ #include "xfs_quota.h" #include "xfs_dquot_item.h" #include "xfs_dquot.h" +#include "xfs_reflink.h" #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ << mp->m_writeio_log) -#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP -STATIC int -xfs_iomap_eof_align_last_fsb( - xfs_mount_t *mp, - xfs_inode_t *ip, - xfs_extlen_t extsize, - xfs_fileoff_t *last_fsb) +void +xfs_bmbt_to_iomap( + struct xfs_inode *ip, + struct iomap *iomap, + struct xfs_bmbt_irec *imap) { - xfs_extlen_t align = 0; - int eof, error; + struct xfs_mount *mp = ip->i_mount; + + if (imap->br_startblock == HOLESTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_HOLE; + } else if (imap->br_startblock == DELAYSTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_DELALLOC; + } else { + iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock); + if (imap->br_state == XFS_EXT_UNWRITTEN) + iomap->type = IOMAP_UNWRITTEN; + else + iomap->type = IOMAP_MAPPED; + } + iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); + iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); + iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); +} + +xfs_extlen_t +xfs_eof_alignment( + struct xfs_inode *ip, + xfs_extlen_t extsize) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_extlen_t align = 0; if (!XFS_IS_REALTIME_INODE(ip)) { /* @@ -83,8 +108,21 @@ xfs_iomap_eof_align_last_fsb( align = extsize; } + return align; +} + +STATIC int +xfs_iomap_eof_align_last_fsb( + struct xfs_inode *ip, + xfs_extlen_t extsize, + xfs_fileoff_t *last_fsb) +{ + xfs_extlen_t align = xfs_eof_alignment(ip, extsize); + if (align) { xfs_fileoff_t new_last_fsb = roundup_64(*last_fsb, align); + int eof, error; + error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof); if (error) return error; @@ -154,7 +192,7 @@ xfs_iomap_write_direct( */ ASSERT(XFS_IFORK_PTR(ip, XFS_DATA_FORK)->if_flags & XFS_IFEXTENTS); - error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); + error = xfs_iomap_eof_align_last_fsb(ip, extsz, &last_fsb); if (error) goto out_unlock; } else { @@ -274,130 +312,6 @@ out_trans_cancel: goto out_unlock; } -/* - * If the caller is doing a write at the end of the file, then extend the - * allocation out to the file system's write iosize. We clean up any extra - * space left over when the file is closed in xfs_inactive(). - * - * If we find we already have delalloc preallocation beyond EOF, don't do more - * preallocation as it it not needed. - */ -STATIC int -xfs_iomap_eof_want_preallocate( - xfs_mount_t *mp, - xfs_inode_t *ip, - xfs_off_t offset, - size_t count, - xfs_bmbt_irec_t *imap, - int nimaps, - int *prealloc) -{ - xfs_fileoff_t start_fsb; - xfs_filblks_t count_fsb; - int n, error, imaps; - int found_delalloc = 0; - - *prealloc = 0; - if (offset + count <= XFS_ISIZE(ip)) - return 0; - - /* - * If the file is smaller than the minimum prealloc and we are using - * dynamic preallocation, don't do any preallocation at all as it is - * likely this is the only write to the file that is going to be done. - */ - if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) && - XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks)) - return 0; - - /* - * If there are any real blocks past eof, then don't - * do any speculative allocation. - */ - start_fsb = XFS_B_TO_FSBT(mp, ((xfs_ufsize_t)(offset + count - 1))); - count_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); - while (count_fsb > 0) { - imaps = nimaps; - error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps, - 0); - if (error) - return error; - for (n = 0; n < imaps; n++) { - if ((imap[n].br_startblock != HOLESTARTBLOCK) && - (imap[n].br_startblock != DELAYSTARTBLOCK)) - return 0; - start_fsb += imap[n].br_blockcount; - count_fsb -= imap[n].br_blockcount; - - if (imap[n].br_startblock == DELAYSTARTBLOCK) - found_delalloc = 1; - } - } - if (!found_delalloc) - *prealloc = 1; - return 0; -} - -/* - * Determine the initial size of the preallocation. We are beyond the current - * EOF here, but we need to take into account whether this is a sparse write or - * an extending write when determining the preallocation size. Hence we need to - * look up the extent that ends at the current write offset and use the result - * to determine the preallocation size. - * - * If the extent is a hole, then preallocation is essentially disabled. - * Otherwise we take the size of the preceeding data extent as the basis for the - * preallocation size. If the size of the extent is greater than half the - * maximum extent length, then use the current offset as the basis. This ensures - * that for large files the preallocation size always extends to MAXEXTLEN - * rather than falling short due to things like stripe unit/width alignment of - * real extents. - */ -STATIC xfs_fsblock_t -xfs_iomap_eof_prealloc_initial_size( - struct xfs_mount *mp, - struct xfs_inode *ip, - xfs_off_t offset, - xfs_bmbt_irec_t *imap, - int nimaps) -{ - xfs_fileoff_t start_fsb; - int imaps = 1; - int error; - - ASSERT(nimaps >= imaps); - - /* if we are using a specific prealloc size, return now */ - if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) - return 0; - - /* If the file is small, then use the minimum prealloc */ - if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign)) - return 0; - - /* - * As we write multiple pages, the offset will always align to the - * start of a page and hence point to a hole at EOF. i.e. if the size is - * 4096 bytes, we only have one block at FSB 0, but XFS_B_TO_FSB(4096) - * will return FSB 1. Hence if there are blocks in the file, we want to - * point to the block prior to the EOF block and not the hole that maps - * directly at @offset. - */ - start_fsb = XFS_B_TO_FSB(mp, offset); - if (start_fsb) - start_fsb--; - error = xfs_bmapi_read(ip, start_fsb, 1, imap, &imaps, XFS_BMAPI_ENTIRE); - if (error) - return 0; - - ASSERT(imaps == 1); - if (imap[0].br_startblock == HOLESTARTBLOCK) - return 0; - if (imap[0].br_blockcount <= (MAXEXTLEN >> 1)) - return imap[0].br_blockcount << 1; - return XFS_B_TO_FSB(mp, offset); -} - STATIC bool xfs_quota_need_throttle( struct xfs_inode *ip, @@ -459,27 +373,76 @@ xfs_quota_calc_throttle( } /* + * If we are doing a write at the end of the file and there are no allocations + * past this one, then extend the allocation out to the file system's write + * iosize. + * * If we don't have a user specified preallocation size, dynamically increase - * the preallocation size as the size of the file grows. Cap the maximum size + * the preallocation size as the size of the file grows. Cap the maximum size * at a single extent or less if the filesystem is near full. The closer the * filesystem is to full, the smaller the maximum prealocation. + * + * As an exception we don't do any preallocation at all if the file is smaller + * than the minimum preallocation and we are using the default dynamic + * preallocation scheme, as it is likely this is the only write to the file that + * is going to be done. + * + * We clean up any extra space left over when the file is closed in + * xfs_inactive(). */ STATIC xfs_fsblock_t xfs_iomap_prealloc_size( - struct xfs_mount *mp, struct xfs_inode *ip, - xfs_off_t offset, - struct xfs_bmbt_irec *imap, - int nimaps) + loff_t offset, + loff_t count, + xfs_extnum_t idx, + struct xfs_bmbt_irec *prev) { - xfs_fsblock_t alloc_blocks = 0; + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); int shift = 0; int64_t freesp; xfs_fsblock_t qblocks; int qshift = 0; + xfs_fsblock_t alloc_blocks = 0; - alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset, - imap, nimaps); + if (offset + count <= XFS_ISIZE(ip)) + return 0; + + if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) && + (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks))) + return 0; + + /* + * If an explicit allocsize is set, the file is small, or we + * are writing behind a hole, then use the minimum prealloc: + */ + if ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) || + XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) || + idx == 0 || + prev->br_startoff + prev->br_blockcount < offset_fsb) + return mp->m_writeio_blocks; + + /* + * Determine the initial size of the preallocation. We are beyond the + * current EOF here, but we need to take into account whether this is + * a sparse write or an extending write when determining the + * preallocation size. Hence we need to look up the extent that ends + * at the current write offset and use the result to determine the + * preallocation size. + * + * If the extent is a hole, then preallocation is essentially disabled. + * Otherwise we take the size of the preceding data extent as the basis + * for the preallocation size. If the size of the extent is greater than + * half the maximum extent length, then use the current offset as the + * basis. This ensures that for large files the preallocation size + * always extends to MAXEXTLEN rather than falling short due to things + * like stripe unit/width alignment of real extents. + */ + if (prev->br_blockcount <= (MAXEXTLEN >> 1)) + alloc_blocks = prev->br_blockcount << 1; + else + alloc_blocks = XFS_B_TO_FSB(mp, offset); if (!alloc_blocks) goto check_writeio; qblocks = alloc_blocks; @@ -550,120 +513,156 @@ xfs_iomap_prealloc_size( */ while (alloc_blocks && alloc_blocks >= freesp) alloc_blocks >>= 4; - check_writeio: if (alloc_blocks < mp->m_writeio_blocks) alloc_blocks = mp->m_writeio_blocks; - trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift, mp->m_writeio_blocks); - return alloc_blocks; } -int -xfs_iomap_write_delay( - xfs_inode_t *ip, - xfs_off_t offset, - size_t count, - xfs_bmbt_irec_t *ret_imap) +static int +xfs_file_iomap_begin_delay( + struct inode *inode, + loff_t offset, + loff_t count, + unsigned flags, + struct iomap *iomap) { - xfs_mount_t *mp = ip->i_mount; - xfs_fileoff_t offset_fsb; - xfs_fileoff_t last_fsb; - xfs_off_t aligned_offset; - xfs_fileoff_t ioalign; - xfs_extlen_t extsz; - int nimaps; - xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; - int prealloc; - int error; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t maxbytes_fsb = + XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); + xfs_fileoff_t end_fsb, orig_end_fsb; + int error = 0, eof = 0; + struct xfs_bmbt_irec got; + struct xfs_bmbt_irec prev; + xfs_extnum_t idx; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(!XFS_IS_REALTIME_INODE(ip)); + ASSERT(!xfs_get_extsz_hint(ip)); - /* - * Make sure that the dquots are there. This doesn't hold - * the ilock across a disk read. - */ - error = xfs_qm_dqattach_locked(ip, 0); - if (error) - return error; + xfs_ilock(ip, XFS_ILOCK_EXCL); - extsz = xfs_get_extsz_hint(ip); - offset_fsb = XFS_B_TO_FSBT(mp, offset); + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + error = -EFSCORRUPTED; + goto out_unlock; + } - error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, - imap, XFS_WRITE_IMAPS, &prealloc); - if (error) - return error; + XFS_STATS_INC(mp, xs_blk_mapw); -retry: - if (prealloc) { - xfs_fsblock_t alloc_blocks; + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + goto out_unlock; + } - alloc_blocks = xfs_iomap_prealloc_size(mp, ip, offset, imap, - XFS_WRITE_IMAPS); + xfs_bmap_search_extents(ip, offset_fsb, XFS_DATA_FORK, &eof, &idx, + &got, &prev); + if (!eof && got.br_startoff <= offset_fsb) { + if (xfs_is_reflink_inode(ip)) { + bool shared; - aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); - ioalign = XFS_B_TO_FSBT(mp, aligned_offset); - last_fsb = ioalign + alloc_blocks; - } else { - last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); - } + end_fsb = min(XFS_B_TO_FSB(mp, offset + count), + maxbytes_fsb); + xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb); + error = xfs_reflink_reserve_cow(ip, &got, &shared); + if (error) + goto out_unlock; + } - if (prealloc || extsz) { - error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); - if (error) - return error; + trace_xfs_iomap_found(ip, offset, count, 0, &got); + goto done; } + error = xfs_qm_dqattach_locked(ip, 0); + if (error) + goto out_unlock; + /* - * Make sure preallocation does not create extents beyond the range we - * actually support in this filesystem. + * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages + * to keep the chunks of work done where somewhat symmetric with the + * work writeback does. This is a completely arbitrary number pulled + * out of thin air as a best guess for initial testing. + * + * Note that the values needs to be less than 32-bits wide until + * the lower level functions are updated. */ - if (last_fsb > XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)) - last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); + count = min_t(loff_t, count, 1024 * PAGE_SIZE); + end_fsb = orig_end_fsb = + min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); + + if (eof) { + xfs_fsblock_t prealloc_blocks; + + prealloc_blocks = + xfs_iomap_prealloc_size(ip, offset, count, idx, &prev); + if (prealloc_blocks) { + xfs_extlen_t align; + xfs_off_t end_offset; - ASSERT(last_fsb > offset_fsb); + end_offset = XFS_WRITEIO_ALIGN(mp, offset + count - 1); + end_fsb = XFS_B_TO_FSBT(mp, end_offset) + + prealloc_blocks; + + align = xfs_eof_alignment(ip, 0); + if (align) + end_fsb = roundup_64(end_fsb, align); + + end_fsb = min(end_fsb, maxbytes_fsb); + ASSERT(end_fsb > offset_fsb); + } + } - nimaps = XFS_WRITE_IMAPS; - error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb, - imap, &nimaps, XFS_BMAPI_ENTIRE); +retry: + error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb, + end_fsb - offset_fsb, &got, + &prev, &idx, eof); switch (error) { case 0: + break; case -ENOSPC: case -EDQUOT: - break; - default: - return error; - } - - /* - * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry - * without EOF preallocation. - */ - if (nimaps == 0) { + /* retry without any preallocation */ trace_xfs_delalloc_enospc(ip, offset, count); - if (prealloc) { - prealloc = 0; - error = 0; + if (end_fsb != orig_end_fsb) { + end_fsb = orig_end_fsb; goto retry; } - return error ? error : -ENOSPC; + /*FALLTHRU*/ + default: + goto out_unlock; } - if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) - return xfs_alert_fsblock_zero(ip, &imap[0]); - /* * Tag the inode as speculatively preallocated so we can reclaim this * space on demand, if necessary. */ - if (prealloc) + if (end_fsb != orig_end_fsb) xfs_inode_set_eofblocks_tag(ip); - *ret_imap = imap[0]; - return 0; + trace_xfs_iomap_alloc(ip, offset, count, 0, &got); +done: + if (isnullstartblock(got.br_startblock)) + got.br_startblock = DELAYSTARTBLOCK; + + if (!got.br_startblock) { + error = xfs_alert_fsblock_zero(ip, &got); + if (error) + goto out_unlock; + } + + xfs_bmbt_to_iomap(ip, iomap, &got); + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; } /* @@ -679,6 +678,7 @@ retry: int xfs_iomap_write_allocate( xfs_inode_t *ip, + int whichfork, xfs_off_t offset, xfs_bmbt_irec_t *imap) { @@ -691,8 +691,12 @@ xfs_iomap_write_allocate( xfs_trans_t *tp; int nimaps; int error = 0; + int flags = 0; int nres; + if (whichfork == XFS_COW_FORK) + flags |= XFS_BMAPI_COWFORK; + /* * Make sure that the dquots are there. */ @@ -786,7 +790,7 @@ xfs_iomap_write_allocate( * pointer that the caller gave to us. */ error = xfs_bmapi_write(tp, ip, map_start_fsb, - count_fsb, 0, &first_block, + count_fsb, flags, &first_block, nres, imap, &nimaps, &dfops); if (error) @@ -947,37 +951,13 @@ error_on_bmapi_transaction: return error; } -void -xfs_bmbt_to_iomap( - struct xfs_inode *ip, - struct iomap *iomap, - struct xfs_bmbt_irec *imap) -{ - struct xfs_mount *mp = ip->i_mount; - - if (imap->br_startblock == HOLESTARTBLOCK) { - iomap->blkno = IOMAP_NULL_BLOCK; - iomap->type = IOMAP_HOLE; - } else if (imap->br_startblock == DELAYSTARTBLOCK) { - iomap->blkno = IOMAP_NULL_BLOCK; - iomap->type = IOMAP_DELALLOC; - } else { - iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock); - if (imap->br_state == XFS_EXT_UNWRITTEN) - iomap->type = IOMAP_UNWRITTEN; - else - iomap->type = IOMAP_MAPPED; - } - iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); - iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); - iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); -} - -static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps) +static inline bool imap_needs_alloc(struct inode *inode, + struct xfs_bmbt_irec *imap, int nimaps) { return !nimaps || imap->br_startblock == HOLESTARTBLOCK || - imap->br_startblock == DELAYSTARTBLOCK; + imap->br_startblock == DELAYSTARTBLOCK || + (IS_DAX(inode) && ISUNWRITTEN(imap)); } static int @@ -993,11 +973,29 @@ xfs_file_iomap_begin( struct xfs_bmbt_irec imap; xfs_fileoff_t offset_fsb, end_fsb; int nimaps = 1, error = 0; + bool shared = false, trimmed = false; + unsigned lockmode; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - xfs_ilock(ip, XFS_ILOCK_EXCL); + if ((flags & IOMAP_WRITE) && !IS_DAX(inode) && + !xfs_get_extsz_hint(ip)) { + /* Reserve delalloc blocks for regular writeback. */ + return xfs_file_iomap_begin_delay(inode, offset, length, flags, + iomap); + } + + /* + * COW writes will allocate delalloc space, so we need to make sure + * to take the lock exclusively here. + */ + if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { + lockmode = XFS_ILOCK_EXCL; + xfs_ilock(ip, XFS_ILOCK_EXCL); + } else { + lockmode = xfs_ilock_data_map_shared(ip); + } ASSERT(offset <= mp->m_super->s_maxbytes); if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes) @@ -1006,13 +1004,28 @@ xfs_file_iomap_begin( end_fsb = XFS_B_TO_FSB(mp, offset + length); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, - &nimaps, XFS_BMAPI_ENTIRE); - if (error) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return error; + &nimaps, 0); + if (error) + goto out_unlock; + + if (flags & IOMAP_REPORT) { + /* Trim the mapping to the nearest shared extent boundary. */ + error = xfs_reflink_trim_around_shared(ip, &imap, &shared, + &trimmed); + if (error) + goto out_unlock; + } + + if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { + error = xfs_reflink_reserve_cow(ip, &imap, &shared); + if (error) + goto out_unlock; + + end_fsb = imap.br_startoff + imap.br_blockcount; + length = XFS_FSB_TO_B(mp, end_fsb) - offset; } - if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) { + if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) { /* * We cap the maximum length we map here to MAX_WRITEBACK_PAGES * pages to keep the chunks of work done where somewhat symmetric @@ -1024,32 +1037,33 @@ xfs_file_iomap_begin( * the lower level functions are updated. */ length = min_t(loff_t, length, 1024 * PAGE_SIZE); - if (xfs_get_extsz_hint(ip)) { - /* - * xfs_iomap_write_direct() expects the shared lock. It - * is unlocked on return. - */ - xfs_ilock_demote(ip, XFS_ILOCK_EXCL); - error = xfs_iomap_write_direct(ip, offset, length, &imap, - nimaps); - } else { - error = xfs_iomap_write_delay(ip, offset, length, &imap); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - + /* + * xfs_iomap_write_direct() expects the shared lock. It + * is unlocked on return. + */ + if (lockmode == XFS_ILOCK_EXCL) + xfs_ilock_demote(ip, lockmode); + error = xfs_iomap_write_direct(ip, offset, length, &imap, + nimaps); if (error) return error; + iomap->flags = IOMAP_F_NEW; trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); } else { ASSERT(nimaps); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); trace_xfs_iomap_found(ip, offset, length, 0, &imap); } xfs_bmbt_to_iomap(ip, iomap, &imap); + if (shared) + iomap->flags |= IOMAP_F_SHARED; return 0; +out_unlock: + xfs_iunlock(ip, lockmode); + return error; } static int diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index fb8aca3..6d45cf0 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -25,14 +25,13 @@ struct xfs_bmbt_irec; int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, struct xfs_bmbt_irec *, int); -int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, - struct xfs_bmbt_irec *); -int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, +int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t, struct xfs_bmbt_irec *); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t); void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, struct xfs_bmbt_irec *); +xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize); extern struct iomap_ops xfs_iomap_ops; extern struct iomap_ops xfs_xattr_iomap_ops; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index b24c310..405a65c 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -542,6 +542,28 @@ xfs_setattr_time( inode->i_mtime = iattr->ia_mtime; } +static int +xfs_vn_change_ok( + struct dentry *dentry, + struct iattr *iattr) +{ + struct xfs_mount *mp = XFS_I(d_inode(dentry))->i_mount; + + if (mp->m_flags & XFS_MOUNT_RDONLY) + return -EROFS; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + return setattr_prepare(dentry, iattr); +} + +/* + * Set non-size attributes of an inode. + * + * Caution: The caller of this function is responsible for calling + * setattr_prepare() or otherwise verifying the change is fine. + */ int xfs_setattr_nonsize( struct xfs_inode *ip, @@ -558,21 +580,6 @@ xfs_setattr_nonsize( struct xfs_dquot *udqp = NULL, *gdqp = NULL; struct xfs_dquot *olddquot1 = NULL, *olddquot2 = NULL; - trace_xfs_setattr(ip); - - /* If acls are being inherited, we already have this checked */ - if (!(flags & XFS_ATTR_NOACL)) { - if (mp->m_flags & XFS_MOUNT_RDONLY) - return -EROFS; - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - - error = inode_change_ok(inode, iattr); - if (error) - return error; - } - ASSERT((mask & ATTR_SIZE) == 0); /* @@ -743,8 +750,27 @@ out_dqrele: return error; } +int +xfs_vn_setattr_nonsize( + struct dentry *dentry, + struct iattr *iattr) +{ + struct xfs_inode *ip = XFS_I(d_inode(dentry)); + int error; + + trace_xfs_setattr(ip); + + error = xfs_vn_change_ok(dentry, iattr); + if (error) + return error; + return xfs_setattr_nonsize(ip, iattr, 0); +} + /* * Truncate file. Must have write permission and not be a directory. + * + * Caution: The caller of this function is responsible for calling + * setattr_prepare() or otherwise verifying the change is fine. */ int xfs_setattr_size( @@ -759,18 +785,6 @@ xfs_setattr_size( uint lock_flags = 0; bool did_zeroing = false; - trace_xfs_setattr(ip); - - if (mp->m_flags & XFS_MOUNT_RDONLY) - return -EROFS; - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - - error = inode_change_ok(inode, iattr); - if (error) - return error; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL)); ASSERT(S_ISREG(inode->i_mode)); @@ -882,7 +896,7 @@ xfs_setattr_size( if (newsize != oldsize && !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) { iattr->ia_ctime = iattr->ia_mtime = - current_fs_time(inode->i_sb); + current_time(inode); iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME; } @@ -942,16 +956,32 @@ out_trans_cancel: goto out_unlock; } +int +xfs_vn_setattr_size( + struct dentry *dentry, + struct iattr *iattr) +{ + struct xfs_inode *ip = XFS_I(d_inode(dentry)); + int error; + + trace_xfs_setattr(ip); + + error = xfs_vn_change_ok(dentry, iattr); + if (error) + return error; + return xfs_setattr_size(ip, iattr); +} + STATIC int xfs_vn_setattr( struct dentry *dentry, struct iattr *iattr) { - struct xfs_inode *ip = XFS_I(d_inode(dentry)); int error; if (iattr->ia_valid & ATTR_SIZE) { - uint iolock = XFS_IOLOCK_EXCL; + struct xfs_inode *ip = XFS_I(d_inode(dentry)); + uint iolock = XFS_IOLOCK_EXCL; xfs_ilock(ip, iolock); error = xfs_break_layouts(d_inode(dentry), &iolock, true); @@ -959,11 +989,11 @@ xfs_vn_setattr( xfs_ilock(ip, XFS_MMAPLOCK_EXCL); iolock |= XFS_MMAPLOCK_EXCL; - error = xfs_setattr_size(ip, iattr); + error = xfs_vn_setattr_size(dentry, iattr); } xfs_iunlock(ip, iolock); } else { - error = xfs_setattr_nonsize(ip, iattr, 0); + error = xfs_vn_setattr_nonsize(dentry, iattr); } return error; @@ -1036,9 +1066,6 @@ static const struct inode_operations xfs_inode_operations = { .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, .fiemap = xfs_vn_fiemap, .update_time = xfs_vn_update_time, @@ -1059,14 +1086,11 @@ static const struct inode_operations xfs_dir_inode_operations = { */ .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, - .rename2 = xfs_vn_rename, + .rename = xfs_vn_rename, .get_acl = xfs_get_acl, .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, .update_time = xfs_vn_update_time, .tmpfile = xfs_vn_tmpfile, @@ -1087,14 +1111,11 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { */ .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, - .rename2 = xfs_vn_rename, + .rename = xfs_vn_rename, .get_acl = xfs_get_acl, .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, .update_time = xfs_vn_update_time, .tmpfile = xfs_vn_tmpfile, @@ -1105,9 +1126,6 @@ static const struct inode_operations xfs_symlink_inode_operations = { .get_link = xfs_vn_get_link, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, .update_time = xfs_vn_update_time, }; @@ -1117,9 +1135,6 @@ static const struct inode_operations xfs_inline_symlink_inode_operations = { .get_link = xfs_vn_get_link_inline, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .removexattr = generic_removexattr, .listxattr = xfs_vn_listxattr, .update_time = xfs_vn_update_time, }; @@ -1144,6 +1159,7 @@ xfs_diflags_to_iflags( inode->i_flags |= S_NOATIME; if (S_ISREG(inode->i_mode) && ip->i_mount->m_sb.sb_blocksize == PAGE_SIZE && + !xfs_is_reflink_inode(ip) && (ip->i_mount->m_flags & XFS_MOUNT_DAX || ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)) inode->i_flags |= S_DAX; diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index a0f84ab..0259a38 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -33,6 +33,7 @@ extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr); extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, int flags); -extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap); +extern int xfs_vn_setattr_nonsize(struct dentry *dentry, struct iattr *vap); +extern int xfs_vn_setattr_size(struct dentry *dentry, struct iattr *vap); #endif /* __XFS_IOPS_H__ */ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index ce73eb3..66e8817 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -66,7 +66,7 @@ xfs_bulkstat_one_int( if (!buffer || xfs_internal_inum(mp, ino)) return -EINVAL; - buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL); + buf = kmem_zalloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL); if (!buf) return -ENOMEM; @@ -111,6 +111,12 @@ xfs_bulkstat_one_int( buf->bs_aextents = dic->di_anextents; buf->bs_forkoff = XFS_IFORK_BOFF(ip); + if (dic->di_version == 3) { + if (dic->di_flags2 & XFS_DIFLAG2_COWEXTSIZE) + buf->bs_cowextsize = dic->di_cowextsize << + mp->m_sb.sb_blocklog; + } + switch (dic->di_format) { case XFS_DINODE_FMT_DEV: buf->bs_rdev = ip->i_df.if_u2.if_rdev; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index b8d64d5..68640fb 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -116,6 +116,7 @@ typedef __u32 xfs_nlink_t; #define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val #define xfs_fstrm_centisecs xfs_params.fstrm_timer.val #define xfs_eofb_secs xfs_params.eofb_timer.val +#define xfs_cowb_secs xfs_params.cowb_timer.val #define current_cpu() (raw_smp_processor_id()) #define current_pid() (current->pid) diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 765f084..2b6eec5 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -413,7 +413,8 @@ struct xlog { /* log record crc error injection factor */ uint32_t l_badcrc_factor; #endif - + /* log recovery lsn tracking (for buffer submission */ + xfs_lsn_t l_recovery_lsn; }; #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index e8638fd..9b3d7c7 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -44,6 +44,9 @@ #include "xfs_error.h" #include "xfs_dir2.h" #include "xfs_rmap_item.h" +#include "xfs_buf_item.h" +#include "xfs_refcount_item.h" +#include "xfs_bmap_item.h" #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) @@ -381,6 +384,15 @@ xlog_recover_iodone( SHUTDOWN_META_IO_ERROR); } } + + /* + * On v5 supers, a bli could be attached to update the metadata LSN. + * Clean it up. + */ + if (bp->b_fspriv) + xfs_buf_item_relse(bp); + ASSERT(bp->b_fspriv == NULL); + bp->b_iodone = NULL; xfs_buf_ioend(bp); } @@ -1914,6 +1926,10 @@ xlog_recover_reorder_trans( case XFS_LI_EFI: case XFS_LI_RUI: case XFS_LI_RUD: + case XFS_LI_CUI: + case XFS_LI_CUD: + case XFS_LI_BUI: + case XFS_LI_BUD: trace_xfs_log_recover_item_reorder_tail(log, trans, item, pass); list_move_tail(&item->ri_list, &inode_list); @@ -2232,6 +2248,7 @@ xlog_recover_get_buf_lsn( case XFS_ABTB_MAGIC: case XFS_ABTC_MAGIC: case XFS_RMAP_CRC_MAGIC: + case XFS_REFC_CRC_MAGIC: case XFS_IBT_CRC_MAGIC: case XFS_IBT_MAGIC: { struct xfs_btree_block *btb = blk; @@ -2360,12 +2377,14 @@ static void xlog_recover_validate_buf_type( struct xfs_mount *mp, struct xfs_buf *bp, - xfs_buf_log_format_t *buf_f) + xfs_buf_log_format_t *buf_f, + xfs_lsn_t current_lsn) { struct xfs_da_blkinfo *info = bp->b_addr; __uint32_t magic32; __uint16_t magic16; __uint16_t magicda; + char *warnmsg = NULL; /* * We can only do post recovery validation on items on CRC enabled @@ -2403,32 +2422,31 @@ xlog_recover_validate_buf_type( case XFS_RMAP_CRC_MAGIC: bp->b_ops = &xfs_rmapbt_buf_ops; break; + case XFS_REFC_CRC_MAGIC: + bp->b_ops = &xfs_refcountbt_buf_ops; + break; default: - xfs_warn(mp, "Bad btree block magic!"); - ASSERT(0); + warnmsg = "Bad btree block magic!"; break; } break; case XFS_BLFT_AGF_BUF: if (magic32 != XFS_AGF_MAGIC) { - xfs_warn(mp, "Bad AGF block magic!"); - ASSERT(0); + warnmsg = "Bad AGF block magic!"; break; } bp->b_ops = &xfs_agf_buf_ops; break; case XFS_BLFT_AGFL_BUF: if (magic32 != XFS_AGFL_MAGIC) { - xfs_warn(mp, "Bad AGFL block magic!"); - ASSERT(0); + warnmsg = "Bad AGFL block magic!"; break; } bp->b_ops = &xfs_agfl_buf_ops; break; case XFS_BLFT_AGI_BUF: if (magic32 != XFS_AGI_MAGIC) { - xfs_warn(mp, "Bad AGI block magic!"); - ASSERT(0); + warnmsg = "Bad AGI block magic!"; break; } bp->b_ops = &xfs_agi_buf_ops; @@ -2438,8 +2456,7 @@ xlog_recover_validate_buf_type( case XFS_BLFT_GDQUOT_BUF: #ifdef CONFIG_XFS_QUOTA if (magic16 != XFS_DQUOT_MAGIC) { - xfs_warn(mp, "Bad DQUOT block magic!"); - ASSERT(0); + warnmsg = "Bad DQUOT block magic!"; break; } bp->b_ops = &xfs_dquot_buf_ops; @@ -2451,16 +2468,14 @@ xlog_recover_validate_buf_type( break; case XFS_BLFT_DINO_BUF: if (magic16 != XFS_DINODE_MAGIC) { - xfs_warn(mp, "Bad INODE block magic!"); - ASSERT(0); + warnmsg = "Bad INODE block magic!"; break; } bp->b_ops = &xfs_inode_buf_ops; break; case XFS_BLFT_SYMLINK_BUF: if (magic32 != XFS_SYMLINK_MAGIC) { - xfs_warn(mp, "Bad symlink block magic!"); - ASSERT(0); + warnmsg = "Bad symlink block magic!"; break; } bp->b_ops = &xfs_symlink_buf_ops; @@ -2468,8 +2483,7 @@ xlog_recover_validate_buf_type( case XFS_BLFT_DIR_BLOCK_BUF: if (magic32 != XFS_DIR2_BLOCK_MAGIC && magic32 != XFS_DIR3_BLOCK_MAGIC) { - xfs_warn(mp, "Bad dir block magic!"); - ASSERT(0); + warnmsg = "Bad dir block magic!"; break; } bp->b_ops = &xfs_dir3_block_buf_ops; @@ -2477,8 +2491,7 @@ xlog_recover_validate_buf_type( case XFS_BLFT_DIR_DATA_BUF: if (magic32 != XFS_DIR2_DATA_MAGIC && magic32 != XFS_DIR3_DATA_MAGIC) { - xfs_warn(mp, "Bad dir data magic!"); - ASSERT(0); + warnmsg = "Bad dir data magic!"; break; } bp->b_ops = &xfs_dir3_data_buf_ops; @@ -2486,8 +2499,7 @@ xlog_recover_validate_buf_type( case XFS_BLFT_DIR_FREE_BUF: if (magic32 != XFS_DIR2_FREE_MAGIC && magic32 != XFS_DIR3_FREE_MAGIC) { - xfs_warn(mp, "Bad dir3 free magic!"); - ASSERT(0); + warnmsg = "Bad dir3 free magic!"; break; } bp->b_ops = &xfs_dir3_free_buf_ops; @@ -2495,8 +2507,7 @@ xlog_recover_validate_buf_type( case XFS_BLFT_DIR_LEAF1_BUF: if (magicda != XFS_DIR2_LEAF1_MAGIC && magicda != XFS_DIR3_LEAF1_MAGIC) { - xfs_warn(mp, "Bad dir leaf1 magic!"); - ASSERT(0); + warnmsg = "Bad dir leaf1 magic!"; break; } bp->b_ops = &xfs_dir3_leaf1_buf_ops; @@ -2504,8 +2515,7 @@ xlog_recover_validate_buf_type( case XFS_BLFT_DIR_LEAFN_BUF: if (magicda != XFS_DIR2_LEAFN_MAGIC && magicda != XFS_DIR3_LEAFN_MAGIC) { - xfs_warn(mp, "Bad dir leafn magic!"); - ASSERT(0); + warnmsg = "Bad dir leafn magic!"; break; } bp->b_ops = &xfs_dir3_leafn_buf_ops; @@ -2513,8 +2523,7 @@ xlog_recover_validate_buf_type( case XFS_BLFT_DA_NODE_BUF: if (magicda != XFS_DA_NODE_MAGIC && magicda != XFS_DA3_NODE_MAGIC) { - xfs_warn(mp, "Bad da node magic!"); - ASSERT(0); + warnmsg = "Bad da node magic!"; break; } bp->b_ops = &xfs_da3_node_buf_ops; @@ -2522,24 +2531,21 @@ xlog_recover_validate_buf_type( case XFS_BLFT_ATTR_LEAF_BUF: if (magicda != XFS_ATTR_LEAF_MAGIC && magicda != XFS_ATTR3_LEAF_MAGIC) { - xfs_warn(mp, "Bad attr leaf magic!"); - ASSERT(0); + warnmsg = "Bad attr leaf magic!"; break; } bp->b_ops = &xfs_attr3_leaf_buf_ops; break; case XFS_BLFT_ATTR_RMT_BUF: if (magic32 != XFS_ATTR3_RMT_MAGIC) { - xfs_warn(mp, "Bad attr remote magic!"); - ASSERT(0); + warnmsg = "Bad attr remote magic!"; break; } bp->b_ops = &xfs_attr3_rmt_buf_ops; break; case XFS_BLFT_SB_BUF: if (magic32 != XFS_SB_MAGIC) { - xfs_warn(mp, "Bad SB block magic!"); - ASSERT(0); + warnmsg = "Bad SB block magic!"; break; } bp->b_ops = &xfs_sb_buf_ops; @@ -2556,6 +2562,40 @@ xlog_recover_validate_buf_type( xfs_blft_from_flags(buf_f)); break; } + + /* + * Nothing else to do in the case of a NULL current LSN as this means + * the buffer is more recent than the change in the log and will be + * skipped. + */ + if (current_lsn == NULLCOMMITLSN) + return; + + if (warnmsg) { + xfs_warn(mp, warnmsg); + ASSERT(0); + } + + /* + * We must update the metadata LSN of the buffer as it is written out to + * ensure that older transactions never replay over this one and corrupt + * the buffer. This can occur if log recovery is interrupted at some + * point after the current transaction completes, at which point a + * subsequent mount starts recovery from the beginning. + * + * Write verifiers update the metadata LSN from log items attached to + * the buffer. Therefore, initialize a bli purely to carry the LSN to + * the verifier. We'll clean it up in our ->iodone() callback. + */ + if (bp->b_ops) { + struct xfs_buf_log_item *bip; + + ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone); + bp->b_iodone = xlog_recover_iodone; + xfs_buf_item_init(bp, mp); + bip = bp->b_fspriv; + bip->bli_item.li_lsn = current_lsn; + } } /* @@ -2569,7 +2609,8 @@ xlog_recover_do_reg_buffer( struct xfs_mount *mp, xlog_recover_item_t *item, struct xfs_buf *bp, - xfs_buf_log_format_t *buf_f) + xfs_buf_log_format_t *buf_f, + xfs_lsn_t current_lsn) { int i; int bit; @@ -2642,7 +2683,7 @@ xlog_recover_do_reg_buffer( /* Shouldn't be any more regions */ ASSERT(i == item->ri_total); - xlog_recover_validate_buf_type(mp, bp, buf_f); + xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn); } /* @@ -2685,7 +2726,7 @@ xlog_recover_do_dquot_buffer( if (log->l_quotaoffs_flag & type) return false; - xlog_recover_do_reg_buffer(mp, item, bp, buf_f); + xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN); return true; } @@ -2773,7 +2814,8 @@ xlog_recover_buffer_pass2( */ lsn = xlog_recover_get_buf_lsn(mp, bp); if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { - xlog_recover_validate_buf_type(mp, bp, buf_f); + trace_xfs_log_recover_buf_skip(log, buf_f); + xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN); goto out_release; } @@ -2789,7 +2831,7 @@ xlog_recover_buffer_pass2( if (!dirty) goto out_release; } else { - xlog_recover_do_reg_buffer(mp, item, bp, buf_f); + xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); } /* @@ -3515,6 +3557,242 @@ xlog_recover_rud_pass2( } /* + * Copy an CUI format buffer from the given buf, and into the destination + * CUI format structure. The CUI/CUD items were designed not to need any + * special alignment handling. + */ +static int +xfs_cui_copy_format( + struct xfs_log_iovec *buf, + struct xfs_cui_log_format *dst_cui_fmt) +{ + struct xfs_cui_log_format *src_cui_fmt; + uint len; + + src_cui_fmt = buf->i_addr; + len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents); + + if (buf->i_len == len) { + memcpy(dst_cui_fmt, src_cui_fmt, len); + return 0; + } + return -EFSCORRUPTED; +} + +/* + * This routine is called to create an in-core extent refcount update + * item from the cui format structure which was logged on disk. + * It allocates an in-core cui, copies the extents from the format + * structure into it, and adds the cui to the AIL with the given + * LSN. + */ +STATIC int +xlog_recover_cui_pass2( + struct xlog *log, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + int error; + struct xfs_mount *mp = log->l_mp; + struct xfs_cui_log_item *cuip; + struct xfs_cui_log_format *cui_formatp; + + cui_formatp = item->ri_buf[0].i_addr; + + cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); + error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format); + if (error) { + xfs_cui_item_free(cuip); + return error; + } + atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); + + spin_lock(&log->l_ailp->xa_lock); + /* + * The CUI has two references. One for the CUD and one for CUI to ensure + * it makes it into the AIL. Insert the CUI into the AIL directly and + * drop the CUI reference. Note that xfs_trans_ail_update() drops the + * AIL lock. + */ + xfs_trans_ail_update(log->l_ailp, &cuip->cui_item, lsn); + xfs_cui_release(cuip); + return 0; +} + + +/* + * This routine is called when an CUD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding CUI if it + * was still in the log. To do this it searches the AIL for the CUI with an id + * equal to that in the CUD format structure. If we find it we drop the CUD + * reference, which removes the CUI from the AIL and frees it. + */ +STATIC int +xlog_recover_cud_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_cud_log_format *cud_formatp; + struct xfs_cui_log_item *cuip = NULL; + struct xfs_log_item *lip; + __uint64_t cui_id; + struct xfs_ail_cursor cur; + struct xfs_ail *ailp = log->l_ailp; + + cud_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) + return -EFSCORRUPTED; + cui_id = cud_formatp->cud_cui_id; + + /* + * Search for the CUI with the id in the CUD format structure in the + * AIL. + */ + spin_lock(&ailp->xa_lock); + lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + while (lip != NULL) { + if (lip->li_type == XFS_LI_CUI) { + cuip = (struct xfs_cui_log_item *)lip; + if (cuip->cui_format.cui_id == cui_id) { + /* + * Drop the CUD reference to the CUI. This + * removes the CUI from the AIL and frees it. + */ + spin_unlock(&ailp->xa_lock); + xfs_cui_release(cuip); + spin_lock(&ailp->xa_lock); + break; + } + } + lip = xfs_trans_ail_cursor_next(ailp, &cur); + } + + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->xa_lock); + + return 0; +} + +/* + * Copy an BUI format buffer from the given buf, and into the destination + * BUI format structure. The BUI/BUD items were designed not to need any + * special alignment handling. + */ +static int +xfs_bui_copy_format( + struct xfs_log_iovec *buf, + struct xfs_bui_log_format *dst_bui_fmt) +{ + struct xfs_bui_log_format *src_bui_fmt; + uint len; + + src_bui_fmt = buf->i_addr; + len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents); + + if (buf->i_len == len) { + memcpy(dst_bui_fmt, src_bui_fmt, len); + return 0; + } + return -EFSCORRUPTED; +} + +/* + * This routine is called to create an in-core extent bmap update + * item from the bui format structure which was logged on disk. + * It allocates an in-core bui, copies the extents from the format + * structure into it, and adds the bui to the AIL with the given + * LSN. + */ +STATIC int +xlog_recover_bui_pass2( + struct xlog *log, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + int error; + struct xfs_mount *mp = log->l_mp; + struct xfs_bui_log_item *buip; + struct xfs_bui_log_format *bui_formatp; + + bui_formatp = item->ri_buf[0].i_addr; + + if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) + return -EFSCORRUPTED; + buip = xfs_bui_init(mp); + error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format); + if (error) { + xfs_bui_item_free(buip); + return error; + } + atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents); + + spin_lock(&log->l_ailp->xa_lock); + /* + * The RUI has two references. One for the RUD and one for RUI to ensure + * it makes it into the AIL. Insert the RUI into the AIL directly and + * drop the RUI reference. Note that xfs_trans_ail_update() drops the + * AIL lock. + */ + xfs_trans_ail_update(log->l_ailp, &buip->bui_item, lsn); + xfs_bui_release(buip); + return 0; +} + + +/* + * This routine is called when an BUD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding BUI if it + * was still in the log. To do this it searches the AIL for the BUI with an id + * equal to that in the BUD format structure. If we find it we drop the BUD + * reference, which removes the BUI from the AIL and frees it. + */ +STATIC int +xlog_recover_bud_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_bud_log_format *bud_formatp; + struct xfs_bui_log_item *buip = NULL; + struct xfs_log_item *lip; + __uint64_t bui_id; + struct xfs_ail_cursor cur; + struct xfs_ail *ailp = log->l_ailp; + + bud_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) + return -EFSCORRUPTED; + bui_id = bud_formatp->bud_bui_id; + + /* + * Search for the BUI with the id in the BUD format structure in the + * AIL. + */ + spin_lock(&ailp->xa_lock); + lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + while (lip != NULL) { + if (lip->li_type == XFS_LI_BUI) { + buip = (struct xfs_bui_log_item *)lip; + if (buip->bui_format.bui_id == bui_id) { + /* + * Drop the BUD reference to the BUI. This + * removes the BUI from the AIL and frees it. + */ + spin_unlock(&ailp->xa_lock); + xfs_bui_release(buip); + spin_lock(&ailp->xa_lock); + break; + } + } + lip = xfs_trans_ail_cursor_next(ailp, &cur); + } + + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->xa_lock); + + return 0; +} + +/* * This routine is called when an inode create format structure is found in a * committed transaction in the log. It's purpose is to initialise the inodes * being allocated on disk. This requires us to get inode cluster buffers that @@ -3741,6 +4019,10 @@ xlog_recover_ra_pass2( case XFS_LI_QUOTAOFF: case XFS_LI_RUI: case XFS_LI_RUD: + case XFS_LI_CUI: + case XFS_LI_CUD: + case XFS_LI_BUI: + case XFS_LI_BUD: default: break; } @@ -3766,6 +4048,10 @@ xlog_recover_commit_pass1( case XFS_LI_ICREATE: case XFS_LI_RUI: case XFS_LI_RUD: + case XFS_LI_CUI: + case XFS_LI_CUD: + case XFS_LI_BUI: + case XFS_LI_BUD: /* nothing to do in pass 1 */ return 0; default: @@ -3800,6 +4086,14 @@ xlog_recover_commit_pass2( return xlog_recover_rui_pass2(log, item, trans->r_lsn); case XFS_LI_RUD: return xlog_recover_rud_pass2(log, item); + case XFS_LI_CUI: + return xlog_recover_cui_pass2(log, item, trans->r_lsn); + case XFS_LI_CUD: + return xlog_recover_cud_pass2(log, item); + case XFS_LI_BUI: + return xlog_recover_bui_pass2(log, item, trans->r_lsn); + case XFS_LI_BUD: + return xlog_recover_bud_pass2(log, item); case XFS_LI_DQUOT: return xlog_recover_dquot_pass2(log, buffer_list, item, trans->r_lsn); @@ -3846,14 +4140,13 @@ STATIC int xlog_recover_commit_trans( struct xlog *log, struct xlog_recover *trans, - int pass) + int pass, + struct list_head *buffer_list) { int error = 0; - int error2; int items_queued = 0; struct xlog_recover_item *item; struct xlog_recover_item *next; - LIST_HEAD (buffer_list); LIST_HEAD (ra_list); LIST_HEAD (done_list); @@ -3876,7 +4169,7 @@ xlog_recover_commit_trans( items_queued++; if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) { error = xlog_recover_items_pass2(log, trans, - &buffer_list, &ra_list); + buffer_list, &ra_list); list_splice_tail_init(&ra_list, &done_list); items_queued = 0; } @@ -3894,15 +4187,14 @@ out: if (!list_empty(&ra_list)) { if (!error) error = xlog_recover_items_pass2(log, trans, - &buffer_list, &ra_list); + buffer_list, &ra_list); list_splice_tail_init(&ra_list, &done_list); } if (!list_empty(&done_list)) list_splice_init(&done_list, &trans->r_itemq); - error2 = xfs_buf_delwri_submit(&buffer_list); - return error ? error : error2; + return error; } STATIC void @@ -4085,7 +4377,8 @@ xlog_recovery_process_trans( char *dp, unsigned int len, unsigned int flags, - int pass) + int pass, + struct list_head *buffer_list) { int error = 0; bool freeit = false; @@ -4109,7 +4402,8 @@ xlog_recovery_process_trans( error = xlog_recover_add_to_cont_trans(log, trans, dp, len); break; case XLOG_COMMIT_TRANS: - error = xlog_recover_commit_trans(log, trans, pass); + error = xlog_recover_commit_trans(log, trans, pass, + buffer_list); /* success or fail, we are now done with this transaction. */ freeit = true; break; @@ -4191,10 +4485,12 @@ xlog_recover_process_ophdr( struct xlog_op_header *ohead, char *dp, char *end, - int pass) + int pass, + struct list_head *buffer_list) { struct xlog_recover *trans; unsigned int len; + int error; /* Do we understand who wrote this op? */ if (ohead->oh_clientid != XFS_TRANSACTION && @@ -4221,8 +4517,39 @@ xlog_recover_process_ophdr( return 0; } + /* + * The recovered buffer queue is drained only once we know that all + * recovery items for the current LSN have been processed. This is + * required because: + * + * - Buffer write submission updates the metadata LSN of the buffer. + * - Log recovery skips items with a metadata LSN >= the current LSN of + * the recovery item. + * - Separate recovery items against the same metadata buffer can share + * a current LSN. I.e., consider that the LSN of a recovery item is + * defined as the starting LSN of the first record in which its + * transaction appears, that a record can hold multiple transactions, + * and/or that a transaction can span multiple records. + * + * In other words, we are allowed to submit a buffer from log recovery + * once per current LSN. Otherwise, we may incorrectly skip recovery + * items and cause corruption. + * + * We don't know up front whether buffers are updated multiple times per + * LSN. Therefore, track the current LSN of each commit log record as it + * is processed and drain the queue when it changes. Use commit records + * because they are ordered correctly by the logging code. + */ + if (log->l_recovery_lsn != trans->r_lsn && + ohead->oh_flags & XLOG_COMMIT_TRANS) { + error = xfs_buf_delwri_submit(buffer_list); + if (error) + return error; + log->l_recovery_lsn = trans->r_lsn; + } + return xlog_recovery_process_trans(log, trans, dp, len, - ohead->oh_flags, pass); + ohead->oh_flags, pass, buffer_list); } /* @@ -4240,7 +4567,8 @@ xlog_recover_process_data( struct hlist_head rhash[], struct xlog_rec_header *rhead, char *dp, - int pass) + int pass, + struct list_head *buffer_list) { struct xlog_op_header *ohead; char *end; @@ -4254,6 +4582,7 @@ xlog_recover_process_data( if (xlog_header_check_recover(log->l_mp, rhead)) return -EIO; + trace_xfs_log_recover_record(log, rhead, pass); while ((dp < end) && num_logops) { ohead = (struct xlog_op_header *)dp; @@ -4262,7 +4591,7 @@ xlog_recover_process_data( /* errors will abort recovery */ error = xlog_recover_process_ophdr(log, rhash, rhead, ohead, - dp, end, pass); + dp, end, pass, buffer_list); if (error) return error; @@ -4352,12 +4681,94 @@ xlog_recover_cancel_rui( spin_lock(&ailp->xa_lock); } +/* Recover the CUI if necessary. */ +STATIC int +xlog_recover_process_cui( + struct xfs_mount *mp, + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_cui_log_item *cuip; + int error; + + /* + * Skip CUIs that we've already processed. + */ + cuip = container_of(lip, struct xfs_cui_log_item, cui_item); + if (test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags)) + return 0; + + spin_unlock(&ailp->xa_lock); + error = xfs_cui_recover(mp, cuip); + spin_lock(&ailp->xa_lock); + + return error; +} + +/* Release the CUI since we're cancelling everything. */ +STATIC void +xlog_recover_cancel_cui( + struct xfs_mount *mp, + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_cui_log_item *cuip; + + cuip = container_of(lip, struct xfs_cui_log_item, cui_item); + + spin_unlock(&ailp->xa_lock); + xfs_cui_release(cuip); + spin_lock(&ailp->xa_lock); +} + +/* Recover the BUI if necessary. */ +STATIC int +xlog_recover_process_bui( + struct xfs_mount *mp, + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_bui_log_item *buip; + int error; + + /* + * Skip BUIs that we've already processed. + */ + buip = container_of(lip, struct xfs_bui_log_item, bui_item); + if (test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)) + return 0; + + spin_unlock(&ailp->xa_lock); + error = xfs_bui_recover(mp, buip); + spin_lock(&ailp->xa_lock); + + return error; +} + +/* Release the BUI since we're cancelling everything. */ +STATIC void +xlog_recover_cancel_bui( + struct xfs_mount *mp, + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_bui_log_item *buip; + + buip = container_of(lip, struct xfs_bui_log_item, bui_item); + + spin_unlock(&ailp->xa_lock); + xfs_bui_release(buip); + spin_lock(&ailp->xa_lock); +} + /* Is this log item a deferred action intent? */ static inline bool xlog_item_is_intent(struct xfs_log_item *lip) { switch (lip->li_type) { case XFS_LI_EFI: case XFS_LI_RUI: + case XFS_LI_CUI: + case XFS_LI_BUI: return true; default: return false; @@ -4421,6 +4832,12 @@ xlog_recover_process_intents( case XFS_LI_RUI: error = xlog_recover_process_rui(log->l_mp, ailp, lip); break; + case XFS_LI_CUI: + error = xlog_recover_process_cui(log->l_mp, ailp, lip); + break; + case XFS_LI_BUI: + error = xlog_recover_process_bui(log->l_mp, ailp, lip); + break; } if (error) goto out; @@ -4468,6 +4885,12 @@ xlog_recover_cancel_intents( case XFS_LI_RUI: xlog_recover_cancel_rui(log->l_mp, ailp, lip); break; + case XFS_LI_CUI: + xlog_recover_cancel_cui(log->l_mp, ailp, lip); + break; + case XFS_LI_BUI: + xlog_recover_cancel_bui(log->l_mp, ailp, lip); + break; } lip = xfs_trans_ail_cursor_next(ailp, &cur); @@ -4546,6 +4969,7 @@ xlog_recover_process_one_iunlink( if (error) goto fail_iput; + xfs_iflags_clear(ip, XFS_IRECOVERY); ASSERT(VFS_I(ip)->i_nlink == 0); ASSERT(VFS_I(ip)->i_mode != 0); @@ -4685,7 +5109,8 @@ xlog_recover_process( struct hlist_head rhash[], struct xlog_rec_header *rhead, char *dp, - int pass) + int pass, + struct list_head *buffer_list) { int error; __le32 crc; @@ -4732,7 +5157,8 @@ xlog_recover_process( if (error) return error; - return xlog_recover_process_data(log, rhash, rhead, dp, pass); + return xlog_recover_process_data(log, rhash, rhead, dp, pass, + buffer_list); } STATIC int @@ -4793,9 +5219,11 @@ xlog_do_recovery_pass( char *offset; xfs_buf_t *hbp, *dbp; int error = 0, h_size, h_len; + int error2 = 0; int bblks, split_bblks; int hblks, split_hblks, wrapped_hblks; struct hlist_head rhash[XLOG_RHASH_SIZE]; + LIST_HEAD (buffer_list); ASSERT(head_blk != tail_blk); rhead_blk = 0; @@ -4981,7 +5409,7 @@ xlog_do_recovery_pass( } error = xlog_recover_process(log, rhash, rhead, offset, - pass); + pass, &buffer_list); if (error) goto bread_err2; @@ -5012,7 +5440,8 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - error = xlog_recover_process(log, rhash, rhead, offset, pass); + error = xlog_recover_process(log, rhash, rhead, offset, pass, + &buffer_list); if (error) goto bread_err2; @@ -5025,10 +5454,17 @@ xlog_do_recovery_pass( bread_err1: xlog_put_bp(hbp); + /* + * Submit buffers that have been added from the last record processed, + * regardless of error status. + */ + if (!list_empty(&buffer_list)) + error2 = xfs_buf_delwri_submit(&buffer_list); + if (error && first_bad) *first_bad = rhead_blk; - return error; + return error ? error : error2; } /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index faeead6..b341f10 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -43,6 +43,8 @@ #include "xfs_icache.h" #include "xfs_sysfs.h" #include "xfs_rmap_btree.h" +#include "xfs_refcount_btree.h" +#include "xfs_reflink.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); @@ -684,6 +686,7 @@ xfs_mountfs( xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK); xfs_ialloc_compute_maxlevels(mp); xfs_rmapbt_compute_maxlevels(mp); + xfs_refcountbt_compute_maxlevels(mp); xfs_set_maxicount(mp); @@ -923,6 +926,15 @@ xfs_mountfs( } /* + * During the second phase of log recovery, we need iget and + * iput to behave like they do for an active filesystem. + * xfs_fs_drop_inode needs to be able to prevent the deletion + * of inodes before we're done replaying log items on those + * inodes. + */ + mp->m_super->s_flags |= MS_ACTIVE; + + /* * Finish recovering the file system. This part needed to be delayed * until after the root and real-time bitmap inodes were consistently * read in. @@ -934,6 +946,20 @@ xfs_mountfs( } /* + * Now the log is fully replayed, we can transition to full read-only + * mode for read-only mounts. This will sync all the metadata and clean + * the log so that the recovery we just performed does not have to be + * replayed again on the next mount. + * + * We use the same quiesce mechanism as the rw->ro remount, as they are + * semantically identical operations. + */ + if ((mp->m_flags & (XFS_MOUNT_RDONLY|XFS_MOUNT_NORECOVERY)) == + XFS_MOUNT_RDONLY) { + xfs_quiesce_attr(mp); + } + + /* * Complete the quota initialisation, post-log-replay component. */ if (quotamount) { @@ -960,11 +986,30 @@ xfs_mountfs( if (error) xfs_warn(mp, "Unable to allocate reserve blocks. Continuing without reserve pool."); + + /* Recover any CoW blocks that never got remapped. */ + error = xfs_reflink_recover_cow(mp); + if (error) { + xfs_err(mp, + "Error %d recovering leftover CoW allocations.", error); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + goto out_quota; + } + + /* Reserve AG blocks for future btree expansion. */ + error = xfs_fs_reserve_ag_blocks(mp); + if (error && error != -ENOSPC) + goto out_agresv; } return 0; + out_agresv: + xfs_fs_unreserve_ag_blocks(mp); + out_quota: + xfs_qm_unmount_quotas(mp); out_rtunmount: + mp->m_super->s_flags &= ~MS_ACTIVE; xfs_rtunmount_inodes(mp); out_rele_rip: IRELE(rip); @@ -1005,7 +1050,9 @@ xfs_unmountfs( int error; cancel_delayed_work_sync(&mp->m_eofblocks_work); + cancel_delayed_work_sync(&mp->m_cowblocks_work); + xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); xfs_rtunmount_inodes(mp); IRELE(mp->m_rootip); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index b36676c..819b80b 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -57,10 +57,16 @@ enum { #define XFS_ERR_RETRY_FOREVER -1 +/* + * Although retry_timeout is in jiffies which is normally an unsigned long, + * we limit the retry timeout to 86400 seconds, or one day. So even a + * signed 32-bit long is sufficient for a HZ value up to 24855. Making it + * signed lets us store the special "-1" value, meaning retry forever. + */ struct xfs_error_cfg { struct xfs_kobj kobj; int max_retries; - unsigned long retry_timeout; /* in jiffies, 0 = no timeout */ + long retry_timeout; /* in jiffies, -1 = infinite */ }; typedef struct xfs_mount { @@ -118,10 +124,13 @@ typedef struct xfs_mount { uint m_inobt_mnr[2]; /* min inobt btree records */ uint m_rmap_mxr[2]; /* max rmap btree records */ uint m_rmap_mnr[2]; /* min rmap btree records */ + uint m_refc_mxr[2]; /* max refc btree records */ + uint m_refc_mnr[2]; /* min refc btree records */ uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ uint m_in_maxlevels; /* max inobt btree levels. */ uint m_rmap_maxlevels; /* max rmap btree levels */ + uint m_refc_maxlevels; /* max refcount btree level */ xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */ uint m_alloc_set_aside; /* space we can't use */ uint m_ag_max_usable; /* max space per AG */ @@ -155,6 +164,8 @@ typedef struct xfs_mount { struct delayed_work m_reclaim_work; /* background inode reclaim */ struct delayed_work m_eofblocks_work; /* background eof blocks trimming */ + struct delayed_work m_cowblocks_work; /* background cow blocks + trimming */ bool m_update_sb; /* sb needs update in mount */ int64_t m_low_space[XFS_LOWSP_MAX]; /* low free space thresholds */ @@ -325,6 +336,22 @@ xfs_mp_fail_writes(struct xfs_mount *mp) } #endif +/* per-AG block reservation data structures*/ +enum xfs_ag_resv_type { + XFS_AG_RESV_NONE = 0, + XFS_AG_RESV_METADATA, + XFS_AG_RESV_AGFL, +}; + +struct xfs_ag_resv { + /* number of blocks originally reserved here */ + xfs_extlen_t ar_orig_reserved; + /* number of blocks reserved here */ + xfs_extlen_t ar_reserved; + /* number of blocks originally asked for */ + xfs_extlen_t ar_asked; +}; + /* * Per-ag incore structure, copies of information in agf and agi, to improve the * performance of allocation group selection. @@ -372,8 +399,31 @@ typedef struct xfs_perag { /* for rcu-safe freeing */ struct rcu_head rcu_head; int pagb_count; /* pagb slots in use */ + + /* Blocks reserved for all kinds of metadata. */ + struct xfs_ag_resv pag_meta_resv; + /* Blocks reserved for just AGFL-based metadata. */ + struct xfs_ag_resv pag_agfl_resv; + + /* reference count */ + __uint8_t pagf_refcount_level; } xfs_perag_t; +static inline struct xfs_ag_resv * +xfs_perag_resv( + struct xfs_perag *pag, + enum xfs_ag_resv_type type) +{ + switch (type) { + case XFS_AG_RESV_METADATA: + return &pag->pag_meta_resv; + case XFS_AG_RESV_AGFL: + return &pag->pag_agfl_resv; + default: + return NULL; + } +} + extern void xfs_uuid_table_free(void); extern int xfs_log_sbcount(xfs_mount_t *); extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 69e2986..0c381d7 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -49,6 +49,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_dsymlink_hdr, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_key, 4); XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_rec, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_key, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_rec, 12); XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_key, 20); XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_rec, 24); XFS_CHECK_STRUCT_SIZE(struct xfs_timestamp, 8); @@ -56,6 +58,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_inobt_ptr_t, 4); + XFS_CHECK_STRUCT_SIZE(xfs_refcount_ptr_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_rmap_ptr_t, 4); /* dir/attr trees */ diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 0f14b2e..93a7aaf 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -114,6 +114,13 @@ xfs_fs_map_blocks( return -ENXIO; /* + * The pNFS block layout spec actually supports reflink like + * functionality, but the Linux pNFS server doesn't implement it yet. + */ + if (xfs_is_reflink_inode(ip)) + return -ENXIO; + + /* * Lock out any other I/O before we flush and invalidate the pagecache, * and then hand out a layout to the remote system. This is very * similar to direct I/O, except that the synchronization is much more diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c new file mode 100644 index 0000000..fe86a66 --- /dev/null +++ b/fs/xfs/xfs_refcount_item.c @@ -0,0 +1,539 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_buf_item.h" +#include "xfs_refcount_item.h" +#include "xfs_log.h" +#include "xfs_refcount.h" + + +kmem_zone_t *xfs_cui_zone; +kmem_zone_t *xfs_cud_zone; + +static inline struct xfs_cui_log_item *CUI_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_cui_log_item, cui_item); +} + +void +xfs_cui_item_free( + struct xfs_cui_log_item *cuip) +{ + if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS) + kmem_free(cuip); + else + kmem_zone_free(xfs_cui_zone, cuip); +} + +STATIC void +xfs_cui_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + struct xfs_cui_log_item *cuip = CUI_ITEM(lip); + + *nvecs += 1; + *nbytes += xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given cui log item. We use only 1 iovec, and we point that + * at the cui_log_format structure embedded in the cui item. + * It is at this point that we assert that all of the extent + * slots in the cui item have been filled. + */ +STATIC void +xfs_cui_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_cui_log_item *cuip = CUI_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + ASSERT(atomic_read(&cuip->cui_next_extent) == + cuip->cui_format.cui_nextents); + + cuip->cui_format.cui_type = XFS_LI_CUI; + cuip->cui_format.cui_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUI_FORMAT, &cuip->cui_format, + xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents)); +} + +/* + * Pinning has no meaning for an cui item, so just return. + */ +STATIC void +xfs_cui_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * The unpin operation is the last place an CUI is manipulated in the log. It is + * either inserted in the AIL or aborted in the event of a log I/O error. In + * either case, the CUI transaction has been successfully committed to make it + * this far. Therefore, we expect whoever committed the CUI to either construct + * and commit the CUD or drop the CUD's reference in the event of error. Simply + * drop the log's CUI reference now that the log is done with it. + */ +STATIC void +xfs_cui_item_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_cui_log_item *cuip = CUI_ITEM(lip); + + xfs_cui_release(cuip); +} + +/* + * CUI items have no locking or pushing. However, since CUIs are pulled from + * the AIL when their corresponding CUDs are committed to disk, their situation + * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller + * will eventually flush the log. This should help in getting the CUI out of + * the AIL. + */ +STATIC uint +xfs_cui_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + return XFS_ITEM_PINNED; +} + +/* + * The CUI has been either committed or aborted if the transaction has been + * cancelled. If the transaction was cancelled, an CUD isn't going to be + * constructed and thus we free the CUI here directly. + */ +STATIC void +xfs_cui_item_unlock( + struct xfs_log_item *lip) +{ + if (lip->li_flags & XFS_LI_ABORTED) + xfs_cui_item_free(CUI_ITEM(lip)); +} + +/* + * The CUI is logged only once and cannot be moved in the log, so simply return + * the lsn at which it's been logged. + */ +STATIC xfs_lsn_t +xfs_cui_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + return lsn; +} + +/* + * The CUI dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_cui_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all cui log items. + */ +static const struct xfs_item_ops xfs_cui_item_ops = { + .iop_size = xfs_cui_item_size, + .iop_format = xfs_cui_item_format, + .iop_pin = xfs_cui_item_pin, + .iop_unpin = xfs_cui_item_unpin, + .iop_unlock = xfs_cui_item_unlock, + .iop_committed = xfs_cui_item_committed, + .iop_push = xfs_cui_item_push, + .iop_committing = xfs_cui_item_committing, +}; + +/* + * Allocate and initialize an cui item with the given number of extents. + */ +struct xfs_cui_log_item * +xfs_cui_init( + struct xfs_mount *mp, + uint nextents) + +{ + struct xfs_cui_log_item *cuip; + + ASSERT(nextents > 0); + if (nextents > XFS_CUI_MAX_FAST_EXTENTS) + cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents), + KM_SLEEP); + else + cuip = kmem_zone_zalloc(xfs_cui_zone, KM_SLEEP); + + xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops); + cuip->cui_format.cui_nextents = nextents; + cuip->cui_format.cui_id = (uintptr_t)(void *)cuip; + atomic_set(&cuip->cui_next_extent, 0); + atomic_set(&cuip->cui_refcount, 2); + + return cuip; +} + +/* + * Freeing the CUI requires that we remove it from the AIL if it has already + * been placed there. However, the CUI may not yet have been placed in the AIL + * when called by xfs_cui_release() from CUD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the CUI. + */ +void +xfs_cui_release( + struct xfs_cui_log_item *cuip) +{ + if (atomic_dec_and_test(&cuip->cui_refcount)) { + xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_cui_item_free(cuip); + } +} + +static inline struct xfs_cud_log_item *CUD_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_cud_log_item, cud_item); +} + +STATIC void +xfs_cud_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += sizeof(struct xfs_cud_log_format); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given cud log item. We use only 1 iovec, and we point that + * at the cud_log_format structure embedded in the cud item. + * It is at this point that we assert that all of the extent + * slots in the cud item have been filled. + */ +STATIC void +xfs_cud_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_cud_log_item *cudp = CUD_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + cudp->cud_format.cud_type = XFS_LI_CUD; + cudp->cud_format.cud_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUD_FORMAT, &cudp->cud_format, + sizeof(struct xfs_cud_log_format)); +} + +/* + * Pinning has no meaning for an cud item, so just return. + */ +STATIC void +xfs_cud_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * Since pinning has no meaning for an cud item, unpinning does + * not either. + */ +STATIC void +xfs_cud_item_unpin( + struct xfs_log_item *lip, + int remove) +{ +} + +/* + * There isn't much you can do to push on an cud item. It is simply stuck + * waiting for the log to be flushed to disk. + */ +STATIC uint +xfs_cud_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + return XFS_ITEM_PINNED; +} + +/* + * The CUD is either committed or aborted if the transaction is cancelled. If + * the transaction is cancelled, drop our reference to the CUI and free the + * CUD. + */ +STATIC void +xfs_cud_item_unlock( + struct xfs_log_item *lip) +{ + struct xfs_cud_log_item *cudp = CUD_ITEM(lip); + + if (lip->li_flags & XFS_LI_ABORTED) { + xfs_cui_release(cudp->cud_cuip); + kmem_zone_free(xfs_cud_zone, cudp); + } +} + +/* + * When the cud item is committed to disk, all we need to do is delete our + * reference to our partner cui item and then free ourselves. Since we're + * freeing ourselves we must return -1 to keep the transaction code from + * further referencing this item. + */ +STATIC xfs_lsn_t +xfs_cud_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_cud_log_item *cudp = CUD_ITEM(lip); + + /* + * Drop the CUI reference regardless of whether the CUD has been + * aborted. Once the CUD transaction is constructed, it is the sole + * responsibility of the CUD to release the CUI (even if the CUI is + * aborted due to log I/O error). + */ + xfs_cui_release(cudp->cud_cuip); + kmem_zone_free(xfs_cud_zone, cudp); + + return (xfs_lsn_t)-1; +} + +/* + * The CUD dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_cud_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all cud log items. + */ +static const struct xfs_item_ops xfs_cud_item_ops = { + .iop_size = xfs_cud_item_size, + .iop_format = xfs_cud_item_format, + .iop_pin = xfs_cud_item_pin, + .iop_unpin = xfs_cud_item_unpin, + .iop_unlock = xfs_cud_item_unlock, + .iop_committed = xfs_cud_item_committed, + .iop_push = xfs_cud_item_push, + .iop_committing = xfs_cud_item_committing, +}; + +/* + * Allocate and initialize an cud item with the given number of extents. + */ +struct xfs_cud_log_item * +xfs_cud_init( + struct xfs_mount *mp, + struct xfs_cui_log_item *cuip) + +{ + struct xfs_cud_log_item *cudp; + + cudp = kmem_zone_zalloc(xfs_cud_zone, KM_SLEEP); + xfs_log_item_init(mp, &cudp->cud_item, XFS_LI_CUD, &xfs_cud_item_ops); + cudp->cud_cuip = cuip; + cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id; + + return cudp; +} + +/* + * Process a refcount update intent item that was recovered from the log. + * We need to update the refcountbt. + */ +int +xfs_cui_recover( + struct xfs_mount *mp, + struct xfs_cui_log_item *cuip) +{ + int i; + int error = 0; + unsigned int refc_type; + struct xfs_phys_extent *refc; + xfs_fsblock_t startblock_fsb; + bool op_ok; + struct xfs_cud_log_item *cudp; + struct xfs_trans *tp; + struct xfs_btree_cur *rcur = NULL; + enum xfs_refcount_intent_type type; + xfs_fsblock_t firstfsb; + xfs_fsblock_t new_fsb; + xfs_extlen_t new_len; + struct xfs_bmbt_irec irec; + struct xfs_defer_ops dfops; + bool requeue_only = false; + + ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags)); + + /* + * First check the validity of the extents described by the + * CUI. If any are bad, then assume that all are bad and + * just toss the CUI. + */ + for (i = 0; i < cuip->cui_format.cui_nextents; i++) { + refc = &cuip->cui_format.cui_extents[i]; + startblock_fsb = XFS_BB_TO_FSB(mp, + XFS_FSB_TO_DADDR(mp, refc->pe_startblock)); + switch (refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK) { + case XFS_REFCOUNT_INCREASE: + case XFS_REFCOUNT_DECREASE: + case XFS_REFCOUNT_ALLOC_COW: + case XFS_REFCOUNT_FREE_COW: + op_ok = true; + break; + default: + op_ok = false; + break; + } + if (!op_ok || startblock_fsb == 0 || + refc->pe_len == 0 || + startblock_fsb >= mp->m_sb.sb_dblocks || + refc->pe_len >= mp->m_sb.sb_agblocks || + (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS)) { + /* + * This will pull the CUI from the AIL and + * free the memory associated with it. + */ + set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); + xfs_cui_release(cuip); + return -EIO; + } + } + + /* + * Under normal operation, refcount updates are deferred, so we + * wouldn't be adding them directly to a transaction. All + * refcount updates manage reservation usage internally and + * dynamically by deferring work that won't fit in the + * transaction. Normally, any work that needs to be deferred + * gets attached to the same defer_ops that scheduled the + * refcount update. However, we're in log recovery here, so we + * we create our own defer_ops and use that to finish up any + * work that doesn't fit. + */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + if (error) + return error; + cudp = xfs_trans_get_cud(tp, cuip); + + xfs_defer_init(&dfops, &firstfsb); + for (i = 0; i < cuip->cui_format.cui_nextents; i++) { + refc = &cuip->cui_format.cui_extents[i]; + refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; + switch (refc_type) { + case XFS_REFCOUNT_INCREASE: + case XFS_REFCOUNT_DECREASE: + case XFS_REFCOUNT_ALLOC_COW: + case XFS_REFCOUNT_FREE_COW: + type = refc_type; + break; + default: + error = -EFSCORRUPTED; + goto abort_error; + } + if (requeue_only) { + new_fsb = refc->pe_startblock; + new_len = refc->pe_len; + } else + error = xfs_trans_log_finish_refcount_update(tp, cudp, + &dfops, type, refc->pe_startblock, refc->pe_len, + &new_fsb, &new_len, &rcur); + if (error) + goto abort_error; + + /* Requeue what we didn't finish. */ + if (new_len > 0) { + irec.br_startblock = new_fsb; + irec.br_blockcount = new_len; + switch (type) { + case XFS_REFCOUNT_INCREASE: + error = xfs_refcount_increase_extent( + tp->t_mountp, &dfops, &irec); + break; + case XFS_REFCOUNT_DECREASE: + error = xfs_refcount_decrease_extent( + tp->t_mountp, &dfops, &irec); + break; + case XFS_REFCOUNT_ALLOC_COW: + error = xfs_refcount_alloc_cow_extent( + tp->t_mountp, &dfops, + irec.br_startblock, + irec.br_blockcount); + break; + case XFS_REFCOUNT_FREE_COW: + error = xfs_refcount_free_cow_extent( + tp->t_mountp, &dfops, + irec.br_startblock, + irec.br_blockcount); + break; + default: + ASSERT(0); + } + if (error) + goto abort_error; + requeue_only = true; + } + } + + xfs_refcount_finish_one_cleanup(tp, rcur, error); + error = xfs_defer_finish(&tp, &dfops, NULL); + if (error) + goto abort_error; + set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); + error = xfs_trans_commit(tp); + return error; + +abort_error: + xfs_refcount_finish_one_cleanup(tp, rcur, error); + xfs_defer_cancel(&dfops); + xfs_trans_cancel(tp); + return error; +} diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h new file mode 100644 index 0000000..5b74ddd --- /dev/null +++ b/fs/xfs/xfs_refcount_item.h @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_REFCOUNT_ITEM_H__ +#define __XFS_REFCOUNT_ITEM_H__ + +/* + * There are (currently) two pairs of refcount btree redo item types: + * increase and decrease. The log items for these are CUI (refcount + * update intent) and CUD (refcount update done). The redo item type + * is encoded in the flags field of each xfs_map_extent. + * + * *I items should be recorded in the *first* of a series of rolled + * transactions, and the *D items should be recorded in the same + * transaction that records the associated refcountbt updates. + * + * Should the system crash after the commit of the first transaction + * but before the commit of the final transaction in a series, log + * recovery will use the redo information recorded by the intent items + * to replay the refcountbt metadata updates. + */ + +/* kernel only CUI/CUD definitions */ + +struct xfs_mount; +struct kmem_zone; + +/* + * Max number of extents in fast allocation path. + */ +#define XFS_CUI_MAX_FAST_EXTENTS 16 + +/* + * Define CUI flag bits. Manipulated by set/clear/test_bit operators. + */ +#define XFS_CUI_RECOVERED 1 + +/* + * This is the "refcount update intent" log item. It is used to log + * the fact that some reverse mappings need to change. It is used in + * conjunction with the "refcount update done" log item described + * below. + * + * These log items follow the same rules as struct xfs_efi_log_item; + * see the comments about that structure (in xfs_extfree_item.h) for + * more details. + */ +struct xfs_cui_log_item { + struct xfs_log_item cui_item; + atomic_t cui_refcount; + atomic_t cui_next_extent; + unsigned long cui_flags; /* misc flags */ + struct xfs_cui_log_format cui_format; +}; + +static inline size_t +xfs_cui_log_item_sizeof( + unsigned int nr) +{ + return offsetof(struct xfs_cui_log_item, cui_format) + + xfs_cui_log_format_sizeof(nr); +} + +/* + * This is the "refcount update done" log item. It is used to log the + * fact that some refcountbt updates mentioned in an earlier cui item + * have been performed. + */ +struct xfs_cud_log_item { + struct xfs_log_item cud_item; + struct xfs_cui_log_item *cud_cuip; + struct xfs_cud_log_format cud_format; +}; + +extern struct kmem_zone *xfs_cui_zone; +extern struct kmem_zone *xfs_cud_zone; + +struct xfs_cui_log_item *xfs_cui_init(struct xfs_mount *, uint); +struct xfs_cud_log_item *xfs_cud_init(struct xfs_mount *, + struct xfs_cui_log_item *); +void xfs_cui_item_free(struct xfs_cui_log_item *); +void xfs_cui_release(struct xfs_cui_log_item *); +int xfs_cui_recover(struct xfs_mount *mp, struct xfs_cui_log_item *cuip); + +#endif /* __XFS_REFCOUNT_ITEM_H__ */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c new file mode 100644 index 0000000..a279b4e --- /dev/null +++ b/fs/xfs/xfs_reflink.c @@ -0,0 +1,1733 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_error.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_ioctl.h" +#include "xfs_trace.h" +#include "xfs_log.h" +#include "xfs_icache.h" +#include "xfs_pnfs.h" +#include "xfs_btree.h" +#include "xfs_refcount_btree.h" +#include "xfs_refcount.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_bit.h" +#include "xfs_alloc.h" +#include "xfs_quota_defs.h" +#include "xfs_quota.h" +#include "xfs_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_reflink.h" +#include "xfs_iomap.h" +#include "xfs_rmap_btree.h" +#include "xfs_sb.h" +#include "xfs_ag_resv.h" + +/* + * Copy on Write of Shared Blocks + * + * XFS must preserve "the usual" file semantics even when two files share + * the same physical blocks. This means that a write to one file must not + * alter the blocks in a different file; the way that we'll do that is + * through the use of a copy-on-write mechanism. At a high level, that + * means that when we want to write to a shared block, we allocate a new + * block, write the data to the new block, and if that succeeds we map the + * new block into the file. + * + * XFS provides a "delayed allocation" mechanism that defers the allocation + * of disk blocks to dirty-but-not-yet-mapped file blocks as long as + * possible. This reduces fragmentation by enabling the filesystem to ask + * for bigger chunks less often, which is exactly what we want for CoW. + * + * The delalloc mechanism begins when the kernel wants to make a block + * writable (write_begin or page_mkwrite). If the offset is not mapped, we + * create a delalloc mapping, which is a regular in-core extent, but without + * a real startblock. (For delalloc mappings, the startblock encodes both + * a flag that this is a delalloc mapping, and a worst-case estimate of how + * many blocks might be required to put the mapping into the BMBT.) delalloc + * mappings are a reservation against the free space in the filesystem; + * adjacent mappings can also be combined into fewer larger mappings. + * + * When dirty pages are being written out (typically in writepage), the + * delalloc reservations are converted into real mappings by allocating + * blocks and replacing the delalloc mapping with real ones. A delalloc + * mapping can be replaced by several real ones if the free space is + * fragmented. + * + * We want to adapt the delalloc mechanism for copy-on-write, since the + * write paths are similar. The first two steps (creating the reservation + * and allocating the blocks) are exactly the same as delalloc except that + * the mappings must be stored in a separate CoW fork because we do not want + * to disturb the mapping in the data fork until we're sure that the write + * succeeded. IO completion in this case is the process of removing the old + * mapping from the data fork and moving the new mapping from the CoW fork to + * the data fork. This will be discussed shortly. + * + * For now, unaligned directio writes will be bounced back to the page cache. + * Block-aligned directio writes will use the same mechanism as buffered + * writes. + * + * CoW remapping must be done after the data block write completes, + * because we don't want to destroy the old data fork map until we're sure + * the new block has been written. Since the new mappings are kept in a + * separate fork, we can simply iterate these mappings to find the ones + * that cover the file blocks that we just CoW'd. For each extent, simply + * unmap the corresponding range in the data fork, map the new range into + * the data fork, and remove the extent from the CoW fork. + * + * Since the remapping operation can be applied to an arbitrary file + * range, we record the need for the remap step as a flag in the ioend + * instead of declaring a new IO type. This is required for direct io + * because we only have ioend for the whole dio, and we have to be able to + * remember the presence of unwritten blocks and CoW blocks with a single + * ioend structure. Better yet, the more ground we can cover with one + * ioend, the better. + */ + +/* + * Given an AG extent, find the lowest-numbered run of shared blocks + * within that range and return the range in fbno/flen. If + * find_end_of_shared is true, return the longest contiguous extent of + * shared blocks. If there are no shared extents, fbno and flen will + * be set to NULLAGBLOCK and 0, respectively. + */ +int +xfs_reflink_find_shared( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + xfs_agblock_t *fbno, + xfs_extlen_t *flen, + bool find_end_of_shared) +{ + struct xfs_buf *agbp; + struct xfs_btree_cur *cur; + int error; + + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error) + return error; + + cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); + + error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen, + find_end_of_shared); + + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + + xfs_buf_relse(agbp); + return error; +} + +/* + * Trim the mapping to the next block where there's a change in the + * shared/unshared status. More specifically, this means that we + * find the lowest-numbered extent of shared blocks that coincides with + * the given block mapping. If the shared extent overlaps the start of + * the mapping, trim the mapping to the end of the shared extent. If + * the shared region intersects the mapping, trim the mapping to the + * start of the shared extent. If there are no shared regions that + * overlap, just return the original extent. + */ +int +xfs_reflink_trim_around_shared( + struct xfs_inode *ip, + struct xfs_bmbt_irec *irec, + bool *shared, + bool *trimmed) +{ + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_extlen_t aglen; + xfs_agblock_t fbno; + xfs_extlen_t flen; + int error = 0; + + /* Holes, unwritten, and delalloc extents cannot be shared */ + if (!xfs_is_reflink_inode(ip) || + ISUNWRITTEN(irec) || + irec->br_startblock == HOLESTARTBLOCK || + irec->br_startblock == DELAYSTARTBLOCK || + isnullstartblock(irec->br_startblock)) { + *shared = false; + return 0; + } + + trace_xfs_reflink_trim_around_shared(ip, irec); + + agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock); + agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock); + aglen = irec->br_blockcount; + + error = xfs_reflink_find_shared(ip->i_mount, agno, agbno, + aglen, &fbno, &flen, true); + if (error) + return error; + + *shared = *trimmed = false; + if (fbno == NULLAGBLOCK) { + /* No shared blocks at all. */ + return 0; + } else if (fbno == agbno) { + /* + * The start of this extent is shared. Truncate the + * mapping at the end of the shared region so that a + * subsequent iteration starts at the start of the + * unshared region. + */ + irec->br_blockcount = flen; + *shared = true; + if (flen != aglen) + *trimmed = true; + return 0; + } else { + /* + * There's a shared extent midway through this extent. + * Truncate the mapping at the start of the shared + * extent so that a subsequent iteration starts at the + * start of the shared region. + */ + irec->br_blockcount = fbno - agbno; + *trimmed = true; + return 0; + } +} + +/* + * Trim the passed in imap to the next shared/unshared extent boundary, and + * if imap->br_startoff points to a shared extent reserve space for it in the + * COW fork. In this case *shared is set to true, else to false. + * + * Note that imap will always contain the block numbers for the existing blocks + * in the data fork, as the upper layers need them for read-modify-write + * operations. + */ +int +xfs_reflink_reserve_cow( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + bool *shared) +{ + struct xfs_bmbt_irec got, prev; + xfs_fileoff_t end_fsb, orig_end_fsb; + int eof = 0, error = 0; + bool trimmed; + xfs_extnum_t idx; + xfs_extlen_t align; + + /* + * Search the COW fork extent list first. This serves two purposes: + * first this implement the speculative preallocation using cowextisze, + * so that we also unshared block adjacent to shared blocks instead + * of just the shared blocks themselves. Second the lookup in the + * extent list is generally faster than going out to the shared extent + * tree. + */ + xfs_bmap_search_extents(ip, imap->br_startoff, XFS_COW_FORK, &eof, &idx, + &got, &prev); + if (!eof && got.br_startoff <= imap->br_startoff) { + trace_xfs_reflink_cow_found(ip, imap); + xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); + + *shared = true; + return 0; + } + + /* Trim the mapping to the nearest shared extent boundary. */ + error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed); + if (error) + return error; + + /* Not shared? Just report the (potentially capped) extent. */ + if (!*shared) + return 0; + + /* + * Fork all the shared blocks from our write offset until the end of + * the extent. + */ + error = xfs_qm_dqattach_locked(ip, 0); + if (error) + return error; + + end_fsb = orig_end_fsb = imap->br_startoff + imap->br_blockcount; + + align = xfs_eof_alignment(ip, xfs_get_cowextsz_hint(ip)); + if (align) + end_fsb = roundup_64(end_fsb, align); + +retry: + error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff, + end_fsb - imap->br_startoff, &got, &prev, &idx, eof); + switch (error) { + case 0: + break; + case -ENOSPC: + case -EDQUOT: + /* retry without any preallocation */ + trace_xfs_reflink_cow_enospc(ip, imap); + if (end_fsb != orig_end_fsb) { + end_fsb = orig_end_fsb; + goto retry; + } + /*FALLTHRU*/ + default: + return error; + } + + if (end_fsb != orig_end_fsb) + xfs_inode_set_cowblocks_tag(ip); + + trace_xfs_reflink_cow_alloc(ip, &got); + return 0; +} + +/* Allocate all CoW reservations covering a range of blocks in a file. */ +static int +__xfs_reflink_allocate_cow( + struct xfs_inode *ip, + xfs_fileoff_t *offset_fsb, + xfs_fileoff_t end_fsb) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec imap; + struct xfs_defer_ops dfops; + struct xfs_trans *tp; + xfs_fsblock_t first_block; + int nimaps = 1, error; + bool shared; + + xfs_defer_init(&dfops, &first_block); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, + XFS_TRANS_RESERVE, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + /* Read extent from the source file. */ + nimaps = 1; + error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb, + &imap, &nimaps, 0); + if (error) + goto out_unlock; + ASSERT(nimaps == 1); + + error = xfs_reflink_reserve_cow(ip, &imap, &shared); + if (error) + goto out_trans_cancel; + + if (!shared) { + *offset_fsb = imap.br_startoff + imap.br_blockcount; + goto out_trans_cancel; + } + + xfs_trans_ijoin(tp, ip, 0); + error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount, + XFS_BMAPI_COWFORK, &first_block, + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), + &imap, &nimaps, &dfops); + if (error) + goto out_trans_cancel; + + error = xfs_defer_finish(&tp, &dfops, NULL); + if (error) + goto out_trans_cancel; + + error = xfs_trans_commit(tp); + + *offset_fsb = imap.br_startoff + imap.br_blockcount; +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +out_trans_cancel: + xfs_defer_cancel(&dfops); + xfs_trans_cancel(tp); + goto out_unlock; +} + +/* Allocate all CoW reservations covering a part of a file. */ +int +xfs_reflink_allocate_cow_range( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); + int error; + + ASSERT(xfs_is_reflink_inode(ip)); + + trace_xfs_reflink_allocate_cow_range(ip, offset, count); + + /* + * Make sure that the dquots are there. + */ + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + while (offset_fsb < end_fsb) { + error = __xfs_reflink_allocate_cow(ip, &offset_fsb, end_fsb); + if (error) { + trace_xfs_reflink_allocate_cow_range_error(ip, error, + _RET_IP_); + break; + } + } + + return error; +} + +/* + * Find the CoW reservation (and whether or not it needs block allocation) + * for a given byte offset of a file. + */ +bool +xfs_reflink_find_cow_mapping( + struct xfs_inode *ip, + xfs_off_t offset, + struct xfs_bmbt_irec *imap, + bool *need_alloc) +{ + struct xfs_bmbt_irec irec; + struct xfs_ifork *ifp; + struct xfs_bmbt_rec_host *gotp; + xfs_fileoff_t bno; + xfs_extnum_t idx; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); + ASSERT(xfs_is_reflink_inode(ip)); + + /* Find the extent in the CoW fork. */ + ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + bno = XFS_B_TO_FSBT(ip->i_mount, offset); + gotp = xfs_iext_bno_to_ext(ifp, bno, &idx); + if (!gotp) + return false; + + xfs_bmbt_get_all(gotp, &irec); + if (bno >= irec.br_startoff + irec.br_blockcount || + bno < irec.br_startoff) + return false; + + trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE, + &irec); + + /* If it's still delalloc, we must allocate later. */ + *imap = irec; + *need_alloc = !!(isnullstartblock(irec.br_startblock)); + + return true; +} + +/* + * Trim an extent to end at the next CoW reservation past offset_fsb. + */ +int +xfs_reflink_trim_irec_to_next_cow( + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, + struct xfs_bmbt_irec *imap) +{ + struct xfs_bmbt_irec irec; + struct xfs_ifork *ifp; + struct xfs_bmbt_rec_host *gotp; + xfs_extnum_t idx; + + if (!xfs_is_reflink_inode(ip)) + return 0; + + /* Find the extent in the CoW fork. */ + ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + gotp = xfs_iext_bno_to_ext(ifp, offset_fsb, &idx); + if (!gotp) + return 0; + xfs_bmbt_get_all(gotp, &irec); + + /* This is the extent before; try sliding up one. */ + if (irec.br_startoff < offset_fsb) { + idx++; + if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + return 0; + gotp = xfs_iext_get_ext(ifp, idx); + xfs_bmbt_get_all(gotp, &irec); + } + + if (irec.br_startoff >= imap->br_startoff + imap->br_blockcount) + return 0; + + imap->br_blockcount = irec.br_startoff - imap->br_startoff; + trace_xfs_reflink_trim_irec(ip, imap); + + return 0; +} + +/* + * Cancel all pending CoW reservations for some block range of an inode. + */ +int +xfs_reflink_cancel_cow_blocks( + struct xfs_inode *ip, + struct xfs_trans **tpp, + xfs_fileoff_t offset_fsb, + xfs_fileoff_t end_fsb) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_bmbt_irec got, prev, del; + xfs_extnum_t idx; + xfs_fsblock_t firstfsb; + struct xfs_defer_ops dfops; + int error = 0, eof = 0; + + if (!xfs_is_reflink_inode(ip)) + return 0; + + xfs_bmap_search_extents(ip, offset_fsb, XFS_COW_FORK, &eof, &idx, + &got, &prev); + if (eof) + return 0; + + while (got.br_startoff < end_fsb) { + del = got; + xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb); + trace_xfs_reflink_cancel_cow(ip, &del); + + if (isnullstartblock(del.br_startblock)) { + error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, + &idx, &got, &del); + if (error) + break; + } else { + xfs_trans_ijoin(*tpp, ip, 0); + xfs_defer_init(&dfops, &firstfsb); + + /* Free the CoW orphan record. */ + error = xfs_refcount_free_cow_extent(ip->i_mount, + &dfops, del.br_startblock, + del.br_blockcount); + if (error) + break; + + xfs_bmap_add_free(ip->i_mount, &dfops, + del.br_startblock, del.br_blockcount, + NULL); + + /* Update quota accounting */ + xfs_trans_mod_dquot_byino(*tpp, ip, XFS_TRANS_DQ_BCOUNT, + -(long)del.br_blockcount); + + /* Roll the transaction */ + error = xfs_defer_finish(tpp, &dfops, ip); + if (error) { + xfs_defer_cancel(&dfops); + break; + } + + /* Remove the mapping from the CoW fork. */ + xfs_bmap_del_extent_cow(ip, &idx, &got, &del); + } + + if (++idx >= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)) + break; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got); + } + + /* clear tag if cow fork is emptied */ + if (!ifp->if_bytes) + xfs_inode_clear_cowblocks_tag(ip); + + return error; +} + +/* + * Cancel all pending CoW reservations for some byte range of an inode. + */ +int +xfs_reflink_cancel_cow_range( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count) +{ + struct xfs_trans *tp; + xfs_fileoff_t offset_fsb; + xfs_fileoff_t end_fsb; + int error; + + trace_xfs_reflink_cancel_cow_range(ip, offset, count); + ASSERT(xfs_is_reflink_inode(ip)); + + offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + if (count == NULLFILEOFF) + end_fsb = NULLFILEOFF; + else + end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); + + /* Start a rolling transaction to remove the mappings */ + error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, + 0, 0, 0, &tp); + if (error) + goto out; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* Scrape out the old CoW reservations */ + error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb); + if (error) + goto out_cancel; + + error = xfs_trans_commit(tp); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + +out_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out: + trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_); + return error; +} + +/* + * Remap parts of a file's data fork after a successful CoW. + */ +int +xfs_reflink_end_cow( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_bmbt_irec got, prev, del; + struct xfs_trans *tp; + xfs_fileoff_t offset_fsb; + xfs_fileoff_t end_fsb; + xfs_fsblock_t firstfsb; + struct xfs_defer_ops dfops; + int error, eof = 0; + unsigned int resblks; + xfs_filblks_t rlen; + xfs_extnum_t idx; + + trace_xfs_reflink_end_cow(ip, offset, count); + + /* No COW extents? That's easy! */ + if (ifp->if_bytes == 0) + return 0; + + offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); + + /* Start a rolling transaction to switch the mappings */ + resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); + error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, + resblks, 0, 0, &tp); + if (error) + goto out; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + xfs_bmap_search_extents(ip, end_fsb - 1, XFS_COW_FORK, &eof, &idx, + &got, &prev); + + /* If there is a hole at end_fsb - 1 go to the previous extent */ + if (eof || got.br_startoff > end_fsb) { + ASSERT(idx > 0); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, --idx), &got); + } + + /* Walk backwards until we're out of the I/O range... */ + while (got.br_startoff + got.br_blockcount > offset_fsb) { + del = got; + xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb); + + /* Extent delete may have bumped idx forward */ + if (!del.br_blockcount) { + idx--; + goto next_extent; + } + + ASSERT(!isnullstartblock(got.br_startblock)); + + /* Unmap the old blocks in the data fork. */ + xfs_defer_init(&dfops, &firstfsb); + rlen = del.br_blockcount; + error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1, + &firstfsb, &dfops); + if (error) + goto out_defer; + + /* Trim the extent to whatever got unmapped. */ + if (rlen) { + xfs_trim_extent(&del, del.br_startoff + rlen, + del.br_blockcount - rlen); + } + trace_xfs_reflink_cow_remap(ip, &del); + + /* Free the CoW orphan record. */ + error = xfs_refcount_free_cow_extent(tp->t_mountp, &dfops, + del.br_startblock, del.br_blockcount); + if (error) + goto out_defer; + + /* Map the new blocks into the data fork. */ + error = xfs_bmap_map_extent(tp->t_mountp, &dfops, ip, &del); + if (error) + goto out_defer; + + /* Remove the mapping from the CoW fork. */ + xfs_bmap_del_extent_cow(ip, &idx, &got, &del); + + error = xfs_defer_finish(&tp, &dfops, ip); + if (error) + goto out_defer; + +next_extent: + if (idx < 0) + break; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got); + } + + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + goto out; + return 0; + +out_defer: + xfs_defer_cancel(&dfops); + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out: + trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_); + return error; +} + +/* + * Free leftover CoW reservations that didn't get cleaned out. + */ +int +xfs_reflink_recover_cow( + struct xfs_mount *mp) +{ + xfs_agnumber_t agno; + int error = 0; + + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + error = xfs_refcount_recover_cow_leftovers(mp, agno); + if (error) + break; + } + + return error; +} + +/* + * Reflinking (Block) Ranges of Two Files Together + * + * First, ensure that the reflink flag is set on both inodes. The flag is an + * optimization to avoid unnecessary refcount btree lookups in the write path. + * + * Now we can iteratively remap the range of extents (and holes) in src to the + * corresponding ranges in dest. Let drange and srange denote the ranges of + * logical blocks in dest and src touched by the reflink operation. + * + * While the length of drange is greater than zero, + * - Read src's bmbt at the start of srange ("imap") + * - If imap doesn't exist, make imap appear to start at the end of srange + * with zero length. + * - If imap starts before srange, advance imap to start at srange. + * - If imap goes beyond srange, truncate imap to end at the end of srange. + * - Punch (imap start - srange start + imap len) blocks from dest at + * offset (drange start). + * - If imap points to a real range of pblks, + * > Increase the refcount of the imap's pblks + * > Map imap's pblks into dest at the offset + * (drange start + imap start - srange start) + * - Advance drange and srange by (imap start - srange start + imap len) + * + * Finally, if the reflink made dest longer, update both the in-core and + * on-disk file sizes. + * + * ASCII Art Demonstration: + * + * Let's say we want to reflink this source file: + * + * ----SSSSSSS-SSSSS----SSSSSS (src file) + * <--------------------> + * + * into this destination file: + * + * --DDDDDDDDDDDDDDDDDDD--DDD (dest file) + * <--------------------> + * '-' means a hole, and 'S' and 'D' are written blocks in the src and dest. + * Observe that the range has different logical offsets in either file. + * + * Consider that the first extent in the source file doesn't line up with our + * reflink range. Unmapping and remapping are separate operations, so we can + * unmap more blocks from the destination file than we remap. + * + * ----SSSSSSS-SSSSS----SSSSSS + * <-------> + * --DDDDD---------DDDDD--DDD + * <-------> + * + * Now remap the source extent into the destination file: + * + * ----SSSSSSS-SSSSS----SSSSSS + * <-------> + * --DDDDD--SSSSSSSDDDDD--DDD + * <-------> + * + * Do likewise with the second hole and extent in our range. Holes in the + * unmap range don't affect our operation. + * + * ----SSSSSSS-SSSSS----SSSSSS + * <----> + * --DDDDD--SSSSSSS-SSSSS-DDD + * <----> + * + * Finally, unmap and remap part of the third extent. This will increase the + * size of the destination file. + * + * ----SSSSSSS-SSSSS----SSSSSS + * <-----> + * --DDDDD--SSSSSSS-SSSSS----SSS + * <-----> + * + * Once we update the destination file's i_size, we're done. + */ + +/* + * Ensure the reflink bit is set in both inodes. + */ +STATIC int +xfs_reflink_set_inode_flag( + struct xfs_inode *src, + struct xfs_inode *dest) +{ + struct xfs_mount *mp = src->i_mount; + int error; + struct xfs_trans *tp; + + if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest)) + return 0; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + if (error) + goto out_error; + + /* Lock both files against IO */ + if (src->i_ino == dest->i_ino) + xfs_ilock(src, XFS_ILOCK_EXCL); + else + xfs_lock_two_inodes(src, dest, XFS_ILOCK_EXCL); + + if (!xfs_is_reflink_inode(src)) { + trace_xfs_reflink_set_inode_flag(src); + xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL); + src->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK; + xfs_trans_log_inode(tp, src, XFS_ILOG_CORE); + xfs_ifork_init_cow(src); + } else + xfs_iunlock(src, XFS_ILOCK_EXCL); + + if (src->i_ino == dest->i_ino) + goto commit_flags; + + if (!xfs_is_reflink_inode(dest)) { + trace_xfs_reflink_set_inode_flag(dest); + xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL); + dest->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK; + xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); + xfs_ifork_init_cow(dest); + } else + xfs_iunlock(dest, XFS_ILOCK_EXCL); + +commit_flags: + error = xfs_trans_commit(tp); + if (error) + goto out_error; + return error; + +out_error: + trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_); + return error; +} + +/* + * Update destination inode size & cowextsize hint, if necessary. + */ +STATIC int +xfs_reflink_update_dest( + struct xfs_inode *dest, + xfs_off_t newlen, + xfs_extlen_t cowextsize) +{ + struct xfs_mount *mp = dest->i_mount; + struct xfs_trans *tp; + int error; + + if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) + return 0; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + if (error) + goto out_error; + + xfs_ilock(dest, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL); + + if (newlen > i_size_read(VFS_I(dest))) { + trace_xfs_reflink_update_inode_size(dest, newlen); + i_size_write(VFS_I(dest), newlen); + dest->i_d.di_size = newlen; + } + + if (cowextsize) { + dest->i_d.di_cowextsize = cowextsize; + dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + } + + xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); + + error = xfs_trans_commit(tp); + if (error) + goto out_error; + return error; + +out_error: + trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_); + return error; +} + +/* + * Do we have enough reserve in this AG to handle a reflink? The refcount + * btree already reserved all the space it needs, but the rmap btree can grow + * infinitely, so we won't allow more reflinks when the AG is down to the + * btree reserves. + */ +static int +xfs_reflink_ag_has_free_space( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int error = 0; + + if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + return 0; + + pag = xfs_perag_get(mp, agno); + if (xfs_ag_resv_critical(pag, XFS_AG_RESV_AGFL) || + xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA)) + error = -ENOSPC; + xfs_perag_put(pag); + return error; +} + +/* + * Unmap a range of blocks from a file, then map other blocks into the hole. + * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount). + * The extent irec is mapped into dest at irec->br_startoff. + */ +STATIC int +xfs_reflink_remap_extent( + struct xfs_inode *ip, + struct xfs_bmbt_irec *irec, + xfs_fileoff_t destoff, + xfs_off_t new_isize) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + xfs_fsblock_t firstfsb; + unsigned int resblks; + struct xfs_defer_ops dfops; + struct xfs_bmbt_irec uirec; + bool real_extent; + xfs_filblks_t rlen; + xfs_filblks_t unmap_len; + xfs_off_t newlen; + int error; + + unmap_len = irec->br_startoff + irec->br_blockcount - destoff; + trace_xfs_reflink_punch_range(ip, destoff, unmap_len); + + /* Only remap normal extents. */ + real_extent = (irec->br_startblock != HOLESTARTBLOCK && + irec->br_startblock != DELAYSTARTBLOCK && + !ISUNWRITTEN(irec)); + + /* No reflinking if we're low on space */ + if (real_extent) { + error = xfs_reflink_ag_has_free_space(mp, + XFS_FSB_TO_AGNO(mp, irec->br_startblock)); + if (error) + goto out; + } + + /* Start a rolling transaction to switch the mappings */ + resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + if (error) + goto out; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* If we're not just clearing space, then do we have enough quota? */ + if (real_extent) { + error = xfs_trans_reserve_quota_nblks(tp, ip, + irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS); + if (error) + goto out_cancel; + } + + trace_xfs_reflink_remap(ip, irec->br_startoff, + irec->br_blockcount, irec->br_startblock); + + /* Unmap the old blocks in the data fork. */ + rlen = unmap_len; + while (rlen) { + xfs_defer_init(&dfops, &firstfsb); + error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1, + &firstfsb, &dfops); + if (error) + goto out_defer; + + /* + * Trim the extent to whatever got unmapped. + * Remember, bunmapi works backwards. + */ + uirec.br_startblock = irec->br_startblock + rlen; + uirec.br_startoff = irec->br_startoff + rlen; + uirec.br_blockcount = unmap_len - rlen; + unmap_len = rlen; + + /* If this isn't a real mapping, we're done. */ + if (!real_extent || uirec.br_blockcount == 0) + goto next_extent; + + trace_xfs_reflink_remap(ip, uirec.br_startoff, + uirec.br_blockcount, uirec.br_startblock); + + /* Update the refcount tree */ + error = xfs_refcount_increase_extent(mp, &dfops, &uirec); + if (error) + goto out_defer; + + /* Map the new blocks into the data fork. */ + error = xfs_bmap_map_extent(mp, &dfops, ip, &uirec); + if (error) + goto out_defer; + + /* Update quota accounting. */ + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, + uirec.br_blockcount); + + /* Update dest isize if needed. */ + newlen = XFS_FSB_TO_B(mp, + uirec.br_startoff + uirec.br_blockcount); + newlen = min_t(xfs_off_t, newlen, new_isize); + if (newlen > i_size_read(VFS_I(ip))) { + trace_xfs_reflink_update_inode_size(ip, newlen); + i_size_write(VFS_I(ip), newlen); + ip->i_d.di_size = newlen; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + } + +next_extent: + /* Process all the deferred stuff. */ + error = xfs_defer_finish(&tp, &dfops, ip); + if (error) + goto out_defer; + } + + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + goto out; + return 0; + +out_defer: + xfs_defer_cancel(&dfops); +out_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out: + trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_); + return error; +} + +/* + * Iteratively remap one file's extents (and holes) to another's. + */ +STATIC int +xfs_reflink_remap_blocks( + struct xfs_inode *src, + xfs_fileoff_t srcoff, + struct xfs_inode *dest, + xfs_fileoff_t destoff, + xfs_filblks_t len, + xfs_off_t new_isize) +{ + struct xfs_bmbt_irec imap; + int nimaps; + int error = 0; + xfs_filblks_t range_len; + + /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */ + while (len) { + trace_xfs_reflink_remap_blocks_loop(src, srcoff, len, + dest, destoff); + /* Read extent from the source file */ + nimaps = 1; + xfs_ilock(src, XFS_ILOCK_EXCL); + error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0); + xfs_iunlock(src, XFS_ILOCK_EXCL); + if (error) + goto err; + ASSERT(nimaps == 1); + + trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE, + &imap); + + /* Translate imap into the destination file. */ + range_len = imap.br_startoff + imap.br_blockcount - srcoff; + imap.br_startoff += destoff - srcoff; + + /* Clear dest from destoff to the end of imap and map it in. */ + error = xfs_reflink_remap_extent(dest, &imap, destoff, + new_isize); + if (error) + goto err; + + if (fatal_signal_pending(current)) { + error = -EINTR; + goto err; + } + + /* Advance drange/srange */ + srcoff += range_len; + destoff += range_len; + len -= range_len; + } + + return 0; + +err: + trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_); + return error; +} + +/* + * Read a page's worth of file data into the page cache. Return the page + * locked. + */ +static struct page * +xfs_get_page( + struct inode *inode, + xfs_off_t offset) +{ + struct address_space *mapping; + struct page *page; + pgoff_t n; + + n = offset >> PAGE_SHIFT; + mapping = inode->i_mapping; + page = read_mapping_page(mapping, n, NULL); + if (IS_ERR(page)) + return page; + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + lock_page(page); + return page; +} + +/* + * Compare extents of two files to see if they are the same. + */ +static int +xfs_compare_extents( + struct inode *src, + xfs_off_t srcoff, + struct inode *dest, + xfs_off_t destoff, + xfs_off_t len, + bool *is_same) +{ + xfs_off_t src_poff; + xfs_off_t dest_poff; + void *src_addr; + void *dest_addr; + struct page *src_page; + struct page *dest_page; + xfs_off_t cmp_len; + bool same; + int error; + + error = -EINVAL; + same = true; + while (len) { + src_poff = srcoff & (PAGE_SIZE - 1); + dest_poff = destoff & (PAGE_SIZE - 1); + cmp_len = min(PAGE_SIZE - src_poff, + PAGE_SIZE - dest_poff); + cmp_len = min(cmp_len, len); + ASSERT(cmp_len > 0); + + trace_xfs_reflink_compare_extents(XFS_I(src), srcoff, cmp_len, + XFS_I(dest), destoff); + + src_page = xfs_get_page(src, srcoff); + if (IS_ERR(src_page)) { + error = PTR_ERR(src_page); + goto out_error; + } + dest_page = xfs_get_page(dest, destoff); + if (IS_ERR(dest_page)) { + error = PTR_ERR(dest_page); + unlock_page(src_page); + put_page(src_page); + goto out_error; + } + src_addr = kmap_atomic(src_page); + dest_addr = kmap_atomic(dest_page); + + flush_dcache_page(src_page); + flush_dcache_page(dest_page); + + if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) + same = false; + + kunmap_atomic(dest_addr); + kunmap_atomic(src_addr); + unlock_page(dest_page); + unlock_page(src_page); + put_page(dest_page); + put_page(src_page); + + if (!same) + break; + + srcoff += cmp_len; + destoff += cmp_len; + len -= cmp_len; + } + + *is_same = same; + return 0; + +out_error: + trace_xfs_reflink_compare_extents_error(XFS_I(dest), error, _RET_IP_); + return error; +} + +/* + * Link a range of blocks from one file to another. + */ +int +xfs_reflink_remap_range( + struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len, + bool is_dedupe) +{ + struct inode *inode_in = file_inode(file_in); + struct xfs_inode *src = XFS_I(inode_in); + struct inode *inode_out = file_inode(file_out); + struct xfs_inode *dest = XFS_I(inode_out); + struct xfs_mount *mp = src->i_mount; + loff_t bs = inode_out->i_sb->s_blocksize; + bool same_inode = (inode_in == inode_out); + xfs_fileoff_t sfsbno, dfsbno; + xfs_filblks_t fsblen; + xfs_extlen_t cowextsize; + loff_t isize; + ssize_t ret; + loff_t blen; + + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return -EOPNOTSUPP; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + /* Lock both files against IO */ + if (same_inode) { + xfs_ilock(src, XFS_IOLOCK_EXCL); + xfs_ilock(src, XFS_MMAPLOCK_EXCL); + } else { + xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL); + xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL); + } + + /* Don't touch certain kinds of inodes */ + ret = -EPERM; + if (IS_IMMUTABLE(inode_out)) + goto out_unlock; + + ret = -ETXTBSY; + if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) + goto out_unlock; + + + /* Don't reflink dirs, pipes, sockets... */ + ret = -EISDIR; + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) + goto out_unlock; + ret = -EINVAL; + if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode)) + goto out_unlock; + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) + goto out_unlock; + + /* Don't reflink realtime inodes */ + if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest)) + goto out_unlock; + + /* Don't share DAX file data for now. */ + if (IS_DAX(inode_in) || IS_DAX(inode_out)) + goto out_unlock; + + /* Are we going all the way to the end? */ + isize = i_size_read(inode_in); + if (isize == 0) { + ret = 0; + goto out_unlock; + } + + if (len == 0) + len = isize - pos_in; + + /* Ensure offsets don't wrap and the input is inside i_size */ + if (pos_in + len < pos_in || pos_out + len < pos_out || + pos_in + len > isize) + goto out_unlock; + + /* Don't allow dedupe past EOF in the dest file */ + if (is_dedupe) { + loff_t disize; + + disize = i_size_read(inode_out); + if (pos_out >= disize || pos_out + len > disize) + goto out_unlock; + } + + /* If we're linking to EOF, continue to the block boundary. */ + if (pos_in + len == isize) + blen = ALIGN(isize, bs) - pos_in; + else + blen = len; + + /* Only reflink if we're aligned to block boundaries */ + if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || + !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) + goto out_unlock; + + /* Don't allow overlapped reflink within the same file */ + if (same_inode) { + if (pos_out + blen > pos_in && pos_out < pos_in + blen) + goto out_unlock; + } + + /* Wait for the completion of any pending IOs on both files */ + inode_dio_wait(inode_in); + if (!same_inode) + inode_dio_wait(inode_out); + + ret = filemap_write_and_wait_range(inode_in->i_mapping, + pos_in, pos_in + len - 1); + if (ret) + goto out_unlock; + + ret = filemap_write_and_wait_range(inode_out->i_mapping, + pos_out, pos_out + len - 1); + if (ret) + goto out_unlock; + + trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); + + /* + * Check that the extents are the same. + */ + if (is_dedupe) { + bool is_same = false; + + ret = xfs_compare_extents(inode_in, pos_in, inode_out, pos_out, + len, &is_same); + if (ret) + goto out_unlock; + if (!is_same) { + ret = -EBADE; + goto out_unlock; + } + } + + ret = xfs_reflink_set_inode_flag(src, dest); + if (ret) + goto out_unlock; + + /* + * Invalidate the page cache so that we can clear any CoW mappings + * in the destination file. + */ + truncate_inode_pages_range(&inode_out->i_data, pos_out, + PAGE_ALIGN(pos_out + len) - 1); + + dfsbno = XFS_B_TO_FSBT(mp, pos_out); + sfsbno = XFS_B_TO_FSBT(mp, pos_in); + fsblen = XFS_B_TO_FSB(mp, len); + ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen, + pos_out + len); + if (ret) + goto out_unlock; + + /* + * Carry the cowextsize hint from src to dest if we're sharing the + * entire source file to the entire destination file, the source file + * has a cowextsize hint, and the destination file does not. + */ + cowextsize = 0; + if (pos_in == 0 && len == i_size_read(inode_in) && + (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) && + pos_out == 0 && len >= i_size_read(inode_out) && + !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) + cowextsize = src->i_d.di_cowextsize; + + ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize); + +out_unlock: + xfs_iunlock(src, XFS_MMAPLOCK_EXCL); + xfs_iunlock(src, XFS_IOLOCK_EXCL); + if (src->i_ino != dest->i_ino) { + xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); + xfs_iunlock(dest, XFS_IOLOCK_EXCL); + } + if (ret) + trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); + return ret; +} + +/* + * The user wants to preemptively CoW all shared blocks in this file, + * which enables us to turn off the reflink flag. Iterate all + * extents which are not prealloc/delalloc to see which ranges are + * mentioned in the refcount tree, then read those blocks into the + * pagecache, dirty them, fsync them back out, and then we can update + * the inode flag. What happens if we run out of memory? :) + */ +STATIC int +xfs_reflink_dirty_extents( + struct xfs_inode *ip, + xfs_fileoff_t fbno, + xfs_filblks_t end, + xfs_off_t isize) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_extlen_t aglen; + xfs_agblock_t rbno; + xfs_extlen_t rlen; + xfs_off_t fpos; + xfs_off_t flen; + struct xfs_bmbt_irec map[2]; + int nmaps; + int error = 0; + + while (end - fbno > 0) { + nmaps = 1; + /* + * Look for extents in the file. Skip holes, delalloc, or + * unwritten extents; they can't be reflinked. + */ + error = xfs_bmapi_read(ip, fbno, end - fbno, map, &nmaps, 0); + if (error) + goto out; + if (nmaps == 0) + break; + if (map[0].br_startblock == HOLESTARTBLOCK || + map[0].br_startblock == DELAYSTARTBLOCK || + ISUNWRITTEN(&map[0])) + goto next; + + map[1] = map[0]; + while (map[1].br_blockcount) { + agno = XFS_FSB_TO_AGNO(mp, map[1].br_startblock); + agbno = XFS_FSB_TO_AGBNO(mp, map[1].br_startblock); + aglen = map[1].br_blockcount; + + error = xfs_reflink_find_shared(mp, agno, agbno, aglen, + &rbno, &rlen, true); + if (error) + goto out; + if (rbno == NULLAGBLOCK) + break; + + /* Dirty the pages */ + xfs_iunlock(ip, XFS_ILOCK_EXCL); + fpos = XFS_FSB_TO_B(mp, map[1].br_startoff + + (rbno - agbno)); + flen = XFS_FSB_TO_B(mp, rlen); + if (fpos + flen > isize) + flen = isize - fpos; + error = iomap_file_dirty(VFS_I(ip), fpos, flen, + &xfs_iomap_ops); + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (error) + goto out; + + map[1].br_blockcount -= (rbno - agbno + rlen); + map[1].br_startoff += (rbno - agbno + rlen); + map[1].br_startblock += (rbno - agbno + rlen); + } + +next: + fbno = map[0].br_startoff + map[0].br_blockcount; + } +out: + return error; +} + +/* Clear the inode reflink flag if there are no shared extents. */ +int +xfs_reflink_clear_inode_flag( + struct xfs_inode *ip, + struct xfs_trans **tpp) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t fbno; + xfs_filblks_t end; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_extlen_t aglen; + xfs_agblock_t rbno; + xfs_extlen_t rlen; + struct xfs_bmbt_irec map; + int nmaps; + int error = 0; + + ASSERT(xfs_is_reflink_inode(ip)); + + fbno = 0; + end = XFS_B_TO_FSB(mp, i_size_read(VFS_I(ip))); + while (end - fbno > 0) { + nmaps = 1; + /* + * Look for extents in the file. Skip holes, delalloc, or + * unwritten extents; they can't be reflinked. + */ + error = xfs_bmapi_read(ip, fbno, end - fbno, &map, &nmaps, 0); + if (error) + return error; + if (nmaps == 0) + break; + if (map.br_startblock == HOLESTARTBLOCK || + map.br_startblock == DELAYSTARTBLOCK || + ISUNWRITTEN(&map)) + goto next; + + agno = XFS_FSB_TO_AGNO(mp, map.br_startblock); + agbno = XFS_FSB_TO_AGBNO(mp, map.br_startblock); + aglen = map.br_blockcount; + + error = xfs_reflink_find_shared(mp, agno, agbno, aglen, + &rbno, &rlen, false); + if (error) + return error; + /* Is there still a shared block here? */ + if (rbno != NULLAGBLOCK) + return 0; +next: + fbno = map.br_startoff + map.br_blockcount; + } + + /* + * We didn't find any shared blocks so turn off the reflink flag. + * First, get rid of any leftover CoW mappings. + */ + error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF); + if (error) + return error; + + /* Clear the inode flag. */ + trace_xfs_reflink_unset_inode_flag(ip); + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + xfs_inode_clear_cowblocks_tag(ip); + xfs_trans_ijoin(*tpp, ip, 0); + xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); + + return error; +} + +/* + * Clear the inode reflink flag if there are no shared extents and the size + * hasn't changed. + */ +STATIC int +xfs_reflink_try_clear_inode_flag( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error = 0; + + /* Start a rolling transaction to remove the mappings */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + error = xfs_reflink_clear_inode_flag(ip, &tp); + if (error) + goto cancel; + + error = xfs_trans_commit(tp); + if (error) + goto out; + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; +cancel: + xfs_trans_cancel(tp); +out: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* + * Pre-COW all shared blocks within a given byte range of a file and turn off + * the reflink flag if we unshare all of the file's blocks. + */ +int +xfs_reflink_unshare( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t fbno; + xfs_filblks_t end; + xfs_off_t isize; + int error; + + if (!xfs_is_reflink_inode(ip)) + return 0; + + trace_xfs_reflink_unshare(ip, offset, len); + + inode_dio_wait(VFS_I(ip)); + + /* Try to CoW the selected ranges */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + fbno = XFS_B_TO_FSBT(mp, offset); + isize = i_size_read(VFS_I(ip)); + end = XFS_B_TO_FSB(mp, offset + len); + error = xfs_reflink_dirty_extents(ip, fbno, end, isize); + if (error) + goto out_unlock; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + /* Wait for the IO to finish */ + error = filemap_write_and_wait(VFS_I(ip)->i_mapping); + if (error) + goto out; + + /* Turn off the reflink flag if possible. */ + error = xfs_reflink_try_clear_inode_flag(ip); + if (error) + goto out; + + return 0; + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out: + trace_xfs_reflink_unshare_error(ip, error, _RET_IP_); + return error; +} + +/* + * Does this inode have any real CoW reservations? + */ +bool +xfs_reflink_has_real_cow_blocks( + struct xfs_inode *ip) +{ + struct xfs_bmbt_irec irec; + struct xfs_ifork *ifp; + struct xfs_bmbt_rec_host *gotp; + xfs_extnum_t idx; + + if (!xfs_is_reflink_inode(ip)) + return false; + + /* Go find the old extent in the CoW fork. */ + ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + gotp = xfs_iext_bno_to_ext(ifp, 0, &idx); + while (gotp) { + xfs_bmbt_get_all(gotp, &irec); + + if (!isnullstartblock(irec.br_startblock)) + return true; + + /* Roll on... */ + idx++; + if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + break; + gotp = xfs_iext_get_ext(ifp, idx); + } + + return false; +} diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h new file mode 100644 index 0000000..fad1160 --- /dev/null +++ b/fs/xfs/xfs_reflink.h @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_REFLINK_H +#define __XFS_REFLINK_H 1 + +extern int xfs_reflink_find_shared(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, + xfs_extlen_t *flen, bool find_maximal); +extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, + struct xfs_bmbt_irec *irec, bool *shared, bool *trimmed); + +extern int xfs_reflink_reserve_cow(struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, bool *shared); +extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip, + xfs_off_t offset, xfs_off_t count); +extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset, + struct xfs_bmbt_irec *imap, bool *need_alloc); +extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap); + +extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip, + struct xfs_trans **tpp, xfs_fileoff_t offset_fsb, + xfs_fileoff_t end_fsb); +extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t count); +extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t count); +extern int xfs_reflink_recover_cow(struct xfs_mount *mp); +extern int xfs_reflink_remap_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, u64 len, bool is_dedupe); +extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip, + struct xfs_trans **tpp); +extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t len); + +extern bool xfs_reflink_has_real_cow_blocks(struct xfs_inode *ip); + +#endif /* __XFS_REFLINK_H */ diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 2500f28..73c8278 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -51,28 +51,16 @@ xfs_rui_item_free( kmem_zone_free(xfs_rui_zone, ruip); } -/* - * This returns the number of iovecs needed to log the given rui item. - * We only need 1 iovec for an rui item. It just logs the rui_log_format - * structure. - */ -static inline int -xfs_rui_item_sizeof( - struct xfs_rui_log_item *ruip) -{ - return sizeof(struct xfs_rui_log_format) + - (ruip->rui_format.rui_nextents - 1) * - sizeof(struct xfs_map_extent); -} - STATIC void xfs_rui_item_size( struct xfs_log_item *lip, int *nvecs, int *nbytes) { + struct xfs_rui_log_item *ruip = RUI_ITEM(lip); + *nvecs += 1; - *nbytes += xfs_rui_item_sizeof(RUI_ITEM(lip)); + *nbytes += xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents); } /* @@ -97,7 +85,7 @@ xfs_rui_item_format( ruip->rui_format.rui_size = 1; xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUI_FORMAT, &ruip->rui_format, - xfs_rui_item_sizeof(ruip)); + xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents)); } /* @@ -205,16 +193,12 @@ xfs_rui_init( { struct xfs_rui_log_item *ruip; - uint size; ASSERT(nextents > 0); - if (nextents > XFS_RUI_MAX_FAST_EXTENTS) { - size = (uint)(sizeof(struct xfs_rui_log_item) + - ((nextents - 1) * sizeof(struct xfs_map_extent))); - ruip = kmem_zalloc(size, KM_SLEEP); - } else { + if (nextents > XFS_RUI_MAX_FAST_EXTENTS) + ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), KM_SLEEP); + else ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP); - } xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); ruip->rui_format.rui_nextents = nextents; @@ -239,14 +223,12 @@ xfs_rui_copy_format( uint len; src_rui_fmt = buf->i_addr; - len = sizeof(struct xfs_rui_log_format) + - (src_rui_fmt->rui_nextents - 1) * - sizeof(struct xfs_map_extent); + len = xfs_rui_log_format_sizeof(src_rui_fmt->rui_nextents); if (buf->i_len != len) return -EFSCORRUPTED; - memcpy((char *)dst_rui_fmt, (char *)src_rui_fmt, len); + memcpy(dst_rui_fmt, src_rui_fmt, len); return 0; } @@ -459,8 +441,11 @@ xfs_rui_recover( XFS_FSB_TO_DADDR(mp, rmap->me_startblock)); switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { case XFS_RMAP_EXTENT_MAP: + case XFS_RMAP_EXTENT_MAP_SHARED: case XFS_RMAP_EXTENT_UNMAP: + case XFS_RMAP_EXTENT_UNMAP_SHARED: case XFS_RMAP_EXTENT_CONVERT: + case XFS_RMAP_EXTENT_CONVERT_SHARED: case XFS_RMAP_EXTENT_ALLOC: case XFS_RMAP_EXTENT_FREE: op_ok = true; @@ -499,12 +484,21 @@ xfs_rui_recover( case XFS_RMAP_EXTENT_MAP: type = XFS_RMAP_MAP; break; + case XFS_RMAP_EXTENT_MAP_SHARED: + type = XFS_RMAP_MAP_SHARED; + break; case XFS_RMAP_EXTENT_UNMAP: type = XFS_RMAP_UNMAP; break; + case XFS_RMAP_EXTENT_UNMAP_SHARED: + type = XFS_RMAP_UNMAP_SHARED; + break; case XFS_RMAP_EXTENT_CONVERT: type = XFS_RMAP_CONVERT; break; + case XFS_RMAP_EXTENT_CONVERT_SHARED: + type = XFS_RMAP_CONVERT_SHARED; + break; case XFS_RMAP_EXTENT_ALLOC: type = XFS_RMAP_ALLOC; break; diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h index aefcc3a..340c968 100644 --- a/fs/xfs/xfs_rmap_item.h +++ b/fs/xfs/xfs_rmap_item.h @@ -70,6 +70,14 @@ struct xfs_rui_log_item { struct xfs_rui_log_format rui_format; }; +static inline size_t +xfs_rui_log_item_sizeof( + unsigned int nr) +{ + return offsetof(struct xfs_rui_log_item, rui_format) + + xfs_rui_log_format_sizeof(nr); +} + /* * This is the "rmap update done" log item. It is used to log the fact that * some rmapbt updates mentioned in an earlier rui item have been performed. diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 6e812fe0..12d48cd 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -62,6 +62,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) { "ibt2", XFSSTAT_END_IBT_V2 }, { "fibt2", XFSSTAT_END_FIBT_V2 }, { "rmapbt", XFSSTAT_END_RMAP_V2 }, + { "refcntbt", XFSSTAT_END_REFCOUNT }, /* we print both series of quota information together */ { "qm", XFSSTAT_END_QM }, }; diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 657865f..79ad2e6 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -213,7 +213,23 @@ struct xfsstats { __uint32_t xs_rmap_2_alloc; __uint32_t xs_rmap_2_free; __uint32_t xs_rmap_2_moves; -#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_RMAP_V2+6) +#define XFSSTAT_END_REFCOUNT (XFSSTAT_END_RMAP_V2 + 15) + __uint32_t xs_refcbt_2_lookup; + __uint32_t xs_refcbt_2_compare; + __uint32_t xs_refcbt_2_insrec; + __uint32_t xs_refcbt_2_delrec; + __uint32_t xs_refcbt_2_newroot; + __uint32_t xs_refcbt_2_killroot; + __uint32_t xs_refcbt_2_increment; + __uint32_t xs_refcbt_2_decrement; + __uint32_t xs_refcbt_2_lshift; + __uint32_t xs_refcbt_2_rshift; + __uint32_t xs_refcbt_2_split; + __uint32_t xs_refcbt_2_join; + __uint32_t xs_refcbt_2_alloc; + __uint32_t xs_refcbt_2_free; + __uint32_t xs_refcbt_2_moves; +#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_REFCOUNT + 6) __uint32_t xs_qm_dqreclaims; __uint32_t xs_qm_dqreclaim_misses; __uint32_t xs_qm_dquot_dups; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index fd6be45..ade4691 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -47,6 +47,9 @@ #include "xfs_sysfs.h" #include "xfs_ondisk.h" #include "xfs_rmap_item.h" +#include "xfs_refcount_item.h" +#include "xfs_bmap_item.h" +#include "xfs_reflink.h" #include <linux/namei.h> #include <linux/init.h> @@ -936,6 +939,7 @@ xfs_fs_destroy_inode( struct inode *inode) { struct xfs_inode *ip = XFS_I(inode); + int error; trace_xfs_destroy_inode(ip); @@ -943,6 +947,14 @@ xfs_fs_destroy_inode( XFS_STATS_INC(ip->i_mount, vn_rele); XFS_STATS_INC(ip->i_mount, vn_remove); + if (xfs_is_reflink_inode(ip)) { + error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); + if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) + xfs_warn(ip->i_mount, +"Error %d while evicting CoW blocks for inode %llu.", + error, ip->i_ino); + } + xfs_inactive(ip); ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); @@ -1006,6 +1018,16 @@ xfs_fs_drop_inode( { struct xfs_inode *ip = XFS_I(inode); + /* + * If this unlinked inode is in the middle of recovery, don't + * drop the inode just yet; log recovery will take care of + * that. See the comment for this inode flag. + */ + if (ip->i_flags & XFS_IRECOVERY) { + ASSERT(ip->i_mount->m_log->l_flags & XLOG_RECOVERY_NEEDED); + return 0; + } + return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE); } @@ -1137,7 +1159,7 @@ xfs_restore_resvblks(struct xfs_mount *mp) * Note: xfs_log_quiesce() stops background log work - the callers must ensure * it is started again when appropriate. */ -static void +void xfs_quiesce_attr( struct xfs_mount *mp) { @@ -1296,10 +1318,31 @@ xfs_fs_remount( xfs_restore_resvblks(mp); xfs_log_work_queue(mp); xfs_queue_eofblocks(mp); + + /* Recover any CoW blocks that never got remapped. */ + error = xfs_reflink_recover_cow(mp); + if (error) { + xfs_err(mp, + "Error %d recovering leftover CoW allocations.", error); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; + } + + /* Create the per-AG metadata reservation pool .*/ + error = xfs_fs_reserve_ag_blocks(mp); + if (error && error != -ENOSPC) + return error; } /* rw -> ro */ if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { + /* Free the per-AG metadata reservation pool. */ + error = xfs_fs_unreserve_ag_blocks(mp); + if (error) { + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; + } + /* * Before we sync the metadata, we need to free up the reserve * block pool so that the used block count in the superblock on @@ -1490,6 +1533,7 @@ xfs_fs_fill_super( atomic_set(&mp->m_active_trans, 0); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); + INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker); mp->m_kobj.kobject.kset = xfs_kset; mp->m_super = sb; @@ -1572,6 +1616,9 @@ xfs_fs_fill_super( "DAX unsupported by block device. Turning off DAX."); mp->m_flags &= ~XFS_MOUNT_DAX; } + if (xfs_sb_version_hasreflink(&mp->m_sb)) + xfs_alert(mp, + "DAX and reflink have not been tested together!"); } if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { @@ -1585,6 +1632,10 @@ xfs_fs_fill_super( "EXPERIMENTAL reverse mapping btree feature enabled. Use at your own risk!"); } + if (xfs_sb_version_hasreflink(&mp->m_sb)) + xfs_alert(mp, + "EXPERIMENTAL reflink feature enabled. Use at your own risk!"); + error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; @@ -1782,15 +1833,44 @@ xfs_init_zones(void) if (!xfs_rud_zone) goto out_destroy_icreate_zone; - xfs_rui_zone = kmem_zone_init((sizeof(struct xfs_rui_log_item) + - ((XFS_RUI_MAX_FAST_EXTENTS - 1) * - sizeof(struct xfs_map_extent))), + xfs_rui_zone = kmem_zone_init( + xfs_rui_log_item_sizeof(XFS_RUI_MAX_FAST_EXTENTS), "xfs_rui_item"); if (!xfs_rui_zone) goto out_destroy_rud_zone; + xfs_cud_zone = kmem_zone_init(sizeof(struct xfs_cud_log_item), + "xfs_cud_item"); + if (!xfs_cud_zone) + goto out_destroy_rui_zone; + + xfs_cui_zone = kmem_zone_init( + xfs_cui_log_item_sizeof(XFS_CUI_MAX_FAST_EXTENTS), + "xfs_cui_item"); + if (!xfs_cui_zone) + goto out_destroy_cud_zone; + + xfs_bud_zone = kmem_zone_init(sizeof(struct xfs_bud_log_item), + "xfs_bud_item"); + if (!xfs_bud_zone) + goto out_destroy_cui_zone; + + xfs_bui_zone = kmem_zone_init( + xfs_bui_log_item_sizeof(XFS_BUI_MAX_FAST_EXTENTS), + "xfs_bui_item"); + if (!xfs_bui_zone) + goto out_destroy_bud_zone; + return 0; + out_destroy_bud_zone: + kmem_zone_destroy(xfs_bud_zone); + out_destroy_cui_zone: + kmem_zone_destroy(xfs_cui_zone); + out_destroy_cud_zone: + kmem_zone_destroy(xfs_cud_zone); + out_destroy_rui_zone: + kmem_zone_destroy(xfs_rui_zone); out_destroy_rud_zone: kmem_zone_destroy(xfs_rud_zone); out_destroy_icreate_zone: @@ -1833,6 +1913,10 @@ xfs_destroy_zones(void) * destroy caches. */ rcu_barrier(); + kmem_zone_destroy(xfs_bui_zone); + kmem_zone_destroy(xfs_bud_zone); + kmem_zone_destroy(xfs_cui_zone); + kmem_zone_destroy(xfs_cud_zone); kmem_zone_destroy(xfs_rui_zone); kmem_zone_destroy(xfs_rud_zone); kmem_zone_destroy(xfs_icreate_zone); @@ -1886,6 +1970,8 @@ init_xfs_fs(void) xfs_extent_free_init_defer_op(); xfs_rmap_update_init_defer_op(); + xfs_refcount_update_init_defer_op(); + xfs_bmap_update_init_defer_op(); xfs_dir_startup(); diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 529bce9..b6418ab 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -61,6 +61,7 @@ struct xfs_mount; struct xfs_buftarg; struct block_device; +extern void xfs_quiesce_attr(struct xfs_mount *mp); extern void xfs_flush_inodes(struct xfs_mount *mp); extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index aed74d3..afe1f66 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -184,6 +184,15 @@ static struct ctl_table xfs_table[] = { .extra1 = &xfs_params.eofb_timer.min, .extra2 = &xfs_params.eofb_timer.max, }, + { + .procname = "speculative_cow_prealloc_lifetime", + .data = &xfs_params.cowb_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.cowb_timer.min, + .extra2 = &xfs_params.cowb_timer.max, + }, /* please keep this the last entry */ #ifdef CONFIG_PROC_FS { diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index ffef453..984a349 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -48,6 +48,7 @@ typedef struct xfs_param { xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */ + xfs_sysctl_val_t cowb_timer; /* Interval between cowb scan wakeups */ } xfs_param_t; /* diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 79cfd3f..276d302 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -393,9 +393,15 @@ max_retries_show( struct kobject *kobject, char *buf) { + int retries; struct xfs_error_cfg *cfg = to_error_cfg(kobject); - return snprintf(buf, PAGE_SIZE, "%d\n", cfg->max_retries); + if (cfg->retry_timeout == XFS_ERR_RETRY_FOREVER) + retries = -1; + else + retries = cfg->max_retries; + + return snprintf(buf, PAGE_SIZE, "%d\n", retries); } static ssize_t @@ -415,7 +421,10 @@ max_retries_store( if (val < -1) return -EINVAL; - cfg->max_retries = val; + if (val == -1) + cfg->retry_timeout = XFS_ERR_RETRY_FOREVER; + else + cfg->max_retries = val; return count; } XFS_SYSFS_ATTR_RW(max_retries); @@ -425,10 +434,15 @@ retry_timeout_seconds_show( struct kobject *kobject, char *buf) { + int timeout; struct xfs_error_cfg *cfg = to_error_cfg(kobject); - return snprintf(buf, PAGE_SIZE, "%ld\n", - jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC); + if (cfg->retry_timeout == XFS_ERR_RETRY_FOREVER) + timeout = -1; + else + timeout = jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC; + + return snprintf(buf, PAGE_SIZE, "%d\n", timeout); } static ssize_t @@ -445,11 +459,16 @@ retry_timeout_seconds_store( if (ret) return ret; - /* 1 day timeout maximum */ - if (val < 0 || val > 86400) + /* 1 day timeout maximum, -1 means infinite */ + if (val < -1 || val > 86400) return -EINVAL; - cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC); + if (val == -1) + cfg->retry_timeout = XFS_ERR_RETRY_FOREVER; + else { + cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC); + ASSERT(msecs_to_jiffies(val * MSEC_PER_SEC) < LONG_MAX); + } return count; } XFS_SYSFS_ATTR_RW(retry_timeout_seconds); @@ -493,13 +512,13 @@ static struct attribute *xfs_error_attrs[] = { }; -struct kobj_type xfs_error_cfg_ktype = { +static struct kobj_type xfs_error_cfg_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_attrs = xfs_error_attrs, }; -struct kobj_type xfs_error_ktype = { +static struct kobj_type xfs_error_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, }; @@ -519,18 +538,19 @@ struct xfs_error_init { static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = { { .name = "default", .max_retries = XFS_ERR_RETRY_FOREVER, - .retry_timeout = 0, + .retry_timeout = XFS_ERR_RETRY_FOREVER, }, { .name = "EIO", .max_retries = XFS_ERR_RETRY_FOREVER, - .retry_timeout = 0, + .retry_timeout = XFS_ERR_RETRY_FOREVER, }, { .name = "ENOSPC", .max_retries = XFS_ERR_RETRY_FOREVER, - .retry_timeout = 0, + .retry_timeout = XFS_ERR_RETRY_FOREVER, }, { .name = "ENODEV", - .max_retries = 0, + .max_retries = 0, /* We can't recover from devices disappearing */ + .retry_timeout = 0, }, }; @@ -561,7 +581,10 @@ xfs_error_sysfs_init_class( goto out_error; cfg->max_retries = init[i].max_retries; - cfg->retry_timeout = msecs_to_jiffies( + if (init[i].retry_timeout == XFS_ERR_RETRY_FOREVER) + cfg->retry_timeout = XFS_ERR_RETRY_FOREVER; + else + cfg->retry_timeout = msecs_to_jiffies( init[i].retry_timeout * MSEC_PER_SEC); } return 0; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index d303a66..0907752 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -39,6 +39,7 @@ struct xfs_buf_log_format; struct xfs_inode_log_format; struct xfs_bmbt_irec; struct xfs_btree_cur; +struct xfs_refcount_irec; DECLARE_EVENT_CLASS(xfs_attr_list_class, TP_PROTO(struct xfs_attr_list_context *ctx), @@ -135,6 +136,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_cowblocks); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_cowblocks); DECLARE_EVENT_CLASS(xfs_ag_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), @@ -268,10 +271,10 @@ DECLARE_EVENT_CLASS(xfs_bmap_class, __field(unsigned long, caller_ip) ), TP_fast_assign( - struct xfs_ifork *ifp = (state & BMAP_ATTRFORK) ? - ip->i_afp : &ip->i_df; + struct xfs_ifork *ifp; struct xfs_bmbt_irec r; + ifp = xfs_iext_state_to_fork(ip, state); xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r); __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; @@ -686,6 +689,9 @@ DEFINE_INODE_EVENT(xfs_dquot_dqdetach); DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); +DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag); +DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag); +DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid); DEFINE_INODE_EVENT(xfs_filemap_fault); DEFINE_INODE_EVENT(xfs_filemap_pmd_fault); @@ -1170,7 +1176,6 @@ DEFINE_RW_EVENT(xfs_file_dax_read); DEFINE_RW_EVENT(xfs_file_buffered_write); DEFINE_RW_EVENT(xfs_file_direct_write); DEFINE_RW_EVENT(xfs_file_dax_write); -DEFINE_RW_EVENT(xfs_file_splice_read); DECLARE_EVENT_CLASS(xfs_page_class, TP_PROTO(struct inode *inode, struct page *page, unsigned long off, @@ -1570,14 +1575,15 @@ TRACE_EVENT(xfs_agf, TRACE_EVENT(xfs_free_extent, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, - xfs_extlen_t len, bool isfl, int haveleft, int haveright), - TP_ARGS(mp, agno, agbno, len, isfl, haveleft, haveright), + xfs_extlen_t len, enum xfs_ag_resv_type resv, int haveleft, + int haveright), + TP_ARGS(mp, agno, agbno, len, resv, haveleft, haveright), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) - __field(int, isfl) + __field(int, resv) __field(int, haveleft) __field(int, haveright) ), @@ -1586,16 +1592,16 @@ TRACE_EVENT(xfs_free_extent, __entry->agno = agno; __entry->agbno = agbno; __entry->len = len; - __entry->isfl = isfl; + __entry->resv = resv; __entry->haveleft = haveleft; __entry->haveright = haveright; ), - TP_printk("dev %d:%d agno %u agbno %u len %u isfl %d %s", + TP_printk("dev %d:%d agno %u agbno %u len %u resv %d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, __entry->len, - __entry->isfl, + __entry->resv, __entry->haveleft ? (__entry->haveright ? "both" : "left") : (__entry->haveright ? "right" : "none")) @@ -1622,8 +1628,8 @@ DECLARE_EVENT_CLASS(xfs_alloc_class, __field(short, otype) __field(char, wasdel) __field(char, wasfromfl) - __field(char, isfl) - __field(char, userdata) + __field(int, resv) + __field(int, datatype) __field(xfs_fsblock_t, firstblock) ), TP_fast_assign( @@ -1643,14 +1649,14 @@ DECLARE_EVENT_CLASS(xfs_alloc_class, __entry->otype = args->otype; __entry->wasdel = args->wasdel; __entry->wasfromfl = args->wasfromfl; - __entry->isfl = args->isfl; - __entry->userdata = args->userdata; + __entry->resv = args->resv; + __entry->datatype = args->datatype; __entry->firstblock = args->firstblock; ), TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u " "prod %u minleft %u total %u alignment %u minalignslop %u " - "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d " - "userdata %d firstblock 0x%llx", + "len %u type %s otype %s wasdel %d wasfromfl %d resv %d " + "datatype 0x%x firstblock 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -1667,8 +1673,8 @@ DECLARE_EVENT_CLASS(xfs_alloc_class, __print_symbolic(__entry->otype, XFS_ALLOC_TYPES), __entry->wasdel, __entry->wasfromfl, - __entry->isfl, - __entry->userdata, + __entry->resv, + __entry->datatype, (unsigned long long)__entry->firstblock) ) @@ -1984,6 +1990,29 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \ DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); +TRACE_EVENT(xfs_log_recover_record, + TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass), + TP_ARGS(log, rhead, pass), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_lsn_t, lsn) + __field(int, len) + __field(int, num_logops) + __field(int, pass) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->lsn = be64_to_cpu(rhead->h_lsn); + __entry->len = be32_to_cpu(rhead->h_len); + __entry->num_logops = be32_to_cpu(rhead->h_num_logops); + __entry->pass = pass; + ), + TP_printk("dev %d:%d lsn 0x%llx len 0x%x num_logops 0x%x pass %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->lsn, __entry->len, __entry->num_logops, + __entry->pass) +) + DECLARE_EVENT_CLASS(xfs_log_recover_item_class, TP_PROTO(struct xlog *log, struct xlog_recover *trans, struct xlog_recover_item *item, int pass), @@ -1992,6 +2021,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_item_class, __field(dev_t, dev) __field(unsigned long, item) __field(xlog_tid_t, tid) + __field(xfs_lsn_t, lsn) __field(int, type) __field(int, pass) __field(int, count) @@ -2001,15 +2031,17 @@ DECLARE_EVENT_CLASS(xfs_log_recover_item_class, __entry->dev = log->l_mp->m_super->s_dev; __entry->item = (unsigned long)item; __entry->tid = trans->r_log_tid; + __entry->lsn = trans->r_lsn; __entry->type = ITEM_TYPE(item); __entry->pass = pass; __entry->count = item->ri_cnt; __entry->total = item->ri_total; ), - TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s " - "item region count/total %d/%d", + TP_printk("dev %d:%d tid 0x%x lsn 0x%llx, pass %d, item 0x%p, " + "item type %s item region count/total %d/%d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, + __entry->lsn, __entry->pass, (void *)__entry->item, __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), @@ -2068,6 +2100,7 @@ DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_skip); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf); @@ -2554,10 +2587,794 @@ DEFINE_RMAPBT_EVENT(xfs_rmap_delete); DEFINE_AG_ERROR_EVENT(xfs_rmap_insert_error); DEFINE_AG_ERROR_EVENT(xfs_rmap_delete_error); DEFINE_AG_ERROR_EVENT(xfs_rmap_update_error); + +DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_candidate); +DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_query); +DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_candidate); +DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range); DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_result); DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result); DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result); +/* deferred bmbt updates */ +#define DEFINE_BMAP_DEFERRED_EVENT DEFINE_RMAP_DEFERRED_EVENT +DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_defer); +DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_deferred); + +/* per-AG reservation */ +DECLARE_EVENT_CLASS(xfs_ag_resv_class, + TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type resv, + xfs_extlen_t len), + TP_ARGS(pag, resv, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, resv) + __field(xfs_extlen_t, freeblks) + __field(xfs_extlen_t, flcount) + __field(xfs_extlen_t, reserved) + __field(xfs_extlen_t, asked) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + struct xfs_ag_resv *r = xfs_perag_resv(pag, resv); + + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->resv = resv; + __entry->freeblks = pag->pagf_freeblks; + __entry->flcount = pag->pagf_flcount; + __entry->reserved = r ? r->ar_reserved : 0; + __entry->asked = r ? r->ar_asked : 0; + __entry->len = len; + ), + TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u resv %u ask %u len %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->resv, + __entry->freeblks, + __entry->flcount, + __entry->reserved, + __entry->asked, + __entry->len) +) +#define DEFINE_AG_RESV_EVENT(name) \ +DEFINE_EVENT(xfs_ag_resv_class, name, \ + TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type type, \ + xfs_extlen_t len), \ + TP_ARGS(pag, type, len)) + +/* per-AG reservation tracepoints */ +DEFINE_AG_RESV_EVENT(xfs_ag_resv_init); +DEFINE_AG_RESV_EVENT(xfs_ag_resv_free); +DEFINE_AG_RESV_EVENT(xfs_ag_resv_alloc_extent); +DEFINE_AG_RESV_EVENT(xfs_ag_resv_free_extent); +DEFINE_AG_RESV_EVENT(xfs_ag_resv_critical); +DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed); + +DEFINE_AG_ERROR_EVENT(xfs_ag_resv_free_error); +DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error); + +/* refcount tracepoint classes */ + +/* reuse the discard trace class for agbno/aglen-based traces */ +#define DEFINE_AG_EXTENT_EVENT(name) DEFINE_DISCARD_EVENT(name) + +/* ag btree lookup tracepoint class */ +#define XFS_AG_BTREE_CMP_FORMAT_STR \ + { XFS_LOOKUP_EQ, "eq" }, \ + { XFS_LOOKUP_LE, "le" }, \ + { XFS_LOOKUP_GE, "ge" } +DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_lookup_t dir), + TP_ARGS(mp, agno, agbno, dir), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_lookup_t, dir) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->dir = dir; + ), + TP_printk("dev %d:%d agno %u agbno %u cmp %s(%d)\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __print_symbolic(__entry->dir, XFS_AG_BTREE_CMP_FORMAT_STR), + __entry->dir) +) + +#define DEFINE_AG_BTREE_LOOKUP_EVENT(name) \ +DEFINE_EVENT(xfs_ag_btree_lookup_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_lookup_t dir), \ + TP_ARGS(mp, agno, agbno, dir)) + +/* single-rcext tracepoint class */ +DECLARE_EVENT_CLASS(xfs_refcount_extent_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_refcount_irec *irec), + TP_ARGS(mp, agno, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, startblock) + __field(xfs_extlen_t, blockcount) + __field(xfs_nlink_t, refcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startblock = irec->rc_startblock; + __entry->blockcount = irec->rc_blockcount; + __entry->refcount = irec->rc_refcount; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startblock, + __entry->blockcount, + __entry->refcount) +) + +#define DEFINE_REFCOUNT_EXTENT_EVENT(name) \ +DEFINE_EVENT(xfs_refcount_extent_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + struct xfs_refcount_irec *irec), \ + TP_ARGS(mp, agno, irec)) + +/* single-rcext and an agbno tracepoint class */ +DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_refcount_irec *irec, xfs_agblock_t agbno), + TP_ARGS(mp, agno, irec, agbno), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, startblock) + __field(xfs_extlen_t, blockcount) + __field(xfs_nlink_t, refcount) + __field(xfs_agblock_t, agbno) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startblock = irec->rc_startblock; + __entry->blockcount = irec->rc_blockcount; + __entry->refcount = irec->rc_refcount; + __entry->agbno = agbno; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u @ agbno %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startblock, + __entry->blockcount, + __entry->refcount, + __entry->agbno) +) + +#define DEFINE_REFCOUNT_EXTENT_AT_EVENT(name) \ +DEFINE_EVENT(xfs_refcount_extent_at_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + struct xfs_refcount_irec *irec, xfs_agblock_t agbno), \ + TP_ARGS(mp, agno, irec, agbno)) + +/* double-rcext tracepoint class */ +DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2), + TP_ARGS(mp, agno, i1, i2), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, i1_startblock) + __field(xfs_extlen_t, i1_blockcount) + __field(xfs_nlink_t, i1_refcount) + __field(xfs_agblock_t, i2_startblock) + __field(xfs_extlen_t, i2_blockcount) + __field(xfs_nlink_t, i2_refcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->i1_startblock = i1->rc_startblock; + __entry->i1_blockcount = i1->rc_blockcount; + __entry->i1_refcount = i1->rc_refcount; + __entry->i2_startblock = i2->rc_startblock; + __entry->i2_blockcount = i2->rc_blockcount; + __entry->i2_refcount = i2->rc_refcount; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " + "agbno %u len %u refcount %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->i1_startblock, + __entry->i1_blockcount, + __entry->i1_refcount, + __entry->i2_startblock, + __entry->i2_blockcount, + __entry->i2_refcount) +) + +#define DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(name) \ +DEFINE_EVENT(xfs_refcount_double_extent_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2), \ + TP_ARGS(mp, agno, i1, i2)) + +/* double-rcext and an agbno tracepoint class */ +DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, + xfs_agblock_t agbno), + TP_ARGS(mp, agno, i1, i2, agbno), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, i1_startblock) + __field(xfs_extlen_t, i1_blockcount) + __field(xfs_nlink_t, i1_refcount) + __field(xfs_agblock_t, i2_startblock) + __field(xfs_extlen_t, i2_blockcount) + __field(xfs_nlink_t, i2_refcount) + __field(xfs_agblock_t, agbno) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->i1_startblock = i1->rc_startblock; + __entry->i1_blockcount = i1->rc_blockcount; + __entry->i1_refcount = i1->rc_refcount; + __entry->i2_startblock = i2->rc_startblock; + __entry->i2_blockcount = i2->rc_blockcount; + __entry->i2_refcount = i2->rc_refcount; + __entry->agbno = agbno; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " + "agbno %u len %u refcount %u @ agbno %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->i1_startblock, + __entry->i1_blockcount, + __entry->i1_refcount, + __entry->i2_startblock, + __entry->i2_blockcount, + __entry->i2_refcount, + __entry->agbno) +) + +#define DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(name) \ +DEFINE_EVENT(xfs_refcount_double_extent_at_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, \ + xfs_agblock_t agbno), \ + TP_ARGS(mp, agno, i1, i2, agbno)) + +/* triple-rcext tracepoint class */ +DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, + struct xfs_refcount_irec *i3), + TP_ARGS(mp, agno, i1, i2, i3), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, i1_startblock) + __field(xfs_extlen_t, i1_blockcount) + __field(xfs_nlink_t, i1_refcount) + __field(xfs_agblock_t, i2_startblock) + __field(xfs_extlen_t, i2_blockcount) + __field(xfs_nlink_t, i2_refcount) + __field(xfs_agblock_t, i3_startblock) + __field(xfs_extlen_t, i3_blockcount) + __field(xfs_nlink_t, i3_refcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->i1_startblock = i1->rc_startblock; + __entry->i1_blockcount = i1->rc_blockcount; + __entry->i1_refcount = i1->rc_refcount; + __entry->i2_startblock = i2->rc_startblock; + __entry->i2_blockcount = i2->rc_blockcount; + __entry->i2_refcount = i2->rc_refcount; + __entry->i3_startblock = i3->rc_startblock; + __entry->i3_blockcount = i3->rc_blockcount; + __entry->i3_refcount = i3->rc_refcount; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " + "agbno %u len %u refcount %u -- " + "agbno %u len %u refcount %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->i1_startblock, + __entry->i1_blockcount, + __entry->i1_refcount, + __entry->i2_startblock, + __entry->i2_blockcount, + __entry->i2_refcount, + __entry->i3_startblock, + __entry->i3_blockcount, + __entry->i3_refcount) +); + +#define DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(name) \ +DEFINE_EVENT(xfs_refcount_triple_extent_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, \ + struct xfs_refcount_irec *i3), \ + TP_ARGS(mp, agno, i1, i2, i3)) + +/* refcount btree tracepoints */ +DEFINE_BUSY_EVENT(xfs_refcountbt_alloc_block); +DEFINE_BUSY_EVENT(xfs_refcountbt_free_block); +DEFINE_AG_BTREE_LOOKUP_EVENT(xfs_refcount_lookup); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_get); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_update); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_insert); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_delete); +DEFINE_AG_ERROR_EVENT(xfs_refcount_insert_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_delete_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_update_error); + +/* refcount adjustment tracepoints */ +DEFINE_AG_EXTENT_EVENT(xfs_refcount_increase); +DEFINE_AG_EXTENT_EVENT(xfs_refcount_decrease); +DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_increase); +DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_decrease); +DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(xfs_refcount_merge_center_extents); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_modify_extent); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_recover_extent); +DEFINE_REFCOUNT_EXTENT_AT_EVENT(xfs_refcount_split_extent); +DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_left_extent); +DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_right_extent); +DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_left_extent); +DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_right_extent); +DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_cow_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_center_extents_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_modify_extent_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_split_extent_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_left_extent_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_right_extent_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_find_left_extent_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_find_right_extent_error); + +/* reflink helpers */ +DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared); +DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared_result); +DEFINE_AG_ERROR_EVENT(xfs_refcount_find_shared_error); +#define DEFINE_REFCOUNT_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT +DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_defer); +DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred); + +TRACE_EVENT(xfs_refcount_finish_one_leftover, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + int type, xfs_agblock_t agbno, xfs_extlen_t len, + xfs_agblock_t new_agbno, xfs_extlen_t new_len), + TP_ARGS(mp, agno, type, agbno, len, new_agbno, new_len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, type) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(xfs_agblock_t, new_agbno) + __field(xfs_extlen_t, new_len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->type = type; + __entry->agbno = agbno; + __entry->len = len; + __entry->new_agbno = new_agbno; + __entry->new_len = new_len; + ), + TP_printk("dev %d:%d type %d agno %u agbno %u len %u new_agbno %u new_len %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->agno, + __entry->agbno, + __entry->len, + __entry->new_agbno, + __entry->new_len) +); + +/* simple inode-based error/%ip tracepoint class */ +DECLARE_EVENT_CLASS(xfs_inode_error_class, + TP_PROTO(struct xfs_inode *ip, int error, unsigned long caller_ip), + TP_ARGS(ip, error, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, error) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->error = error; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino %llx error %d caller %ps", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->error, + (char *)__entry->caller_ip) +); + +#define DEFINE_INODE_ERROR_EVENT(name) \ +DEFINE_EVENT(xfs_inode_error_class, name, \ + TP_PROTO(struct xfs_inode *ip, int error, \ + unsigned long caller_ip), \ + TP_ARGS(ip, error, caller_ip)) + +/* reflink allocator */ +TRACE_EVENT(xfs_bmap_remap_alloc, + TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t fsbno, + xfs_extlen_t len), + TP_ARGS(ip, fsbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsblock_t, fsbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->fsbno = fsbno; + __entry->len = len; + ), + TP_printk("dev %d:%d ino 0x%llx fsbno 0x%llx len %x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->fsbno, + __entry->len) +); +DEFINE_INODE_ERROR_EVENT(xfs_bmap_remap_alloc_error); + +/* reflink tracepoint classes */ + +/* two-file io tracepoint class */ +DECLARE_EVENT_CLASS(xfs_double_io_class, + TP_PROTO(struct xfs_inode *src, xfs_off_t soffset, xfs_off_t len, + struct xfs_inode *dest, xfs_off_t doffset), + TP_ARGS(src, soffset, len, dest, doffset), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, src_ino) + __field(loff_t, src_isize) + __field(loff_t, src_disize) + __field(loff_t, src_offset) + __field(size_t, len) + __field(xfs_ino_t, dest_ino) + __field(loff_t, dest_isize) + __field(loff_t, dest_disize) + __field(loff_t, dest_offset) + ), + TP_fast_assign( + __entry->dev = VFS_I(src)->i_sb->s_dev; + __entry->src_ino = src->i_ino; + __entry->src_isize = VFS_I(src)->i_size; + __entry->src_disize = src->i_d.di_size; + __entry->src_offset = soffset; + __entry->len = len; + __entry->dest_ino = dest->i_ino; + __entry->dest_isize = VFS_I(dest)->i_size; + __entry->dest_disize = dest->i_d.di_size; + __entry->dest_offset = doffset; + ), + TP_printk("dev %d:%d count %zd " + "ino 0x%llx isize 0x%llx disize 0x%llx offset 0x%llx -> " + "ino 0x%llx isize 0x%llx disize 0x%llx offset 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->len, + __entry->src_ino, + __entry->src_isize, + __entry->src_disize, + __entry->src_offset, + __entry->dest_ino, + __entry->dest_isize, + __entry->dest_disize, + __entry->dest_offset) +) + +#define DEFINE_DOUBLE_IO_EVENT(name) \ +DEFINE_EVENT(xfs_double_io_class, name, \ + TP_PROTO(struct xfs_inode *src, xfs_off_t soffset, xfs_off_t len, \ + struct xfs_inode *dest, xfs_off_t doffset), \ + TP_ARGS(src, soffset, len, dest, doffset)) + +/* two-file vfs io tracepoint class */ +DECLARE_EVENT_CLASS(xfs_double_vfs_io_class, + TP_PROTO(struct inode *src, u64 soffset, u64 len, + struct inode *dest, u64 doffset), + TP_ARGS(src, soffset, len, dest, doffset), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, src_ino) + __field(loff_t, src_isize) + __field(loff_t, src_offset) + __field(size_t, len) + __field(unsigned long, dest_ino) + __field(loff_t, dest_isize) + __field(loff_t, dest_offset) + ), + TP_fast_assign( + __entry->dev = src->i_sb->s_dev; + __entry->src_ino = src->i_ino; + __entry->src_isize = i_size_read(src); + __entry->src_offset = soffset; + __entry->len = len; + __entry->dest_ino = dest->i_ino; + __entry->dest_isize = i_size_read(dest); + __entry->dest_offset = doffset; + ), + TP_printk("dev %d:%d count %zd " + "ino 0x%lx isize 0x%llx offset 0x%llx -> " + "ino 0x%lx isize 0x%llx offset 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->len, + __entry->src_ino, + __entry->src_isize, + __entry->src_offset, + __entry->dest_ino, + __entry->dest_isize, + __entry->dest_offset) +) + +#define DEFINE_DOUBLE_VFS_IO_EVENT(name) \ +DEFINE_EVENT(xfs_double_vfs_io_class, name, \ + TP_PROTO(struct inode *src, u64 soffset, u64 len, \ + struct inode *dest, u64 doffset), \ + TP_ARGS(src, soffset, len, dest, doffset)) + +/* CoW write tracepoint */ +DECLARE_EVENT_CLASS(xfs_copy_on_write_class, + TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk, + xfs_extlen_t len, xfs_fsblock_t new_pblk), + TP_ARGS(ip, lblk, pblk, len, new_pblk), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fileoff_t, lblk) + __field(xfs_fsblock_t, pblk) + __field(xfs_extlen_t, len) + __field(xfs_fsblock_t, new_pblk) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->lblk = lblk; + __entry->pblk = pblk; + __entry->len = len; + __entry->new_pblk = new_pblk; + ), + TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx pblk 0x%llx " + "len 0x%x new_pblk %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->lblk, + __entry->pblk, + __entry->len, + __entry->new_pblk) +) + +#define DEFINE_COW_EVENT(name) \ +DEFINE_EVENT(xfs_copy_on_write_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk, \ + xfs_extlen_t len, xfs_fsblock_t new_pblk), \ + TP_ARGS(ip, lblk, pblk, len, new_pblk)) + +/* inode/irec events */ +DECLARE_EVENT_CLASS(xfs_inode_irec_class, + TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), + TP_ARGS(ip, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fileoff_t, lblk) + __field(xfs_extlen_t, len) + __field(xfs_fsblock_t, pblk) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->lblk = irec->br_startoff; + __entry->len = irec->br_blockcount; + __entry->pblk = irec->br_startblock; + ), + TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->lblk, + __entry->len, + __entry->pblk) +); +#define DEFINE_INODE_IREC_EVENT(name) \ +DEFINE_EVENT(xfs_inode_irec_class, name, \ + TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), \ + TP_ARGS(ip, irec)) + +/* refcount/reflink tracepoint definitions */ + +/* reflink tracepoints */ +DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag); +DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag); +DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size); +DEFINE_IOMAP_EVENT(xfs_reflink_remap_imap); +TRACE_EVENT(xfs_reflink_remap_blocks_loop, + TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset, + xfs_filblks_t len, struct xfs_inode *dest, + xfs_fileoff_t doffset), + TP_ARGS(src, soffset, len, dest, doffset), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, src_ino) + __field(xfs_fileoff_t, src_lblk) + __field(xfs_filblks_t, len) + __field(xfs_ino_t, dest_ino) + __field(xfs_fileoff_t, dest_lblk) + ), + TP_fast_assign( + __entry->dev = VFS_I(src)->i_sb->s_dev; + __entry->src_ino = src->i_ino; + __entry->src_lblk = soffset; + __entry->len = len; + __entry->dest_ino = dest->i_ino; + __entry->dest_lblk = doffset; + ), + TP_printk("dev %d:%d len 0x%llx " + "ino 0x%llx offset 0x%llx blocks -> " + "ino 0x%llx offset 0x%llx blocks", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->len, + __entry->src_ino, + __entry->src_lblk, + __entry->dest_ino, + __entry->dest_lblk) +); +TRACE_EVENT(xfs_reflink_punch_range, + TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, + xfs_extlen_t len), + TP_ARGS(ip, lblk, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fileoff_t, lblk) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->lblk = lblk; + __entry->len = len; + ), + TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->lblk, + __entry->len) +); +TRACE_EVENT(xfs_reflink_remap, + TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, + xfs_extlen_t len, xfs_fsblock_t new_pblk), + TP_ARGS(ip, lblk, len, new_pblk), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fileoff_t, lblk) + __field(xfs_extlen_t, len) + __field(xfs_fsblock_t, new_pblk) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->lblk = lblk; + __entry->len = len; + __entry->new_pblk = new_pblk; + ), + TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x new_pblk %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->lblk, + __entry->len, + __entry->new_pblk) +); +DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_reflink_main_loop_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_read_iomap_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error); + +/* dedupe tracepoints */ +DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_compare_extents_error); + +/* ioctl tracepoints */ +DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_reflink); +DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_clone_range); +DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_file_extent_same); +TRACE_EVENT(xfs_ioctl_clone, + TP_PROTO(struct inode *src, struct inode *dest), + TP_ARGS(src, dest), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, src_ino) + __field(loff_t, src_isize) + __field(unsigned long, dest_ino) + __field(loff_t, dest_isize) + ), + TP_fast_assign( + __entry->dev = src->i_sb->s_dev; + __entry->src_ino = src->i_ino; + __entry->src_isize = i_size_read(src); + __entry->dest_ino = dest->i_ino; + __entry->dest_isize = i_size_read(dest); + ), + TP_printk("dev %d:%d " + "ino 0x%lx isize 0x%llx -> " + "ino 0x%lx isize 0x%llx\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->src_ino, + __entry->src_isize, + __entry->dest_ino, + __entry->dest_isize) +); + +/* unshare tracepoints */ +DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare); +DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cow_eof_block); +DEFINE_PAGE_EVENT(xfs_reflink_unshare_page); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_cow_eof_block_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_dirty_page_error); + +/* copy on write */ +DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc); + +DEFINE_RW_EVENT(xfs_reflink_reserve_cow); +DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range); + +DEFINE_INODE_IREC_EVENT(xfs_reflink_bounce_dio_write); +DEFINE_IOMAP_EVENT(xfs_reflink_find_cow_mapping); +DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_irec); + +DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); +DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap); + +DEFINE_INODE_ERROR_EVENT(xfs_reflink_allocate_cow_range_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error); + +DEFINE_COW_EVENT(xfs_reflink_fork_buf); +DEFINE_COW_EVENT(xfs_reflink_finish_fork_buf); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_fork_buf_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_finish_fork_buf_error); + +DEFINE_INODE_EVENT(xfs_reflink_cancel_pending_cow); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cancel_cow); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_pending_cow_error); + +/* rmap swapext tracepoints */ +DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap); +DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap_piece); +DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 5f3d33d..70f42ea 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -217,7 +217,7 @@ undo_log: undo_blocks: if (blocks > 0) { - xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd); + xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd); tp->t_blk_res = 0; } @@ -318,7 +318,6 @@ xfs_trans_mod_sb( * in-core superblock's counter. This should only * be applied to the on-disk superblock. */ - ASSERT(delta < 0); tp->t_res_fdblocks_delta += delta; if (xfs_sb_version_haslazysbcount(&mp->m_sb)) flags &= ~XFS_TRANS_SB_DIRTY; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index e2bf86a..61b7fbd 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -36,6 +36,11 @@ struct xfs_busy_extent; struct xfs_rud_log_item; struct xfs_rui_log_item; struct xfs_btree_cur; +struct xfs_cui_log_item; +struct xfs_cud_log_item; +struct xfs_defer_ops; +struct xfs_bui_log_item; +struct xfs_bud_log_item; typedef struct xfs_log_item { struct list_head li_ail; /* AIL pointers */ @@ -248,4 +253,28 @@ int xfs_trans_log_finish_rmap_update(struct xfs_trans *tp, xfs_fsblock_t startblock, xfs_filblks_t blockcount, xfs_exntst_t state, struct xfs_btree_cur **pcur); +/* refcount updates */ +enum xfs_refcount_intent_type; + +void xfs_refcount_update_init_defer_op(void); +struct xfs_cud_log_item *xfs_trans_get_cud(struct xfs_trans *tp, + struct xfs_cui_log_item *cuip); +int xfs_trans_log_finish_refcount_update(struct xfs_trans *tp, + struct xfs_cud_log_item *cudp, struct xfs_defer_ops *dfops, + enum xfs_refcount_intent_type type, xfs_fsblock_t startblock, + xfs_extlen_t blockcount, xfs_fsblock_t *new_fsb, + xfs_extlen_t *new_len, struct xfs_btree_cur **pcur); + +/* mapping updates */ +enum xfs_bmap_intent_type; + +void xfs_bmap_update_init_defer_op(void); +struct xfs_bud_log_item *xfs_trans_get_bud(struct xfs_trans *tp, + struct xfs_bui_log_item *buip); +int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp, + struct xfs_bud_log_item *rudp, struct xfs_defer_ops *dfops, + enum xfs_bmap_intent_type type, struct xfs_inode *ip, + int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, + xfs_filblks_t blockcount, xfs_exntst_t state); + #endif /* __XFS_TRANS_H__ */ diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c new file mode 100644 index 0000000..6408e7d --- /dev/null +++ b/fs/xfs/xfs_trans_bmap.c @@ -0,0 +1,249 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_bmap_item.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_inode.h" + +/* + * This routine is called to allocate a "bmap update done" + * log item. + */ +struct xfs_bud_log_item * +xfs_trans_get_bud( + struct xfs_trans *tp, + struct xfs_bui_log_item *buip) +{ + struct xfs_bud_log_item *budp; + + budp = xfs_bud_init(tp->t_mountp, buip); + xfs_trans_add_item(tp, &budp->bud_item); + return budp; +} + +/* + * Finish an bmap update and log it to the BUD. Note that the + * transaction is marked dirty regardless of whether the bmap update + * succeeds or fails to support the BUI/BUD lifecycle rules. + */ +int +xfs_trans_log_finish_bmap_update( + struct xfs_trans *tp, + struct xfs_bud_log_item *budp, + struct xfs_defer_ops *dop, + enum xfs_bmap_intent_type type, + struct xfs_inode *ip, + int whichfork, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + xfs_exntst_t state) +{ + int error; + + error = xfs_bmap_finish_one(tp, dop, ip, type, whichfork, startoff, + startblock, blockcount, state); + + /* + * Mark the transaction dirty, even on error. This ensures the + * transaction is aborted, which: + * + * 1.) releases the BUI and frees the BUD + * 2.) shuts down the filesystem + */ + tp->t_flags |= XFS_TRANS_DIRTY; + budp->bud_item.li_desc->lid_flags |= XFS_LID_DIRTY; + + return error; +} + +/* Sort bmap intents by inode. */ +static int +xfs_bmap_update_diff_items( + void *priv, + struct list_head *a, + struct list_head *b) +{ + struct xfs_bmap_intent *ba; + struct xfs_bmap_intent *bb; + + ba = container_of(a, struct xfs_bmap_intent, bi_list); + bb = container_of(b, struct xfs_bmap_intent, bi_list); + return ba->bi_owner->i_ino - bb->bi_owner->i_ino; +} + +/* Get an BUI. */ +STATIC void * +xfs_bmap_update_create_intent( + struct xfs_trans *tp, + unsigned int count) +{ + struct xfs_bui_log_item *buip; + + ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); + ASSERT(tp != NULL); + + buip = xfs_bui_init(tp->t_mountp); + ASSERT(buip != NULL); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &buip->bui_item); + return buip; +} + +/* Set the map extent flags for this mapping. */ +static void +xfs_trans_set_bmap_flags( + struct xfs_map_extent *bmap, + enum xfs_bmap_intent_type type, + int whichfork, + xfs_exntst_t state) +{ + bmap->me_flags = 0; + switch (type) { + case XFS_BMAP_MAP: + case XFS_BMAP_UNMAP: + bmap->me_flags = type; + break; + default: + ASSERT(0); + } + if (state == XFS_EXT_UNWRITTEN) + bmap->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN; + if (whichfork == XFS_ATTR_FORK) + bmap->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK; +} + +/* Log bmap updates in the intent item. */ +STATIC void +xfs_bmap_update_log_item( + struct xfs_trans *tp, + void *intent, + struct list_head *item) +{ + struct xfs_bui_log_item *buip = intent; + struct xfs_bmap_intent *bmap; + uint next_extent; + struct xfs_map_extent *map; + + bmap = container_of(item, struct xfs_bmap_intent, bi_list); + + tp->t_flags |= XFS_TRANS_DIRTY; + buip->bui_item.li_desc->lid_flags |= XFS_LID_DIRTY; + + /* + * atomic_inc_return gives us the value after the increment; + * we want to use it as an array index so we need to subtract 1 from + * it. + */ + next_extent = atomic_inc_return(&buip->bui_next_extent) - 1; + ASSERT(next_extent < buip->bui_format.bui_nextents); + map = &buip->bui_format.bui_extents[next_extent]; + map->me_owner = bmap->bi_owner->i_ino; + map->me_startblock = bmap->bi_bmap.br_startblock; + map->me_startoff = bmap->bi_bmap.br_startoff; + map->me_len = bmap->bi_bmap.br_blockcount; + xfs_trans_set_bmap_flags(map, bmap->bi_type, bmap->bi_whichfork, + bmap->bi_bmap.br_state); +} + +/* Get an BUD so we can process all the deferred rmap updates. */ +STATIC void * +xfs_bmap_update_create_done( + struct xfs_trans *tp, + void *intent, + unsigned int count) +{ + return xfs_trans_get_bud(tp, intent); +} + +/* Process a deferred rmap update. */ +STATIC int +xfs_bmap_update_finish_item( + struct xfs_trans *tp, + struct xfs_defer_ops *dop, + struct list_head *item, + void *done_item, + void **state) +{ + struct xfs_bmap_intent *bmap; + int error; + + bmap = container_of(item, struct xfs_bmap_intent, bi_list); + error = xfs_trans_log_finish_bmap_update(tp, done_item, dop, + bmap->bi_type, + bmap->bi_owner, bmap->bi_whichfork, + bmap->bi_bmap.br_startoff, + bmap->bi_bmap.br_startblock, + bmap->bi_bmap.br_blockcount, + bmap->bi_bmap.br_state); + kmem_free(bmap); + return error; +} + +/* Abort all pending BUIs. */ +STATIC void +xfs_bmap_update_abort_intent( + void *intent) +{ + xfs_bui_release(intent); +} + +/* Cancel a deferred rmap update. */ +STATIC void +xfs_bmap_update_cancel_item( + struct list_head *item) +{ + struct xfs_bmap_intent *bmap; + + bmap = container_of(item, struct xfs_bmap_intent, bi_list); + kmem_free(bmap); +} + +static const struct xfs_defer_op_type xfs_bmap_update_defer_type = { + .type = XFS_DEFER_OPS_TYPE_BMAP, + .max_items = XFS_BUI_MAX_FAST_EXTENTS, + .diff_items = xfs_bmap_update_diff_items, + .create_intent = xfs_bmap_update_create_intent, + .abort_intent = xfs_bmap_update_abort_intent, + .log_item = xfs_bmap_update_log_item, + .create_done = xfs_bmap_update_create_done, + .finish_item = xfs_bmap_update_finish_item, + .cancel_item = xfs_bmap_update_cancel_item, +}; + +/* Register the deferred op type. */ +void +xfs_bmap_update_init_defer_op(void) +{ + xfs_defer_init_op_type(&xfs_bmap_update_defer_type); +} diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index 459ddec..ab43864 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -79,7 +79,8 @@ xfs_trans_free_extent( trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len); - error = xfs_free_extent(tp, start_block, ext_len, oinfo); + error = xfs_free_extent(tp, start_block, ext_len, oinfo, + XFS_AG_RESV_NONE); /* * Mark the transaction dirty, even on error. This ensures the diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 11a3af0..dab8daa 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -73,7 +73,7 @@ xfs_trans_ichgtime( ASSERT(tp); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - tv = current_fs_time(inode->i_sb); + tv = current_time(inode); if (flags & XFS_ICHGTIME_MOD) inode->i_mtime = tv; diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c new file mode 100644 index 0000000..94c1877 --- /dev/null +++ b/fs/xfs/xfs_trans_refcount.c @@ -0,0 +1,264 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_refcount_item.h" +#include "xfs_alloc.h" +#include "xfs_refcount.h" + +/* + * This routine is called to allocate a "refcount update done" + * log item. + */ +struct xfs_cud_log_item * +xfs_trans_get_cud( + struct xfs_trans *tp, + struct xfs_cui_log_item *cuip) +{ + struct xfs_cud_log_item *cudp; + + cudp = xfs_cud_init(tp->t_mountp, cuip); + xfs_trans_add_item(tp, &cudp->cud_item); + return cudp; +} + +/* + * Finish an refcount update and log it to the CUD. Note that the + * transaction is marked dirty regardless of whether the refcount + * update succeeds or fails to support the CUI/CUD lifecycle rules. + */ +int +xfs_trans_log_finish_refcount_update( + struct xfs_trans *tp, + struct xfs_cud_log_item *cudp, + struct xfs_defer_ops *dop, + enum xfs_refcount_intent_type type, + xfs_fsblock_t startblock, + xfs_extlen_t blockcount, + xfs_fsblock_t *new_fsb, + xfs_extlen_t *new_len, + struct xfs_btree_cur **pcur) +{ + int error; + + error = xfs_refcount_finish_one(tp, dop, type, startblock, + blockcount, new_fsb, new_len, pcur); + + /* + * Mark the transaction dirty, even on error. This ensures the + * transaction is aborted, which: + * + * 1.) releases the CUI and frees the CUD + * 2.) shuts down the filesystem + */ + tp->t_flags |= XFS_TRANS_DIRTY; + cudp->cud_item.li_desc->lid_flags |= XFS_LID_DIRTY; + + return error; +} + +/* Sort refcount intents by AG. */ +static int +xfs_refcount_update_diff_items( + void *priv, + struct list_head *a, + struct list_head *b) +{ + struct xfs_mount *mp = priv; + struct xfs_refcount_intent *ra; + struct xfs_refcount_intent *rb; + + ra = container_of(a, struct xfs_refcount_intent, ri_list); + rb = container_of(b, struct xfs_refcount_intent, ri_list); + return XFS_FSB_TO_AGNO(mp, ra->ri_startblock) - + XFS_FSB_TO_AGNO(mp, rb->ri_startblock); +} + +/* Get an CUI. */ +STATIC void * +xfs_refcount_update_create_intent( + struct xfs_trans *tp, + unsigned int count) +{ + struct xfs_cui_log_item *cuip; + + ASSERT(tp != NULL); + ASSERT(count > 0); + + cuip = xfs_cui_init(tp->t_mountp, count); + ASSERT(cuip != NULL); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &cuip->cui_item); + return cuip; +} + +/* Set the phys extent flags for this reverse mapping. */ +static void +xfs_trans_set_refcount_flags( + struct xfs_phys_extent *refc, + enum xfs_refcount_intent_type type) +{ + refc->pe_flags = 0; + switch (type) { + case XFS_REFCOUNT_INCREASE: + case XFS_REFCOUNT_DECREASE: + case XFS_REFCOUNT_ALLOC_COW: + case XFS_REFCOUNT_FREE_COW: + refc->pe_flags |= type; + break; + default: + ASSERT(0); + } +} + +/* Log refcount updates in the intent item. */ +STATIC void +xfs_refcount_update_log_item( + struct xfs_trans *tp, + void *intent, + struct list_head *item) +{ + struct xfs_cui_log_item *cuip = intent; + struct xfs_refcount_intent *refc; + uint next_extent; + struct xfs_phys_extent *ext; + + refc = container_of(item, struct xfs_refcount_intent, ri_list); + + tp->t_flags |= XFS_TRANS_DIRTY; + cuip->cui_item.li_desc->lid_flags |= XFS_LID_DIRTY; + + /* + * atomic_inc_return gives us the value after the increment; + * we want to use it as an array index so we need to subtract 1 from + * it. + */ + next_extent = atomic_inc_return(&cuip->cui_next_extent) - 1; + ASSERT(next_extent < cuip->cui_format.cui_nextents); + ext = &cuip->cui_format.cui_extents[next_extent]; + ext->pe_startblock = refc->ri_startblock; + ext->pe_len = refc->ri_blockcount; + xfs_trans_set_refcount_flags(ext, refc->ri_type); +} + +/* Get an CUD so we can process all the deferred refcount updates. */ +STATIC void * +xfs_refcount_update_create_done( + struct xfs_trans *tp, + void *intent, + unsigned int count) +{ + return xfs_trans_get_cud(tp, intent); +} + +/* Process a deferred refcount update. */ +STATIC int +xfs_refcount_update_finish_item( + struct xfs_trans *tp, + struct xfs_defer_ops *dop, + struct list_head *item, + void *done_item, + void **state) +{ + struct xfs_refcount_intent *refc; + xfs_fsblock_t new_fsb; + xfs_extlen_t new_aglen; + int error; + + refc = container_of(item, struct xfs_refcount_intent, ri_list); + error = xfs_trans_log_finish_refcount_update(tp, done_item, dop, + refc->ri_type, + refc->ri_startblock, + refc->ri_blockcount, + &new_fsb, &new_aglen, + (struct xfs_btree_cur **)state); + /* Did we run out of reservation? Requeue what we didn't finish. */ + if (!error && new_aglen > 0) { + ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE || + refc->ri_type == XFS_REFCOUNT_DECREASE); + refc->ri_startblock = new_fsb; + refc->ri_blockcount = new_aglen; + return -EAGAIN; + } + kmem_free(refc); + return error; +} + +/* Clean up after processing deferred refcounts. */ +STATIC void +xfs_refcount_update_finish_cleanup( + struct xfs_trans *tp, + void *state, + int error) +{ + struct xfs_btree_cur *rcur = state; + + xfs_refcount_finish_one_cleanup(tp, rcur, error); +} + +/* Abort all pending CUIs. */ +STATIC void +xfs_refcount_update_abort_intent( + void *intent) +{ + xfs_cui_release(intent); +} + +/* Cancel a deferred refcount update. */ +STATIC void +xfs_refcount_update_cancel_item( + struct list_head *item) +{ + struct xfs_refcount_intent *refc; + + refc = container_of(item, struct xfs_refcount_intent, ri_list); + kmem_free(refc); +} + +static const struct xfs_defer_op_type xfs_refcount_update_defer_type = { + .type = XFS_DEFER_OPS_TYPE_REFCOUNT, + .max_items = XFS_CUI_MAX_FAST_EXTENTS, + .diff_items = xfs_refcount_update_diff_items, + .create_intent = xfs_refcount_update_create_intent, + .abort_intent = xfs_refcount_update_abort_intent, + .log_item = xfs_refcount_update_log_item, + .create_done = xfs_refcount_update_create_done, + .finish_item = xfs_refcount_update_finish_item, + .finish_cleanup = xfs_refcount_update_finish_cleanup, + .cancel_item = xfs_refcount_update_cancel_item, +}; + +/* Register the deferred op type. */ +void +xfs_refcount_update_init_defer_op(void) +{ + xfs_defer_init_op_type(&xfs_refcount_update_defer_type); +} diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c index 5a50ef8..9ead064 100644 --- a/fs/xfs/xfs_trans_rmap.c +++ b/fs/xfs/xfs_trans_rmap.c @@ -48,12 +48,21 @@ xfs_trans_set_rmap_flags( case XFS_RMAP_MAP: rmap->me_flags |= XFS_RMAP_EXTENT_MAP; break; + case XFS_RMAP_MAP_SHARED: + rmap->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED; + break; case XFS_RMAP_UNMAP: rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP; break; + case XFS_RMAP_UNMAP_SHARED: + rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED; + break; case XFS_RMAP_CONVERT: rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT; break; + case XFS_RMAP_CONVERT_SHARED: + rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED; + break; case XFS_RMAP_ALLOC: rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC; break; diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index ea62245..6290093 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -147,6 +147,7 @@ __xfs_xattr_put_listent( arraytop = context->count + prefix_len + namelen + 1; if (arraytop > context->firstu) { context->count = -1; /* insufficient space */ + context->seen_enough = 1; return 0; } offset = (char *)context->alist + context->count; |