diff options
Diffstat (limited to 'fs')
44 files changed, 3318 insertions, 2867 deletions
diff --git a/fs/Makefile b/fs/Makefile index 4fe6df3..39a824f 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -53,7 +53,7 @@ obj-$(CONFIG_FHANDLE) += fhandle.o obj-y += quota/ obj-$(CONFIG_PROC_FS) += proc/ -obj-$(CONFIG_SYSFS) += sysfs/ +obj-$(CONFIG_SYSFS) += sysfs/ kernfs/ obj-$(CONFIG_CONFIGFS_FS) += configfs/ obj-y += devpts/ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index aa33976..2c29db6 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -477,9 +477,10 @@ extern int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon, const int netfid, __u64 *pExtAttrBits, __u64 *pMask); extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb); extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr); -extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr, - const unsigned char *path, - struct cifs_sb_info *cifs_sb, unsigned int xid); +extern int CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + struct cifs_fattr *fattr, + const unsigned char *path); extern int mdfour(unsigned char *, unsigned char *, int); extern int E_md4hash(const unsigned char *passwd, unsigned char *p16, const struct nls_table *codepage); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 124aa02..d707edb 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -4010,7 +4010,7 @@ QFileInfoRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc); + cifs_dbg(FYI, "Send error in QFileInfo = %d", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -4179,7 +4179,7 @@ UnixQFileInfoRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc); + cifs_dbg(FYI, "Send error in UnixQFileInfo = %d", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -4263,7 +4263,7 @@ UnixQPathInfoRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc); + cifs_dbg(FYI, "Send error in UnixQPathInfo = %d", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 11ff5f1..a514e0a 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -193,7 +193,7 @@ check_name(struct dentry *direntry) static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, struct tcon_link *tlink, unsigned oflags, umode_t mode, - __u32 *oplock, struct cifs_fid *fid, int *created) + __u32 *oplock, struct cifs_fid *fid) { int rc = -ENOENT; int create_options = CREATE_NOT_DIR; @@ -349,7 +349,6 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, .device = 0, }; - *created |= FILE_CREATED; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { args.uid = current_fsuid(); if (inode->i_mode & S_ISGID) @@ -480,13 +479,16 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, cifs_add_pending_open(&fid, tlink, &open); rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, - &oplock, &fid, opened); + &oplock, &fid); if (rc) { cifs_del_pending_open(&open); goto out; } + if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) + *opened |= FILE_CREATED; + rc = finish_open(file, direntry, generic_file_open, opened); if (rc) { if (server->ops->close) @@ -529,7 +531,6 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode, struct TCP_Server_Info *server; struct cifs_fid fid; __u32 oplock; - int created = FILE_CREATED; cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %s and dentry = 0x%p\n", inode, direntry->d_name.name, direntry); @@ -546,7 +547,7 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode, server->ops->new_lease_key(&fid); rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, - &oplock, &fid, &created); + &oplock, &fid); if (!rc && server->ops->close) server->ops->close(xid, tcon, &fid); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 36f9ebb..49719b8 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -383,7 +383,8 @@ int cifs_get_inode_info_unix(struct inode **pinode, /* check for Minshall+French symlinks */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { - int tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid); + int tmprc = CIFSCheckMFSymlink(xid, tcon, cifs_sb, &fattr, + full_path); if (tmprc) cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc); } @@ -799,7 +800,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, /* check for Minshall+French symlinks */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { - tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid); + tmprc = CIFSCheckMFSymlink(xid, tcon, cifs_sb, &fattr, + full_path); if (tmprc) cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc); } diff --git a/fs/cifs/link.c b/fs/cifs/link.c index cc023471..92aee08 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -354,34 +354,30 @@ open_query_close_cifs_symlink(const unsigned char *path, char *pbuf, int -CIFSCheckMFSymlink(struct cifs_fattr *fattr, - const unsigned char *path, - struct cifs_sb_info *cifs_sb, unsigned int xid) +CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, + const unsigned char *path) { - int rc = 0; + int rc; u8 *buf = NULL; unsigned int link_len = 0; unsigned int bytes_read = 0; - struct cifs_tcon *ptcon; if (!CIFSCouldBeMFSymlink(fattr)) /* it's not a symlink */ return 0; buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); - if (!buf) { - rc = -ENOMEM; - goto out; - } + if (!buf) + return -ENOMEM; - ptcon = tlink_tcon(cifs_sb_tlink(cifs_sb)); - if ((ptcon->ses) && (ptcon->ses->server->ops->query_mf_symlink)) - rc = ptcon->ses->server->ops->query_mf_symlink(path, buf, - &bytes_read, cifs_sb, xid); + if (tcon->ses->server->ops->query_mf_symlink) + rc = tcon->ses->server->ops->query_mf_symlink(path, buf, + &bytes_read, cifs_sb, xid); else - goto out; + rc = -ENOSYS; - if (rc != 0) + if (rc) goto out; if (bytes_read == 0) /* not a symlink */ diff --git a/fs/dcache.c b/fs/dcache.c index 6055d61..cb4a106 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3061,8 +3061,13 @@ char *d_path(const struct path *path, char *buf, int buflen) * thus don't need to be hashed. They also don't need a name until a * user wants to identify the object in /proc/pid/fd/. The little hack * below allows us to generate a name for these objects on demand: + * + * Some pseudo inodes are mountable. When they are mounted + * path->dentry == path->mnt->mnt_root. In that case don't call d_dname + * and instead have d_path return the mounted path. */ - if (path->dentry->d_op && path->dentry->d_op->d_dname) + if (path->dentry->d_op && path->dentry->d_op->d_dname && + (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root)) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); rcu_read_lock(); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8b5e258..af90312 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1907,10 +1907,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, } } } - if (op == EPOLL_CTL_DEL && is_file_epoll(tf.file)) { - tep = tf.file->private_data; - mutex_lock_nested(&tep->mtx, 1); - } /* * Try to lookup the file inside our RB tree, Since we grabbed "mtx" diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 2885349..20d6697 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1493,6 +1493,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type, sb->s_blocksize - offset : towrite; tmp_bh.b_state = 0; + tmp_bh.b_size = sb->s_blocksize; err = ext2_get_block(inode, blk, &tmp_bh, 1); if (err < 0) goto out; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e618503..ece5556 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -268,6 +268,16 @@ struct ext4_io_submit { /* Translate # of blks to # of clusters */ #define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ (sbi)->s_cluster_bits) +/* Mask out the low bits to get the starting block of the cluster */ +#define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \ + ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ + ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) +/* Get the cluster offset */ +#define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ + ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_COFF(s, lblk) ((lblk) & \ + ((ext4_lblk_t) (s)->s_cluster_ratio - 1)) /* * Structure of a blocks group descriptor diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 17ac112..3fe29de 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -259,6 +259,15 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, if (WARN_ON_ONCE(err)) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); + ext4_error_inode(inode, where, line, + bh->b_blocknr, + "journal_dirty_metadata failed: " + "handle type %u started at line %u, " + "credits %u/%u, errcode %d", + handle->h_type, + handle->h_line_no, + handle->h_requested_credits, + handle->h_buffer_credits, err); } } else { if (inode) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 35f65cf..3384dc4 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -360,8 +360,10 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) { ext4_fsblk_t block = ext4_ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); + ext4_lblk_t lblock = le32_to_cpu(ext->ee_block); + ext4_lblk_t last = lblock + len - 1; - if (len == 0) + if (lblock > last) return 0; return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); } @@ -387,11 +389,26 @@ static int ext4_valid_extent_entries(struct inode *inode, if (depth == 0) { /* leaf entries */ struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + ext4_fsblk_t pblock = 0; + ext4_lblk_t lblock = 0; + ext4_lblk_t prev = 0; + int len = 0; while (entries) { if (!ext4_valid_extent(inode, ext)) return 0; + + /* Check for overlapping extents */ + lblock = le32_to_cpu(ext->ee_block); + len = ext4_ext_get_actual_len(ext); + if ((lblock <= prev) && prev) { + pblock = ext4_ext_pblock(ext); + es->s_last_error_block = cpu_to_le64(pblock); + return 0; + } ext++; entries--; + prev = lblock + len - 1; } } else { struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); @@ -1834,8 +1851,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, depth = ext_depth(inode); if (!path[depth].p_ext) goto out; - b2 = le32_to_cpu(path[depth].p_ext->ee_block); - b2 &= ~(sbi->s_cluster_ratio - 1); + b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block)); /* * get the next allocated block if the extent in the path @@ -1845,7 +1861,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, b2 = ext4_ext_next_allocated_block(path); if (b2 == EXT_MAX_BLOCKS) goto out; - b2 &= ~(sbi->s_cluster_ratio - 1); + b2 = EXT4_LBLK_CMASK(sbi, b2); } /* check for wrap through zero on extent logical start block*/ @@ -2504,7 +2520,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, * extent, we have to mark the cluster as used (store negative * cluster number in partial_cluster). */ - unaligned = pblk & (sbi->s_cluster_ratio - 1); + unaligned = EXT4_PBLK_COFF(sbi, pblk); if (unaligned && (ee_len == num) && (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk)))) *partial_cluster = EXT4_B2C(sbi, pblk); @@ -2598,7 +2614,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, * accidentally freeing it later on */ pblk = ext4_ext_pblock(ex); - if (pblk & (sbi->s_cluster_ratio - 1)) + if (EXT4_PBLK_COFF(sbi, pblk)) *partial_cluster = -((long long)EXT4_B2C(sbi, pblk)); ex--; @@ -3753,7 +3769,7 @@ int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_lblk_t lblk_start, lblk_end; - lblk_start = lblk & (~(sbi->s_cluster_ratio - 1)); + lblk_start = EXT4_LBLK_CMASK(sbi, lblk); lblk_end = lblk_start + sbi->s_cluster_ratio - 1; return ext4_find_delalloc_range(inode, lblk_start, lblk_end); @@ -3812,9 +3828,9 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks); /* Check towards left side */ - c_offset = lblk_start & (sbi->s_cluster_ratio - 1); + c_offset = EXT4_LBLK_COFF(sbi, lblk_start); if (c_offset) { - lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1)); + lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start); lblk_to = lblk_from + c_offset - 1; if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) @@ -3822,7 +3838,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, } /* Now check towards right. */ - c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1); + c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks); if (allocated_clusters && c_offset) { lblk_from = lblk_start + num_blks; lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; @@ -4030,7 +4046,7 @@ static int get_implied_cluster_alloc(struct super_block *sb, struct ext4_ext_path *path) { struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1); + ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); ext4_lblk_t ex_cluster_start, ex_cluster_end; ext4_lblk_t rr_cluster_start; ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); @@ -4048,8 +4064,7 @@ static int get_implied_cluster_alloc(struct super_block *sb, (rr_cluster_start == ex_cluster_start)) { if (rr_cluster_start == ex_cluster_end) ee_start += ee_len - 1; - map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) + - c_offset; + map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset; map->m_len = min(map->m_len, (unsigned) sbi->s_cluster_ratio - c_offset); /* @@ -4203,7 +4218,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, */ map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; newex.ee_block = cpu_to_le32(map->m_lblk); - cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1); + cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); /* * If we are doing bigalloc, check to see if the extent returned @@ -4271,7 +4286,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * needed so that future calls to get_implied_cluster_alloc() * work correctly. */ - offset = map->m_lblk & (sbi->s_cluster_ratio - 1); + offset = EXT4_LBLK_COFF(sbi, map->m_lblk); ar.len = EXT4_NUM_B2C(sbi, offset+allocated); ar.goal -= offset; ar.logical -= offset; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0757634..61d49ff 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1206,7 +1206,6 @@ static int ext4_journalled_write_end(struct file *file, */ static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock) { - int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); unsigned int md_needed; @@ -1218,7 +1217,6 @@ static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock) * in order to allocate nrblocks * worse case is one extent per block */ -repeat: spin_lock(&ei->i_block_reservation_lock); /* * ext4_calc_metadata_amount() has side effects, which we have @@ -1238,10 +1236,6 @@ repeat: ei->i_da_metadata_calc_len = save_len; ei->i_da_metadata_calc_last_lblock = save_last_lblock; spin_unlock(&ei->i_block_reservation_lock); - if (ext4_should_retry_alloc(inode->i_sb, &retries)) { - cond_resched(); - goto repeat; - } return -ENOSPC; } ei->i_reserved_meta_blocks += md_needed; @@ -1255,7 +1249,6 @@ repeat: */ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) { - int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); unsigned int md_needed; @@ -1277,7 +1270,6 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) * in order to allocate nrblocks * worse case is one extent per block */ -repeat: spin_lock(&ei->i_block_reservation_lock); /* * ext4_calc_metadata_amount() has side effects, which we have @@ -1297,10 +1289,6 @@ repeat: ei->i_da_metadata_calc_len = save_len; ei->i_da_metadata_calc_last_lblock = save_last_lblock; spin_unlock(&ei->i_block_reservation_lock); - if (ext4_should_retry_alloc(inode->i_sb, &retries)) { - cond_resched(); - goto repeat; - } dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); return -ENOSPC; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4d113ef..04a5c75 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3442,6 +3442,9 @@ static void ext4_mb_pa_callback(struct rcu_head *head) { struct ext4_prealloc_space *pa; pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); + + BUG_ON(atomic_read(&pa->pa_count)); + BUG_ON(pa->pa_deleted == 0); kmem_cache_free(ext4_pspace_cachep, pa); } @@ -3455,11 +3458,13 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, ext4_group_t grp; ext4_fsblk_t grp_blk; - if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) - return; - /* in this short window concurrent discard can set pa_deleted */ spin_lock(&pa->pa_lock); + if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) { + spin_unlock(&pa->pa_lock); + return; + } + if (pa->pa_deleted == 1) { spin_unlock(&pa->pa_lock); return; @@ -4121,7 +4126,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, ext4_get_group_no_and_offset(sb, goal, &group, &block); /* set up allocation goals */ - ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1); + ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical); ac->ac_status = AC_STATUS_CONTINUE; ac->ac_sb = sb; ac->ac_inode = ar->inode; @@ -4663,7 +4668,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, * blocks at the beginning or the end unless we are explicitly * requested to avoid doing so. */ - overflow = block & (sbi->s_cluster_ratio - 1); + overflow = EXT4_PBLK_COFF(sbi, block); if (overflow) { if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { overflow = sbi->s_cluster_ratio - overflow; @@ -4677,7 +4682,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, count += overflow; } } - overflow = count & (sbi->s_cluster_ratio - 1); + overflow = EXT4_LBLK_COFF(sbi, count); if (overflow) { if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { if (count > overflow) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c977f4e..1f7784d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -792,7 +792,7 @@ static void ext4_put_super(struct super_block *sb) } ext4_es_unregister_shrinker(sbi); - del_timer(&sbi->s_err_report); + del_timer_sync(&sbi->s_err_report); ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); @@ -3316,11 +3316,19 @@ int ext4_calculate_overhead(struct super_block *sb) } -static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi) +static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb) { ext4_fsblk_t resv_clusters; /* + * There's no need to reserve anything when we aren't using extents. + * The space estimates are exact, there are no unwritten extents, + * hole punching doesn't need new metadata... This is needed especially + * to keep ext2/3 backward compatibility. + */ + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) + return 0; + /* * By default we reserve 2% or 4096 clusters, whichever is smaller. * This should cover the situations where we can not afford to run * out of space like for example punch hole, or converting @@ -3328,7 +3336,8 @@ static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi) * allocation would require 1, or 2 blocks, higher numbers are * very rare. */ - resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits; + resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >> + EXT4_SB(sb)->s_cluster_bits; do_div(resv_clusters, 50); resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); @@ -4071,10 +4080,10 @@ no_journal: "available"); } - err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi)); + err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb)); if (err) { ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " - "reserved pool", ext4_calculate_resv_clusters(sbi)); + "reserved pool", ext4_calculate_resv_clusters(sb)); goto failed_mount4a; } @@ -4184,7 +4193,7 @@ failed_mount_wq: } failed_mount3: ext4_es_unregister_shrinker(sbi); - del_timer(&sbi->s_err_report); + del_timer_sync(&sbi->s_err_report); if (sbi->s_flex_groups) ext4_kvfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeclusters_counter); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1f4a10e..e0259a1 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -516,13 +516,16 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, } WARN_ON(inode->i_state & I_SYNC); /* - * Skip inode if it is clean. We don't want to mess with writeback - * lists in this function since flusher thread may be doing for example - * sync in parallel and if we move the inode, it could get skipped. So - * here we make sure inode is on some writeback list and leave it there - * unless we have completely cleaned the inode. + * Skip inode if it is clean and we have no outstanding writeback in + * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this + * function since flusher thread may be doing for example sync in + * parallel and if we move the inode, it could get skipped. So here we + * make sure inode is on some writeback list and leave it there unless + * we have completely cleaned the inode. */ - if (!(inode->i_state & I_DIRTY)) + if (!(inode->i_state & I_DIRTY) && + (wbc->sync_mode != WB_SYNC_ALL || + !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) goto out; inode->i_state |= I_SYNC; spin_unlock(&inode->i_lock); diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index b7fc035..73f3e4e 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -986,6 +986,7 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + struct address_space *mapping = inode->i_mapping; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int rv; @@ -1006,6 +1007,35 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, if (rv != 1) goto out; /* dio not valid, fall back to buffered i/o */ + /* + * Now since we are holding a deferred (CW) lock at this point, you + * might be wondering why this is ever needed. There is a case however + * where we've granted a deferred local lock against a cached exclusive + * glock. That is ok provided all granted local locks are deferred, but + * it also means that it is possible to encounter pages which are + * cached and possibly also mapped. So here we check for that and sort + * them out ahead of the dio. The glock state machine will take care of + * everything else. + * + * If in fact the cached glock state (gl->gl_state) is deferred (CW) in + * the first place, mapping->nr_pages will always be zero. + */ + if (mapping->nrpages) { + loff_t lstart = offset & (PAGE_CACHE_SIZE - 1); + loff_t len = iov_length(iov, nr_segs); + loff_t end = PAGE_ALIGN(offset + len) - 1; + + rv = 0; + if (len == 0) + goto out; + if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) + unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len); + rv = filemap_write_and_wait_range(mapping, lstart, end); + if (rv) + return rv; + truncate_inode_pages_range(mapping, lstart, end); + } + rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, gfs2_get_block_direct, NULL, NULL, 0); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index c8420f7..6f7a47c 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1655,6 +1655,7 @@ static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) struct task_struct *gh_owner = NULL; char flags_buf[32]; + rcu_read_lock(); if (gh->gh_owner_pid) gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n", @@ -1664,6 +1665,7 @@ static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, gh_owner ? gh_owner->comm : "(ended)", (void *)gh->gh_ip); + rcu_read_unlock(); return 0; } diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index db908f6..f88dcd9 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -192,8 +192,11 @@ static void inode_go_sync(struct gfs2_glock *gl) if (ip && !S_ISREG(ip->i_inode.i_mode)) ip = NULL; - if (ip && test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) - unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0); + if (ip) { + if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) + unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0); + inode_dio_wait(&ip->i_inode); + } if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) return; @@ -410,6 +413,9 @@ static int inode_go_lock(struct gfs2_holder *gh) return error; } + if (gh->gh_state != LM_ST_DEFERRED) + inode_dio_wait(&ip->i_inode); + if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) && (gl->gl_state == LM_ST_EXCLUSIVE) && (gh->gh_state == LM_ST_EXCLUSIVE)) { diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 610613f..9dcb977 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -551,10 +551,10 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) struct buffer_head *bh = bd->bd_bh; struct gfs2_glock *gl = bd->bd_gl; - gfs2_remove_from_ail(bd); - bd->bd_bh = NULL; bh->b_private = NULL; bd->bd_blkno = bh->b_blocknr; + gfs2_remove_from_ail(bd); /* drops ref on bh */ + bd->bd_bh = NULL; bd->bd_ops = &gfs2_revoke_lops; sdp->sd_log_num_revoke++; atomic_inc(&gl->gl_revokes); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 9324150..52f177b 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -258,6 +258,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int struct address_space *mapping = bh->b_page->mapping; struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); struct gfs2_bufdata *bd = bh->b_private; + int was_pinned = 0; if (test_clear_buffer_pinned(bh)) { trace_gfs2_pin(bd, 0); @@ -273,12 +274,16 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int tr->tr_num_databuf_rm++; } tr->tr_touched = 1; + was_pinned = 1; brelse(bh); } if (bd) { spin_lock(&sdp->sd_ail_lock); if (bd->bd_tr) { gfs2_trans_add_revoke(sdp, bd); + } else if (was_pinned) { + bh->b_private = NULL; + kmem_cache_free(gfs2_bufdata_cachep, bd); } spin_unlock(&sdp->sd_ail_lock); } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 82303b4..52fa883 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1366,8 +1366,18 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, if (IS_ERR(s)) goto error_bdev; - if (s->s_root) + if (s->s_root) { + /* + * s_umount nests inside bd_mutex during + * __invalidate_device(). blkdev_put() acquires + * bd_mutex and can't be called under s_umount. Drop + * s_umount temporarily. This is safe as we're + * holding an active reference. + */ + up_write(&s->s_umount); blkdev_put(bdev, mode); + down_write(&s->s_umount); + } memset(&args, 0, sizeof(args)); args.ar_quota = GFS2_QUOTA_DEFAULT; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 5203264..5fa344a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -702,7 +702,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) read_lock(&journal->j_state_lock); #ifdef CONFIG_JBD2_DEBUG if (!tid_geq(journal->j_commit_request, tid)) { - printk(KERN_EMERG + printk(KERN_ERR "%s: error: j_commit_request=%d, tid=%d\n", __func__, journal->j_commit_request, tid); } @@ -718,10 +718,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) } read_unlock(&journal->j_state_lock); - if (unlikely(is_journal_aborted(journal))) { - printk(KERN_EMERG "journal commit I/O error\n"); + if (unlikely(is_journal_aborted(journal))) err = -EIO; - } return err; } @@ -1527,13 +1525,13 @@ static int journal_get_superblock(journal_t *journal) if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) && JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { /* Can't have checksum v1 and v2 on at the same time! */ - printk(KERN_ERR "JBD: Can't enable checksumming v1 and v2 " + printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 " "at the same time!\n"); goto out; } if (!jbd2_verify_csum_type(journal, sb)) { - printk(KERN_ERR "JBD: Unknown checksum type\n"); + printk(KERN_ERR "JBD2: Unknown checksum type\n"); goto out; } @@ -1541,7 +1539,7 @@ static int journal_get_superblock(journal_t *journal) if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(journal->j_chksum_driver)) { - printk(KERN_ERR "JBD: Cannot load crc32c driver.\n"); + printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); err = PTR_ERR(journal->j_chksum_driver); journal->j_chksum_driver = NULL; goto out; @@ -1550,7 +1548,7 @@ static int journal_get_superblock(journal_t *journal) /* Check superblock checksum */ if (!jbd2_superblock_csum_verify(journal, sb)) { - printk(KERN_ERR "JBD: journal checksum error\n"); + printk(KERN_ERR "JBD2: journal checksum error\n"); goto out; } @@ -1836,7 +1834,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(journal->j_chksum_driver)) { - printk(KERN_ERR "JBD: Cannot load crc32c " + printk(KERN_ERR "JBD2: Cannot load crc32c " "driver.\n"); journal->j_chksum_driver = NULL; return 0; @@ -2645,7 +2643,7 @@ static void __exit journal_exit(void) #ifdef CONFIG_JBD2_DEBUG int n = atomic_read(&nr_journal_heads); if (n) - printk(KERN_EMERG "JBD2: leaked %d journal_heads!\n", n); + printk(KERN_ERR "JBD2: leaked %d journal_heads!\n", n); #endif jbd2_remove_jbd_stats_proc_entry(); jbd2_journal_destroy_caches(); diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 3929c50..3b6bb19 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -594,7 +594,7 @@ static int do_one_pass(journal_t *journal, be32_to_cpu(tmp->h_sequence))) { brelse(obh); success = -EIO; - printk(KERN_ERR "JBD: Invalid " + printk(KERN_ERR "JBD2: Invalid " "checksum recovering " "block %llu in log\n", blocknr); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 7aa9a32..8360674 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -932,7 +932,7 @@ repeat: jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!frozen_buffer) { - printk(KERN_EMERG + printk(KERN_ERR "%s: OOM for frozen_buffer\n", __func__); JBUFFER_TRACE(jh, "oom!"); @@ -1166,7 +1166,7 @@ repeat: if (!jh->b_committed_data) { committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!committed_data) { - printk(KERN_EMERG "%s: No memory for committed data\n", + printk(KERN_ERR "%s: No memory for committed data\n", __func__); err = -ENOMEM; goto out; @@ -1290,7 +1290,10 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) * once a transaction -bzzz */ jh->b_modified = 1; - J_ASSERT_JH(jh, handle->h_buffer_credits > 0); + if (handle->h_buffer_credits <= 0) { + ret = -ENOSPC; + goto out_unlock_bh; + } handle->h_buffer_credits--; } @@ -1305,7 +1308,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) JBUFFER_TRACE(jh, "fastpath"); if (unlikely(jh->b_transaction != journal->j_running_transaction)) { - printk(KERN_EMERG "JBD: %s: " + printk(KERN_ERR "JBD2: %s: " "jh->b_transaction (%llu, %p, %u) != " "journal->j_running_transaction (%p, %u)", journal->j_devname, @@ -1332,7 +1335,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) JBUFFER_TRACE(jh, "already on other transaction"); if (unlikely(jh->b_transaction != journal->j_committing_transaction)) { - printk(KERN_EMERG "JBD: %s: " + printk(KERN_ERR "JBD2: %s: " "jh->b_transaction (%llu, %p, %u) != " "journal->j_committing_transaction (%p, %u)", journal->j_devname, @@ -1345,7 +1348,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) ret = -EINVAL; } if (unlikely(jh->b_next_transaction != transaction)) { - printk(KERN_EMERG "JBD: %s: " + printk(KERN_ERR "JBD2: %s: " "jh->b_next_transaction (%llu, %p, %u) != " "transaction (%p, %u)", journal->j_devname, @@ -1373,7 +1376,6 @@ out_unlock_bh: jbd2_journal_put_journal_head(jh); out: JBUFFER_TRACE(jh, "exit"); - WARN_ON(ret); /* All errors are bugs, so dump the stack */ return ret; } diff --git a/fs/kernfs/Makefile b/fs/kernfs/Makefile new file mode 100644 index 0000000..674337c --- /dev/null +++ b/fs/kernfs/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the kernfs pseudo filesystem +# + +obj-y := mount.o inode.o dir.o file.o symlink.o diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c new file mode 100644 index 0000000..5104cf5 --- /dev/null +++ b/fs/kernfs/dir.c @@ -0,0 +1,1073 @@ +/* + * fs/kernfs/dir.c - kernfs directory implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. + */ + +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/idr.h> +#include <linux/slab.h> +#include <linux/security.h> +#include <linux/hash.h> + +#include "kernfs-internal.h" + +DEFINE_MUTEX(kernfs_mutex); + +#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) + +/** + * kernfs_name_hash + * @name: Null terminated string to hash + * @ns: Namespace tag to hash + * + * Returns 31 bit hash of ns + name (so it fits in an off_t ) + */ +static unsigned int kernfs_name_hash(const char *name, const void *ns) +{ + unsigned long hash = init_name_hash(); + unsigned int len = strlen(name); + while (len--) + hash = partial_name_hash(*name++, hash); + hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31)); + hash &= 0x7fffffffU; + /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ + if (hash < 1) + hash += 2; + if (hash >= INT_MAX) + hash = INT_MAX - 1; + return hash; +} + +static int kernfs_name_compare(unsigned int hash, const char *name, + const void *ns, const struct kernfs_node *kn) +{ + if (hash != kn->hash) + return hash - kn->hash; + if (ns != kn->ns) + return ns - kn->ns; + return strcmp(name, kn->name); +} + +static int kernfs_sd_compare(const struct kernfs_node *left, + const struct kernfs_node *right) +{ + return kernfs_name_compare(left->hash, left->name, left->ns, right); +} + +/** + * kernfs_link_sibling - link kernfs_node into sibling rbtree + * @kn: kernfs_node of interest + * + * Link @kn into its sibling rbtree which starts from + * @kn->parent->dir.children. + * + * Locking: + * mutex_lock(kernfs_mutex) + * + * RETURNS: + * 0 on susccess -EEXIST on failure. + */ +static int kernfs_link_sibling(struct kernfs_node *kn) +{ + struct rb_node **node = &kn->parent->dir.children.rb_node; + struct rb_node *parent = NULL; + + if (kernfs_type(kn) == KERNFS_DIR) + kn->parent->dir.subdirs++; + + while (*node) { + struct kernfs_node *pos; + int result; + + pos = rb_to_kn(*node); + parent = *node; + result = kernfs_sd_compare(kn, pos); + if (result < 0) + node = &pos->rb.rb_left; + else if (result > 0) + node = &pos->rb.rb_right; + else + return -EEXIST; + } + /* add new node and rebalance the tree */ + rb_link_node(&kn->rb, parent, node); + rb_insert_color(&kn->rb, &kn->parent->dir.children); + return 0; +} + +/** + * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree + * @kn: kernfs_node of interest + * + * Unlink @kn from its sibling rbtree which starts from + * kn->parent->dir.children. + * + * Locking: + * mutex_lock(kernfs_mutex) + */ +static void kernfs_unlink_sibling(struct kernfs_node *kn) +{ + if (kernfs_type(kn) == KERNFS_DIR) + kn->parent->dir.subdirs--; + + rb_erase(&kn->rb, &kn->parent->dir.children); +} + +/** + * kernfs_get_active - get an active reference to kernfs_node + * @kn: kernfs_node to get an active reference to + * + * Get an active reference of @kn. This function is noop if @kn + * is NULL. + * + * RETURNS: + * Pointer to @kn on success, NULL on failure. + */ +struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) +{ + if (unlikely(!kn)) + return NULL; + + if (!atomic_inc_unless_negative(&kn->active)) + return NULL; + + if (kn->flags & KERNFS_LOCKDEP) + rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); + return kn; +} + +/** + * kernfs_put_active - put an active reference to kernfs_node + * @kn: kernfs_node to put an active reference to + * + * Put an active reference to @kn. This function is noop if @kn + * is NULL. + */ +void kernfs_put_active(struct kernfs_node *kn) +{ + int v; + + if (unlikely(!kn)) + return; + + if (kn->flags & KERNFS_LOCKDEP) + rwsem_release(&kn->dep_map, 1, _RET_IP_); + v = atomic_dec_return(&kn->active); + if (likely(v != KN_DEACTIVATED_BIAS)) + return; + + /* + * atomic_dec_return() is a mb(), we'll always see the updated + * kn->u.completion. + */ + complete(kn->u.completion); +} + +/** + * kernfs_deactivate - deactivate kernfs_node + * @kn: kernfs_node to deactivate + * + * Deny new active references and drain existing ones. + */ +static void kernfs_deactivate(struct kernfs_node *kn) +{ + DECLARE_COMPLETION_ONSTACK(wait); + int v; + + BUG_ON(!(kn->flags & KERNFS_REMOVED)); + + if (!(kernfs_type(kn) & KERNFS_ACTIVE_REF)) + return; + + kn->u.completion = (void *)&wait; + + rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); + /* atomic_add_return() is a mb(), put_active() will always see + * the updated kn->u.completion. + */ + v = atomic_add_return(KN_DEACTIVATED_BIAS, &kn->active); + + if (v != KN_DEACTIVATED_BIAS) { + lock_contended(&kn->dep_map, _RET_IP_); + wait_for_completion(&wait); + } + + lock_acquired(&kn->dep_map, _RET_IP_); + rwsem_release(&kn->dep_map, 1, _RET_IP_); +} + +/** + * kernfs_get - get a reference count on a kernfs_node + * @kn: the target kernfs_node + */ +void kernfs_get(struct kernfs_node *kn) +{ + if (kn) { + WARN_ON(!atomic_read(&kn->count)); + atomic_inc(&kn->count); + } +} +EXPORT_SYMBOL_GPL(kernfs_get); + +/** + * kernfs_put - put a reference count on a kernfs_node + * @kn: the target kernfs_node + * + * Put a reference count of @kn and destroy it if it reached zero. + */ +void kernfs_put(struct kernfs_node *kn) +{ + struct kernfs_node *parent; + struct kernfs_root *root; + + if (!kn || !atomic_dec_and_test(&kn->count)) + return; + root = kernfs_root(kn); + repeat: + /* Moving/renaming is always done while holding reference. + * kn->parent won't change beneath us. + */ + parent = kn->parent; + + WARN(!(kn->flags & KERNFS_REMOVED), "kernfs: free using entry: %s/%s\n", + parent ? parent->name : "", kn->name); + + if (kernfs_type(kn) == KERNFS_LINK) + kernfs_put(kn->symlink.target_kn); + if (!(kn->flags & KERNFS_STATIC_NAME)) + kfree(kn->name); + if (kn->iattr) { + if (kn->iattr->ia_secdata) + security_release_secctx(kn->iattr->ia_secdata, + kn->iattr->ia_secdata_len); + simple_xattrs_free(&kn->iattr->xattrs); + } + kfree(kn->iattr); + ida_simple_remove(&root->ino_ida, kn->ino); + kmem_cache_free(kernfs_node_cache, kn); + + kn = parent; + if (kn) { + if (atomic_dec_and_test(&kn->count)) + goto repeat; + } else { + /* just released the root kn, free @root too */ + ida_destroy(&root->ino_ida); + kfree(root); + } +} +EXPORT_SYMBOL_GPL(kernfs_put); + +static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct kernfs_node *kn; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + /* Always perform fresh lookup for negatives */ + if (!dentry->d_inode) + goto out_bad_unlocked; + + kn = dentry->d_fsdata; + mutex_lock(&kernfs_mutex); + + /* The kernfs node has been deleted */ + if (kn->flags & KERNFS_REMOVED) + goto out_bad; + + /* The kernfs node has been moved? */ + if (dentry->d_parent->d_fsdata != kn->parent) + goto out_bad; + + /* The kernfs node has been renamed */ + if (strcmp(dentry->d_name.name, kn->name) != 0) + goto out_bad; + + /* The kernfs node has been moved to a different namespace */ + if (kn->parent && kernfs_ns_enabled(kn->parent) && + kernfs_info(dentry->d_sb)->ns != kn->ns) + goto out_bad; + + mutex_unlock(&kernfs_mutex); +out_valid: + return 1; +out_bad: + mutex_unlock(&kernfs_mutex); +out_bad_unlocked: + /* + * @dentry doesn't match the underlying kernfs node, drop the + * dentry and force lookup. If we have submounts we must allow the + * vfs caches to lie about the state of the filesystem to prevent + * leaks and other nasty things, so use check_submounts_and_drop() + * instead of d_drop(). + */ + if (check_submounts_and_drop(dentry) != 0) + goto out_valid; + + return 0; +} + +static void kernfs_dop_release(struct dentry *dentry) +{ + kernfs_put(dentry->d_fsdata); +} + +const struct dentry_operations kernfs_dops = { + .d_revalidate = kernfs_dop_revalidate, + .d_release = kernfs_dop_release, +}; + +static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, + const char *name, umode_t mode, + unsigned flags) +{ + char *dup_name = NULL; + struct kernfs_node *kn; + int ret; + + if (!(flags & KERNFS_STATIC_NAME)) { + name = dup_name = kstrdup(name, GFP_KERNEL); + if (!name) + return NULL; + } + + kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL); + if (!kn) + goto err_out1; + + ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL); + if (ret < 0) + goto err_out2; + kn->ino = ret; + + atomic_set(&kn->count, 1); + atomic_set(&kn->active, 0); + + kn->name = name; + kn->mode = mode; + kn->flags = flags | KERNFS_REMOVED; + + return kn; + + err_out2: + kmem_cache_free(kernfs_node_cache, kn); + err_out1: + kfree(dup_name); + return NULL; +} + +struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, + const char *name, umode_t mode, + unsigned flags) +{ + struct kernfs_node *kn; + + kn = __kernfs_new_node(kernfs_root(parent), name, mode, flags); + if (kn) { + kernfs_get(parent); + kn->parent = parent; + } + return kn; +} + +/** + * kernfs_addrm_start - prepare for kernfs_node add/remove + * @acxt: pointer to kernfs_addrm_cxt to be used + * + * This function is called when the caller is about to add or remove + * kernfs_node. This function acquires kernfs_mutex. @acxt is used + * to keep and pass context to other addrm functions. + * + * LOCKING: + * Kernel thread context (may sleep). kernfs_mutex is locked on + * return. + */ +void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt) + __acquires(kernfs_mutex) +{ + memset(acxt, 0, sizeof(*acxt)); + + mutex_lock(&kernfs_mutex); +} + +/** + * kernfs_add_one - add kernfs_node to parent without warning + * @acxt: addrm context to use + * @kn: kernfs_node to be added + * + * The caller must already have initialized @kn->parent. This + * function increments nlink of the parent's inode if @kn is a + * directory and link into the children list of the parent. + * + * This function should be called between calls to + * kernfs_addrm_start() and kernfs_addrm_finish() and should be passed + * the same @acxt as passed to kernfs_addrm_start(). + * + * LOCKING: + * Determined by kernfs_addrm_start(). + * + * RETURNS: + * 0 on success, -EEXIST if entry with the given name already + * exists. + */ +int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn) +{ + struct kernfs_node *parent = kn->parent; + bool has_ns = kernfs_ns_enabled(parent); + struct kernfs_iattrs *ps_iattr; + int ret; + + if (has_ns != (bool)kn->ns) { + WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", + has_ns ? "required" : "invalid", parent->name, kn->name); + return -EINVAL; + } + + if (kernfs_type(parent) != KERNFS_DIR) + return -EINVAL; + + if (parent->flags & KERNFS_REMOVED) + return -ENOENT; + + kn->hash = kernfs_name_hash(kn->name, kn->ns); + + ret = kernfs_link_sibling(kn); + if (ret) + return ret; + + /* Update timestamps on the parent */ + ps_iattr = parent->iattr; + if (ps_iattr) { + struct iattr *ps_iattrs = &ps_iattr->ia_iattr; + ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; + } + + /* Mark the entry added into directory tree */ + kn->flags &= ~KERNFS_REMOVED; + + return 0; +} + +/** + * kernfs_remove_one - remove kernfs_node from parent + * @acxt: addrm context to use + * @kn: kernfs_node to be removed + * + * Mark @kn removed and drop nlink of parent inode if @kn is a + * directory. @kn is unlinked from the children list. + * + * This function should be called between calls to + * kernfs_addrm_start() and kernfs_addrm_finish() and should be + * passed the same @acxt as passed to kernfs_addrm_start(). + * + * LOCKING: + * Determined by kernfs_addrm_start(). + */ +static void kernfs_remove_one(struct kernfs_addrm_cxt *acxt, + struct kernfs_node *kn) +{ + struct kernfs_iattrs *ps_iattr; + + /* + * Removal can be called multiple times on the same node. Only the + * first invocation is effective and puts the base ref. + */ + if (kn->flags & KERNFS_REMOVED) + return; + + if (kn->parent) { + kernfs_unlink_sibling(kn); + + /* Update timestamps on the parent */ + ps_iattr = kn->parent->iattr; + if (ps_iattr) { + ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME; + ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME; + } + } + + kn->flags |= KERNFS_REMOVED; + kn->u.removed_list = acxt->removed; + acxt->removed = kn; +} + +/** + * kernfs_addrm_finish - finish up kernfs_node add/remove + * @acxt: addrm context to finish up + * + * Finish up kernfs_node add/remove. Resources acquired by + * kernfs_addrm_start() are released and removed kernfs_nodes are + * cleaned up. + * + * LOCKING: + * kernfs_mutex is released. + */ +void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt) + __releases(kernfs_mutex) +{ + /* release resources acquired by kernfs_addrm_start() */ + mutex_unlock(&kernfs_mutex); + + /* kill removed kernfs_nodes */ + while (acxt->removed) { + struct kernfs_node *kn = acxt->removed; + + acxt->removed = kn->u.removed_list; + + kernfs_deactivate(kn); + kernfs_unmap_bin_file(kn); + kernfs_put(kn); + } +} + +/** + * kernfs_find_ns - find kernfs_node with the given name + * @parent: kernfs_node to search under + * @name: name to look for + * @ns: the namespace tag to use + * + * Look for kernfs_node with name @name under @parent. Returns pointer to + * the found kernfs_node on success, %NULL on failure. + */ +static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, + const unsigned char *name, + const void *ns) +{ + struct rb_node *node = parent->dir.children.rb_node; + bool has_ns = kernfs_ns_enabled(parent); + unsigned int hash; + + lockdep_assert_held(&kernfs_mutex); + + if (has_ns != (bool)ns) { + WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", + has_ns ? "required" : "invalid", parent->name, name); + return NULL; + } + + hash = kernfs_name_hash(name, ns); + while (node) { + struct kernfs_node *kn; + int result; + + kn = rb_to_kn(node); + result = kernfs_name_compare(hash, name, ns, kn); + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return kn; + } + return NULL; +} + +/** + * kernfs_find_and_get_ns - find and get kernfs_node with the given name + * @parent: kernfs_node to search under + * @name: name to look for + * @ns: the namespace tag to use + * + * Look for kernfs_node with name @name under @parent and get a reference + * if found. This function may sleep and returns pointer to the found + * kernfs_node on success, %NULL on failure. + */ +struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, + const char *name, const void *ns) +{ + struct kernfs_node *kn; + + mutex_lock(&kernfs_mutex); + kn = kernfs_find_ns(parent, name, ns); + kernfs_get(kn); + mutex_unlock(&kernfs_mutex); + + return kn; +} +EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); + +/** + * kernfs_create_root - create a new kernfs hierarchy + * @kdops: optional directory syscall operations for the hierarchy + * @priv: opaque data associated with the new directory + * + * Returns the root of the new hierarchy on success, ERR_PTR() value on + * failure. + */ +struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv) +{ + struct kernfs_root *root; + struct kernfs_node *kn; + + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) + return ERR_PTR(-ENOMEM); + + ida_init(&root->ino_ida); + + kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO, + KERNFS_DIR); + if (!kn) { + ida_destroy(&root->ino_ida); + kfree(root); + return ERR_PTR(-ENOMEM); + } + + kn->flags &= ~KERNFS_REMOVED; + kn->priv = priv; + kn->dir.root = root; + + root->dir_ops = kdops; + root->kn = kn; + + return root; +} + +/** + * kernfs_destroy_root - destroy a kernfs hierarchy + * @root: root of the hierarchy to destroy + * + * Destroy the hierarchy anchored at @root by removing all existing + * directories and destroying @root. + */ +void kernfs_destroy_root(struct kernfs_root *root) +{ + kernfs_remove(root->kn); /* will also free @root */ +} + +/** + * kernfs_create_dir_ns - create a directory + * @parent: parent in which to create a new directory + * @name: name of the new directory + * @mode: mode of the new directory + * @priv: opaque data associated with the new directory + * @ns: optional namespace tag of the directory + * + * Returns the created node on success, ERR_PTR() value on failure. + */ +struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, + const char *name, umode_t mode, + void *priv, const void *ns) +{ + struct kernfs_addrm_cxt acxt; + struct kernfs_node *kn; + int rc; + + /* allocate */ + kn = kernfs_new_node(parent, name, mode | S_IFDIR, KERNFS_DIR); + if (!kn) + return ERR_PTR(-ENOMEM); + + kn->dir.root = parent->dir.root; + kn->ns = ns; + kn->priv = priv; + + /* link in */ + kernfs_addrm_start(&acxt); + rc = kernfs_add_one(&acxt, kn); + kernfs_addrm_finish(&acxt); + + if (!rc) + return kn; + + kernfs_put(kn); + return ERR_PTR(rc); +} + +static struct dentry *kernfs_iop_lookup(struct inode *dir, + struct dentry *dentry, + unsigned int flags) +{ + struct dentry *ret; + struct kernfs_node *parent = dentry->d_parent->d_fsdata; + struct kernfs_node *kn; + struct inode *inode; + const void *ns = NULL; + + mutex_lock(&kernfs_mutex); + + if (kernfs_ns_enabled(parent)) + ns = kernfs_info(dir->i_sb)->ns; + + kn = kernfs_find_ns(parent, dentry->d_name.name, ns); + + /* no such entry */ + if (!kn) { + ret = NULL; + goto out_unlock; + } + kernfs_get(kn); + dentry->d_fsdata = kn; + + /* attach dentry and inode */ + inode = kernfs_get_inode(dir->i_sb, kn); + if (!inode) { + ret = ERR_PTR(-ENOMEM); + goto out_unlock; + } + + /* instantiate and hash dentry */ + ret = d_materialise_unique(dentry, inode); + out_unlock: + mutex_unlock(&kernfs_mutex); + return ret; +} + +static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry, + umode_t mode) +{ + struct kernfs_node *parent = dir->i_private; + struct kernfs_dir_ops *kdops = kernfs_root(parent)->dir_ops; + + if (!kdops || !kdops->mkdir) + return -EPERM; + + return kdops->mkdir(parent, dentry->d_name.name, mode); +} + +static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops; + + if (!kdops || !kdops->rmdir) + return -EPERM; + + return kdops->rmdir(kn); +} + +static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct kernfs_node *kn = old_dentry->d_fsdata; + struct kernfs_node *new_parent = new_dir->i_private; + struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops; + + if (!kdops || !kdops->rename) + return -EPERM; + + return kdops->rename(kn, new_parent, new_dentry->d_name.name); +} + +const struct inode_operations kernfs_dir_iops = { + .lookup = kernfs_iop_lookup, + .permission = kernfs_iop_permission, + .setattr = kernfs_iop_setattr, + .getattr = kernfs_iop_getattr, + .setxattr = kernfs_iop_setxattr, + .removexattr = kernfs_iop_removexattr, + .getxattr = kernfs_iop_getxattr, + .listxattr = kernfs_iop_listxattr, + + .mkdir = kernfs_iop_mkdir, + .rmdir = kernfs_iop_rmdir, + .rename = kernfs_iop_rename, +}; + +static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos) +{ + struct kernfs_node *last; + + while (true) { + struct rb_node *rbn; + + last = pos; + + if (kernfs_type(pos) != KERNFS_DIR) + break; + + rbn = rb_first(&pos->dir.children); + if (!rbn) + break; + + pos = rb_to_kn(rbn); + } + + return last; +} + +/** + * kernfs_next_descendant_post - find the next descendant for post-order walk + * @pos: the current position (%NULL to initiate traversal) + * @root: kernfs_node whose descendants to walk + * + * Find the next descendant to visit for post-order traversal of @root's + * descendants. @root is included in the iteration and the last node to be + * visited. + */ +static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, + struct kernfs_node *root) +{ + struct rb_node *rbn; + + lockdep_assert_held(&kernfs_mutex); + + /* if first iteration, visit leftmost descendant which may be root */ + if (!pos) + return kernfs_leftmost_descendant(root); + + /* if we visited @root, we're done */ + if (pos == root) + return NULL; + + /* if there's an unvisited sibling, visit its leftmost descendant */ + rbn = rb_next(&pos->rb); + if (rbn) + return kernfs_leftmost_descendant(rb_to_kn(rbn)); + + /* no sibling left, visit parent */ + return pos->parent; +} + +static void __kernfs_remove(struct kernfs_addrm_cxt *acxt, + struct kernfs_node *kn) +{ + struct kernfs_node *pos, *next; + + if (!kn) + return; + + pr_debug("kernfs %s: removing\n", kn->name); + + next = NULL; + do { + pos = next; + next = kernfs_next_descendant_post(pos, kn); + if (pos) + kernfs_remove_one(acxt, pos); + } while (next); +} + +/** + * kernfs_remove - remove a kernfs_node recursively + * @kn: the kernfs_node to remove + * + * Remove @kn along with all its subdirectories and files. + */ +void kernfs_remove(struct kernfs_node *kn) +{ + struct kernfs_addrm_cxt acxt; + + kernfs_addrm_start(&acxt); + __kernfs_remove(&acxt, kn); + kernfs_addrm_finish(&acxt); +} + +/** + * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it + * @parent: parent of the target + * @name: name of the kernfs_node to remove + * @ns: namespace tag of the kernfs_node to remove + * + * Look for the kernfs_node with @name and @ns under @parent and remove it. + * Returns 0 on success, -ENOENT if such entry doesn't exist. + */ +int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, + const void *ns) +{ + struct kernfs_addrm_cxt acxt; + struct kernfs_node *kn; + + if (!parent) { + WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n", + name); + return -ENOENT; + } + + kernfs_addrm_start(&acxt); + + kn = kernfs_find_ns(parent, name, ns); + if (kn) + __kernfs_remove(&acxt, kn); + + kernfs_addrm_finish(&acxt); + + if (kn) + return 0; + else + return -ENOENT; +} + +/** + * kernfs_rename_ns - move and rename a kernfs_node + * @kn: target node + * @new_parent: new parent to put @sd under + * @new_name: new name + * @new_ns: new namespace tag + */ +int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name, const void *new_ns) +{ + int error; + + mutex_lock(&kernfs_mutex); + + error = -ENOENT; + if ((kn->flags | new_parent->flags) & KERNFS_REMOVED) + goto out; + + error = 0; + if ((kn->parent == new_parent) && (kn->ns == new_ns) && + (strcmp(kn->name, new_name) == 0)) + goto out; /* nothing to rename */ + + error = -EEXIST; + if (kernfs_find_ns(new_parent, new_name, new_ns)) + goto out; + + /* rename kernfs_node */ + if (strcmp(kn->name, new_name) != 0) { + error = -ENOMEM; + new_name = kstrdup(new_name, GFP_KERNEL); + if (!new_name) + goto out; + + if (kn->flags & KERNFS_STATIC_NAME) + kn->flags &= ~KERNFS_STATIC_NAME; + else + kfree(kn->name); + + kn->name = new_name; + } + + /* + * Move to the appropriate place in the appropriate directories rbtree. + */ + kernfs_unlink_sibling(kn); + kernfs_get(new_parent); + kernfs_put(kn->parent); + kn->ns = new_ns; + kn->hash = kernfs_name_hash(kn->name, kn->ns); + kn->parent = new_parent; + kernfs_link_sibling(kn); + + error = 0; + out: + mutex_unlock(&kernfs_mutex); + return error; +} + +/* Relationship between s_mode and the DT_xxx types */ +static inline unsigned char dt_type(struct kernfs_node *kn) +{ + return (kn->mode >> 12) & 15; +} + +static int kernfs_dir_fop_release(struct inode *inode, struct file *filp) +{ + kernfs_put(filp->private_data); + return 0; +} + +static struct kernfs_node *kernfs_dir_pos(const void *ns, + struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos) +{ + if (pos) { + int valid = !(pos->flags & KERNFS_REMOVED) && + pos->parent == parent && hash == pos->hash; + kernfs_put(pos); + if (!valid) + pos = NULL; + } + if (!pos && (hash > 1) && (hash < INT_MAX)) { + struct rb_node *node = parent->dir.children.rb_node; + while (node) { + pos = rb_to_kn(node); + + if (hash < pos->hash) + node = node->rb_left; + else if (hash > pos->hash) + node = node->rb_right; + else + break; + } + } + /* Skip over entries in the wrong namespace */ + while (pos && pos->ns != ns) { + struct rb_node *node = rb_next(&pos->rb); + if (!node) + pos = NULL; + else + pos = rb_to_kn(node); + } + return pos; +} + +static struct kernfs_node *kernfs_dir_next_pos(const void *ns, + struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos) +{ + pos = kernfs_dir_pos(ns, parent, ino, pos); + if (pos) + do { + struct rb_node *node = rb_next(&pos->rb); + if (!node) + pos = NULL; + else + pos = rb_to_kn(node); + } while (pos && pos->ns != ns); + return pos; +} + +static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) +{ + struct dentry *dentry = file->f_path.dentry; + struct kernfs_node *parent = dentry->d_fsdata; + struct kernfs_node *pos = file->private_data; + const void *ns = NULL; + + if (!dir_emit_dots(file, ctx)) + return 0; + mutex_lock(&kernfs_mutex); + + if (kernfs_ns_enabled(parent)) + ns = kernfs_info(dentry->d_sb)->ns; + + for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos); + pos; + pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) { + const char *name = pos->name; + unsigned int type = dt_type(pos); + int len = strlen(name); + ino_t ino = pos->ino; + + ctx->pos = pos->hash; + file->private_data = pos; + kernfs_get(pos); + + mutex_unlock(&kernfs_mutex); + if (!dir_emit(ctx, name, len, ino, type)) + return 0; + mutex_lock(&kernfs_mutex); + } + mutex_unlock(&kernfs_mutex); + file->private_data = NULL; + ctx->pos = INT_MAX; + return 0; +} + +static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset, + int whence) +{ + struct inode *inode = file_inode(file); + loff_t ret; + + mutex_lock(&inode->i_mutex); + ret = generic_file_llseek(file, offset, whence); + mutex_unlock(&inode->i_mutex); + + return ret; +} + +const struct file_operations kernfs_dir_fops = { + .read = generic_read_dir, + .iterate = kernfs_fop_readdir, + .release = kernfs_dir_fop_release, + .llseek = kernfs_dir_fop_llseek, +}; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c new file mode 100644 index 0000000..dbf397b --- /dev/null +++ b/fs/kernfs/file.c @@ -0,0 +1,867 @@ +/* + * fs/kernfs/file.c - kernfs file implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. + */ + +#include <linux/fs.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/poll.h> +#include <linux/pagemap.h> +#include <linux/sched.h> + +#include "kernfs-internal.h" + +/* + * There's one kernfs_open_file for each open file and one kernfs_open_node + * for each kernfs_node with one or more open files. + * + * kernfs_node->attr.open points to kernfs_open_node. attr.open is + * protected by kernfs_open_node_lock. + * + * filp->private_data points to seq_file whose ->private points to + * kernfs_open_file. kernfs_open_files are chained at + * kernfs_open_node->files, which is protected by kernfs_open_file_mutex. + */ +static DEFINE_SPINLOCK(kernfs_open_node_lock); +static DEFINE_MUTEX(kernfs_open_file_mutex); + +struct kernfs_open_node { + atomic_t refcnt; + atomic_t event; + wait_queue_head_t poll; + struct list_head files; /* goes through kernfs_open_file.list */ +}; + +static struct kernfs_open_file *kernfs_of(struct file *file) +{ + return ((struct seq_file *)file->private_data)->private; +} + +/* + * Determine the kernfs_ops for the given kernfs_node. This function must + * be called while holding an active reference. + */ +static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn) +{ + if (kn->flags & KERNFS_LOCKDEP) + lockdep_assert_held(kn); + return kn->attr.ops; +} + +/* + * As kernfs_seq_stop() is also called after kernfs_seq_start() or + * kernfs_seq_next() failure, it needs to distinguish whether it's stopping + * a seq_file iteration which is fully initialized with an active reference + * or an aborted kernfs_seq_start() due to get_active failure. The + * position pointer is the only context for each seq_file iteration and + * thus the stop condition should be encoded in it. As the return value is + * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable + * choice to indicate get_active failure. + * + * Unfortunately, this is complicated due to the optional custom seq_file + * operations which may return ERR_PTR(-ENODEV) too. kernfs_seq_stop() + * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or + * custom seq_file operations and thus can't decide whether put_active + * should be performed or not only on ERR_PTR(-ENODEV). + * + * This is worked around by factoring out the custom seq_stop() and + * put_active part into kernfs_seq_stop_active(), skipping it from + * kernfs_seq_stop() if ERR_PTR(-ENODEV) while invoking it directly after + * custom seq_file operations fail with ERR_PTR(-ENODEV) - this ensures + * that kernfs_seq_stop_active() is skipped only after get_active failure. + */ +static void kernfs_seq_stop_active(struct seq_file *sf, void *v) +{ + struct kernfs_open_file *of = sf->private; + const struct kernfs_ops *ops = kernfs_ops(of->kn); + + if (ops->seq_stop) + ops->seq_stop(sf, v); + kernfs_put_active(of->kn); +} + +static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) +{ + struct kernfs_open_file *of = sf->private; + const struct kernfs_ops *ops; + + /* + * @of->mutex nests outside active ref and is just to ensure that + * the ops aren't called concurrently for the same open file. + */ + mutex_lock(&of->mutex); + if (!kernfs_get_active(of->kn)) + return ERR_PTR(-ENODEV); + + ops = kernfs_ops(of->kn); + if (ops->seq_start) { + void *next = ops->seq_start(sf, ppos); + /* see the comment above kernfs_seq_stop_active() */ + if (next == ERR_PTR(-ENODEV)) + kernfs_seq_stop_active(sf, next); + return next; + } else { + /* + * The same behavior and code as single_open(). Returns + * !NULL if pos is at the beginning; otherwise, NULL. + */ + return NULL + !*ppos; + } +} + +static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) +{ + struct kernfs_open_file *of = sf->private; + const struct kernfs_ops *ops = kernfs_ops(of->kn); + + if (ops->seq_next) { + void *next = ops->seq_next(sf, v, ppos); + /* see the comment above kernfs_seq_stop_active() */ + if (next == ERR_PTR(-ENODEV)) + kernfs_seq_stop_active(sf, next); + return next; + } else { + /* + * The same behavior and code as single_open(), always + * terminate after the initial read. + */ + ++*ppos; + return NULL; + } +} + +static void kernfs_seq_stop(struct seq_file *sf, void *v) +{ + struct kernfs_open_file *of = sf->private; + + if (v != ERR_PTR(-ENODEV)) + kernfs_seq_stop_active(sf, v); + mutex_unlock(&of->mutex); +} + +static int kernfs_seq_show(struct seq_file *sf, void *v) +{ + struct kernfs_open_file *of = sf->private; + + of->event = atomic_read(&of->kn->attr.open->event); + + return of->kn->attr.ops->seq_show(sf, v); +} + +static const struct seq_operations kernfs_seq_ops = { + .start = kernfs_seq_start, + .next = kernfs_seq_next, + .stop = kernfs_seq_stop, + .show = kernfs_seq_show, +}; + +/* + * As reading a bin file can have side-effects, the exact offset and bytes + * specified in read(2) call should be passed to the read callback making + * it difficult to use seq_file. Implement simplistic custom buffering for + * bin files. + */ +static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, + char __user *user_buf, size_t count, + loff_t *ppos) +{ + ssize_t len = min_t(size_t, count, PAGE_SIZE); + const struct kernfs_ops *ops; + char *buf; + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* + * @of->mutex nests outside active ref and is just to ensure that + * the ops aren't called concurrently for the same open file. + */ + mutex_lock(&of->mutex); + if (!kernfs_get_active(of->kn)) { + len = -ENODEV; + mutex_unlock(&of->mutex); + goto out_free; + } + + ops = kernfs_ops(of->kn); + if (ops->read) + len = ops->read(of, buf, len, *ppos); + else + len = -EINVAL; + + kernfs_put_active(of->kn); + mutex_unlock(&of->mutex); + + if (len < 0) + goto out_free; + + if (copy_to_user(user_buf, buf, len)) { + len = -EFAULT; + goto out_free; + } + + *ppos += len; + + out_free: + kfree(buf); + return len; +} + +/** + * kernfs_fop_read - kernfs vfs read callback + * @file: file pointer + * @user_buf: data to write + * @count: number of bytes + * @ppos: starting offset + */ +static ssize_t kernfs_fop_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct kernfs_open_file *of = kernfs_of(file); + + if (of->kn->flags & KERNFS_HAS_SEQ_SHOW) + return seq_read(file, user_buf, count, ppos); + else + return kernfs_file_direct_read(of, user_buf, count, ppos); +} + +/** + * kernfs_fop_write - kernfs vfs write callback + * @file: file pointer + * @user_buf: data to write + * @count: number of bytes + * @ppos: starting offset + * + * Copy data in from userland and pass it to the matching kernfs write + * operation. + * + * There is no easy way for us to know if userspace is only doing a partial + * write, so we don't support them. We expect the entire buffer to come on + * the first write. Hint: if you're writing a value, first read the file, + * modify only the the value you're changing, then write entire buffer + * back. + */ +static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct kernfs_open_file *of = kernfs_of(file); + ssize_t len = min_t(size_t, count, PAGE_SIZE); + const struct kernfs_ops *ops; + char *buf; + + buf = kmalloc(len + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, user_buf, len)) { + len = -EFAULT; + goto out_free; + } + buf[len] = '\0'; /* guarantee string termination */ + + /* + * @of->mutex nests outside active ref and is just to ensure that + * the ops aren't called concurrently for the same open file. + */ + mutex_lock(&of->mutex); + if (!kernfs_get_active(of->kn)) { + mutex_unlock(&of->mutex); + len = -ENODEV; + goto out_free; + } + + ops = kernfs_ops(of->kn); + if (ops->write) + len = ops->write(of, buf, len, *ppos); + else + len = -EINVAL; + + kernfs_put_active(of->kn); + mutex_unlock(&of->mutex); + + if (len > 0) + *ppos += len; +out_free: + kfree(buf); + return len; +} + +static void kernfs_vma_open(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + + if (!of->vm_ops) + return; + + if (!kernfs_get_active(of->kn)) + return; + + if (of->vm_ops->open) + of->vm_ops->open(vma); + + kernfs_put_active(of->kn); +} + +static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + int ret; + + if (!of->vm_ops) + return VM_FAULT_SIGBUS; + + if (!kernfs_get_active(of->kn)) + return VM_FAULT_SIGBUS; + + ret = VM_FAULT_SIGBUS; + if (of->vm_ops->fault) + ret = of->vm_ops->fault(vma, vmf); + + kernfs_put_active(of->kn); + return ret; +} + +static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + int ret; + + if (!of->vm_ops) + return VM_FAULT_SIGBUS; + + if (!kernfs_get_active(of->kn)) + return VM_FAULT_SIGBUS; + + ret = 0; + if (of->vm_ops->page_mkwrite) + ret = of->vm_ops->page_mkwrite(vma, vmf); + else + file_update_time(file); + + kernfs_put_active(of->kn); + return ret; +} + +static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + int ret; + + if (!of->vm_ops) + return -EINVAL; + + if (!kernfs_get_active(of->kn)) + return -EINVAL; + + ret = -EINVAL; + if (of->vm_ops->access) + ret = of->vm_ops->access(vma, addr, buf, len, write); + + kernfs_put_active(of->kn); + return ret; +} + +#ifdef CONFIG_NUMA +static int kernfs_vma_set_policy(struct vm_area_struct *vma, + struct mempolicy *new) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + int ret; + + if (!of->vm_ops) + return 0; + + if (!kernfs_get_active(of->kn)) + return -EINVAL; + + ret = 0; + if (of->vm_ops->set_policy) + ret = of->vm_ops->set_policy(vma, new); + + kernfs_put_active(of->kn); + return ret; +} + +static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma, + unsigned long addr) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + struct mempolicy *pol; + + if (!of->vm_ops) + return vma->vm_policy; + + if (!kernfs_get_active(of->kn)) + return vma->vm_policy; + + pol = vma->vm_policy; + if (of->vm_ops->get_policy) + pol = of->vm_ops->get_policy(vma, addr); + + kernfs_put_active(of->kn); + return pol; +} + +static int kernfs_vma_migrate(struct vm_area_struct *vma, + const nodemask_t *from, const nodemask_t *to, + unsigned long flags) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + int ret; + + if (!of->vm_ops) + return 0; + + if (!kernfs_get_active(of->kn)) + return 0; + + ret = 0; + if (of->vm_ops->migrate) + ret = of->vm_ops->migrate(vma, from, to, flags); + + kernfs_put_active(of->kn); + return ret; +} +#endif + +static const struct vm_operations_struct kernfs_vm_ops = { + .open = kernfs_vma_open, + .fault = kernfs_vma_fault, + .page_mkwrite = kernfs_vma_page_mkwrite, + .access = kernfs_vma_access, +#ifdef CONFIG_NUMA + .set_policy = kernfs_vma_set_policy, + .get_policy = kernfs_vma_get_policy, + .migrate = kernfs_vma_migrate, +#endif +}; + +static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct kernfs_open_file *of = kernfs_of(file); + const struct kernfs_ops *ops; + int rc; + + /* + * mmap path and of->mutex are prone to triggering spurious lockdep + * warnings and we don't want to add spurious locking dependency + * between the two. Check whether mmap is actually implemented + * without grabbing @of->mutex by testing HAS_MMAP flag. See the + * comment in kernfs_file_open() for more details. + */ + if (!(of->kn->flags & KERNFS_HAS_MMAP)) + return -ENODEV; + + mutex_lock(&of->mutex); + + rc = -ENODEV; + if (!kernfs_get_active(of->kn)) + goto out_unlock; + + ops = kernfs_ops(of->kn); + rc = ops->mmap(of, vma); + + /* + * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() + * to satisfy versions of X which crash if the mmap fails: that + * substitutes a new vm_file, and we don't then want bin_vm_ops. + */ + if (vma->vm_file != file) + goto out_put; + + rc = -EINVAL; + if (of->mmapped && of->vm_ops != vma->vm_ops) + goto out_put; + + /* + * It is not possible to successfully wrap close. + * So error if someone is trying to use close. + */ + rc = -EINVAL; + if (vma->vm_ops && vma->vm_ops->close) + goto out_put; + + rc = 0; + of->mmapped = 1; + of->vm_ops = vma->vm_ops; + vma->vm_ops = &kernfs_vm_ops; +out_put: + kernfs_put_active(of->kn); +out_unlock: + mutex_unlock(&of->mutex); + + return rc; +} + +/** + * kernfs_get_open_node - get or create kernfs_open_node + * @kn: target kernfs_node + * @of: kernfs_open_file for this instance of open + * + * If @kn->attr.open exists, increment its reference count; otherwise, + * create one. @of is chained to the files list. + * + * LOCKING: + * Kernel thread context (may sleep). + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int kernfs_get_open_node(struct kernfs_node *kn, + struct kernfs_open_file *of) +{ + struct kernfs_open_node *on, *new_on = NULL; + + retry: + mutex_lock(&kernfs_open_file_mutex); + spin_lock_irq(&kernfs_open_node_lock); + + if (!kn->attr.open && new_on) { + kn->attr.open = new_on; + new_on = NULL; + } + + on = kn->attr.open; + if (on) { + atomic_inc(&on->refcnt); + list_add_tail(&of->list, &on->files); + } + + spin_unlock_irq(&kernfs_open_node_lock); + mutex_unlock(&kernfs_open_file_mutex); + + if (on) { + kfree(new_on); + return 0; + } + + /* not there, initialize a new one and retry */ + new_on = kmalloc(sizeof(*new_on), GFP_KERNEL); + if (!new_on) + return -ENOMEM; + + atomic_set(&new_on->refcnt, 0); + atomic_set(&new_on->event, 1); + init_waitqueue_head(&new_on->poll); + INIT_LIST_HEAD(&new_on->files); + goto retry; +} + +/** + * kernfs_put_open_node - put kernfs_open_node + * @kn: target kernfs_nodet + * @of: associated kernfs_open_file + * + * Put @kn->attr.open and unlink @of from the files list. If + * reference count reaches zero, disassociate and free it. + * + * LOCKING: + * None. + */ +static void kernfs_put_open_node(struct kernfs_node *kn, + struct kernfs_open_file *of) +{ + struct kernfs_open_node *on = kn->attr.open; + unsigned long flags; + + mutex_lock(&kernfs_open_file_mutex); + spin_lock_irqsave(&kernfs_open_node_lock, flags); + + if (of) + list_del(&of->list); + + if (atomic_dec_and_test(&on->refcnt)) + kn->attr.open = NULL; + else + on = NULL; + + spin_unlock_irqrestore(&kernfs_open_node_lock, flags); + mutex_unlock(&kernfs_open_file_mutex); + + kfree(on); +} + +static int kernfs_fop_open(struct inode *inode, struct file *file) +{ + struct kernfs_node *kn = file->f_path.dentry->d_fsdata; + const struct kernfs_ops *ops; + struct kernfs_open_file *of; + bool has_read, has_write, has_mmap; + int error = -EACCES; + + if (!kernfs_get_active(kn)) + return -ENODEV; + + ops = kernfs_ops(kn); + + has_read = ops->seq_show || ops->read || ops->mmap; + has_write = ops->write || ops->mmap; + has_mmap = ops->mmap; + + /* check perms and supported operations */ + if ((file->f_mode & FMODE_WRITE) && + (!(inode->i_mode & S_IWUGO) || !has_write)) + goto err_out; + + if ((file->f_mode & FMODE_READ) && + (!(inode->i_mode & S_IRUGO) || !has_read)) + goto err_out; + + /* allocate a kernfs_open_file for the file */ + error = -ENOMEM; + of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL); + if (!of) + goto err_out; + + /* + * The following is done to give a different lockdep key to + * @of->mutex for files which implement mmap. This is a rather + * crude way to avoid false positive lockdep warning around + * mm->mmap_sem - mmap nests @of->mutex under mm->mmap_sem and + * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under + * which mm->mmap_sem nests, while holding @of->mutex. As each + * open file has a separate mutex, it's okay as long as those don't + * happen on the same file. At this point, we can't easily give + * each file a separate locking class. Let's differentiate on + * whether the file has mmap or not for now. + * + * Both paths of the branch look the same. They're supposed to + * look that way and give @of->mutex different static lockdep keys. + */ + if (has_mmap) + mutex_init(&of->mutex); + else + mutex_init(&of->mutex); + + of->kn = kn; + of->file = file; + + /* + * Always instantiate seq_file even if read access doesn't use + * seq_file or is not requested. This unifies private data access + * and readable regular files are the vast majority anyway. + */ + if (ops->seq_show) + error = seq_open(file, &kernfs_seq_ops); + else + error = seq_open(file, NULL); + if (error) + goto err_free; + + ((struct seq_file *)file->private_data)->private = of; + + /* seq_file clears PWRITE unconditionally, restore it if WRITE */ + if (file->f_mode & FMODE_WRITE) + file->f_mode |= FMODE_PWRITE; + + /* make sure we have open node struct */ + error = kernfs_get_open_node(kn, of); + if (error) + goto err_close; + + /* open succeeded, put active references */ + kernfs_put_active(kn); + return 0; + +err_close: + seq_release(inode, file); +err_free: + kfree(of); +err_out: + kernfs_put_active(kn); + return error; +} + +static int kernfs_fop_release(struct inode *inode, struct file *filp) +{ + struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; + struct kernfs_open_file *of = kernfs_of(filp); + + kernfs_put_open_node(kn, of); + seq_release(inode, filp); + kfree(of); + + return 0; +} + +void kernfs_unmap_bin_file(struct kernfs_node *kn) +{ + struct kernfs_open_node *on; + struct kernfs_open_file *of; + + if (!(kn->flags & KERNFS_HAS_MMAP)) + return; + + spin_lock_irq(&kernfs_open_node_lock); + on = kn->attr.open; + if (on) + atomic_inc(&on->refcnt); + spin_unlock_irq(&kernfs_open_node_lock); + if (!on) + return; + + mutex_lock(&kernfs_open_file_mutex); + list_for_each_entry(of, &on->files, list) { + struct inode *inode = file_inode(of->file); + unmap_mapping_range(inode->i_mapping, 0, 0, 1); + } + mutex_unlock(&kernfs_open_file_mutex); + + kernfs_put_open_node(kn, NULL); +} + +/* + * Kernfs attribute files are pollable. The idea is that you read + * the content and then you use 'poll' or 'select' to wait for + * the content to change. When the content changes (assuming the + * manager for the kobject supports notification), poll will + * return POLLERR|POLLPRI, and select will return the fd whether + * it is waiting for read, write, or exceptions. + * Once poll/select indicates that the value has changed, you + * need to close and re-open the file, or seek to 0 and read again. + * Reminder: this only works for attributes which actively support + * it, and it is not possible to test an attribute from userspace + * to see if it supports poll (Neither 'poll' nor 'select' return + * an appropriate error code). When in doubt, set a suitable timeout value. + */ +static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait) +{ + struct kernfs_open_file *of = kernfs_of(filp); + struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; + struct kernfs_open_node *on = kn->attr.open; + + /* need parent for the kobj, grab both */ + if (!kernfs_get_active(kn)) + goto trigger; + + poll_wait(filp, &on->poll, wait); + + kernfs_put_active(kn); + + if (of->event != atomic_read(&on->event)) + goto trigger; + + return DEFAULT_POLLMASK; + + trigger: + return DEFAULT_POLLMASK|POLLERR|POLLPRI; +} + +/** + * kernfs_notify - notify a kernfs file + * @kn: file to notify + * + * Notify @kn such that poll(2) on @kn wakes up. + */ +void kernfs_notify(struct kernfs_node *kn) +{ + struct kernfs_open_node *on; + unsigned long flags; + + spin_lock_irqsave(&kernfs_open_node_lock, flags); + + if (!WARN_ON(kernfs_type(kn) != KERNFS_FILE)) { + on = kn->attr.open; + if (on) { + atomic_inc(&on->event); + wake_up_interruptible(&on->poll); + } + } + + spin_unlock_irqrestore(&kernfs_open_node_lock, flags); +} +EXPORT_SYMBOL_GPL(kernfs_notify); + +const struct file_operations kernfs_file_fops = { + .read = kernfs_fop_read, + .write = kernfs_fop_write, + .llseek = generic_file_llseek, + .mmap = kernfs_fop_mmap, + .open = kernfs_fop_open, + .release = kernfs_fop_release, + .poll = kernfs_fop_poll, +}; + +/** + * __kernfs_create_file - kernfs internal function to create a file + * @parent: directory to create the file in + * @name: name of the file + * @mode: mode of the file + * @size: size of the file + * @ops: kernfs operations for the file + * @priv: private data for the file + * @ns: optional namespace tag of the file + * @static_name: don't copy file name + * @key: lockdep key for the file's active_ref, %NULL to disable lockdep + * + * Returns the created node on success, ERR_PTR() value on error. + */ +struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, + const char *name, + umode_t mode, loff_t size, + const struct kernfs_ops *ops, + void *priv, const void *ns, + bool name_is_static, + struct lock_class_key *key) +{ + struct kernfs_addrm_cxt acxt; + struct kernfs_node *kn; + unsigned flags; + int rc; + + flags = KERNFS_FILE; + if (name_is_static) + flags |= KERNFS_STATIC_NAME; + + kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags); + if (!kn) + return ERR_PTR(-ENOMEM); + + kn->attr.ops = ops; + kn->attr.size = size; + kn->ns = ns; + kn->priv = priv; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (key) { + lockdep_init_map(&kn->dep_map, "s_active", key, 0); + kn->flags |= KERNFS_LOCKDEP; + } +#endif + + /* + * kn->attr.ops is accesible only while holding active ref. We + * need to know whether some ops are implemented outside active + * ref. Cache their existence in flags. + */ + if (ops->seq_show) + kn->flags |= KERNFS_HAS_SEQ_SHOW; + if (ops->mmap) + kn->flags |= KERNFS_HAS_MMAP; + + kernfs_addrm_start(&acxt); + rc = kernfs_add_one(&acxt, kn); + kernfs_addrm_finish(&acxt); + + if (rc) { + kernfs_put(kn); + return ERR_PTR(rc); + } + return kn; +} diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c new file mode 100644 index 0000000..e55126f --- /dev/null +++ b/fs/kernfs/inode.c @@ -0,0 +1,377 @@ +/* + * fs/kernfs/inode.c - kernfs inode implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. + */ + +#include <linux/pagemap.h> +#include <linux/backing-dev.h> +#include <linux/capability.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/xattr.h> +#include <linux/security.h> + +#include "kernfs-internal.h" + +static const struct address_space_operations kernfs_aops = { + .readpage = simple_readpage, + .write_begin = simple_write_begin, + .write_end = simple_write_end, +}; + +static struct backing_dev_info kernfs_bdi = { + .name = "kernfs", + .ra_pages = 0, /* No readahead */ + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, +}; + +static const struct inode_operations kernfs_iops = { + .permission = kernfs_iop_permission, + .setattr = kernfs_iop_setattr, + .getattr = kernfs_iop_getattr, + .setxattr = kernfs_iop_setxattr, + .removexattr = kernfs_iop_removexattr, + .getxattr = kernfs_iop_getxattr, + .listxattr = kernfs_iop_listxattr, +}; + +void __init kernfs_inode_init(void) +{ + if (bdi_init(&kernfs_bdi)) + panic("failed to init kernfs_bdi"); +} + +static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) +{ + struct iattr *iattrs; + + if (kn->iattr) + return kn->iattr; + + kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL); + if (!kn->iattr) + return NULL; + iattrs = &kn->iattr->ia_iattr; + + /* assign default attributes */ + iattrs->ia_mode = kn->mode; + iattrs->ia_uid = GLOBAL_ROOT_UID; + iattrs->ia_gid = GLOBAL_ROOT_GID; + iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; + + simple_xattrs_init(&kn->iattr->xattrs); + + return kn->iattr; +} + +static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) +{ + struct kernfs_iattrs *attrs; + struct iattr *iattrs; + unsigned int ia_valid = iattr->ia_valid; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + iattrs = &attrs->ia_iattr; + + if (ia_valid & ATTR_UID) + iattrs->ia_uid = iattr->ia_uid; + if (ia_valid & ATTR_GID) + iattrs->ia_gid = iattr->ia_gid; + if (ia_valid & ATTR_ATIME) + iattrs->ia_atime = iattr->ia_atime; + if (ia_valid & ATTR_MTIME) + iattrs->ia_mtime = iattr->ia_mtime; + if (ia_valid & ATTR_CTIME) + iattrs->ia_ctime = iattr->ia_ctime; + if (ia_valid & ATTR_MODE) { + umode_t mode = iattr->ia_mode; + iattrs->ia_mode = kn->mode = mode; + } + return 0; +} + +/** + * kernfs_setattr - set iattr on a node + * @kn: target node + * @iattr: iattr to set + * + * Returns 0 on success, -errno on failure. + */ +int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) +{ + int ret; + + mutex_lock(&kernfs_mutex); + ret = __kernfs_setattr(kn, iattr); + mutex_unlock(&kernfs_mutex); + return ret; +} + +int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + struct kernfs_node *kn = dentry->d_fsdata; + int error; + + if (!kn) + return -EINVAL; + + mutex_lock(&kernfs_mutex); + error = inode_change_ok(inode, iattr); + if (error) + goto out; + + error = __kernfs_setattr(kn, iattr); + if (error) + goto out; + + /* this ignores size changes */ + setattr_copy(inode, iattr); + +out: + mutex_unlock(&kernfs_mutex); + return error; +} + +static int kernfs_node_setsecdata(struct kernfs_node *kn, void **secdata, + u32 *secdata_len) +{ + struct kernfs_iattrs *attrs; + void *old_secdata; + size_t old_secdata_len; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + old_secdata = attrs->ia_secdata; + old_secdata_len = attrs->ia_secdata_len; + + attrs->ia_secdata = *secdata; + attrs->ia_secdata_len = *secdata_len; + + *secdata = old_secdata; + *secdata_len = old_secdata_len; + return 0; +} + +int kernfs_iop_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_iattrs *attrs; + void *secdata; + int error; + u32 secdata_len = 0; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { + const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; + error = security_inode_setsecurity(dentry->d_inode, suffix, + value, size, flags); + if (error) + return error; + error = security_inode_getsecctx(dentry->d_inode, + &secdata, &secdata_len); + if (error) + return error; + + mutex_lock(&kernfs_mutex); + error = kernfs_node_setsecdata(kn, &secdata, &secdata_len); + mutex_unlock(&kernfs_mutex); + + if (secdata) + security_release_secctx(secdata, secdata_len); + return error; + } else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) { + return simple_xattr_set(&attrs->xattrs, name, value, size, + flags); + } + + return -EINVAL; +} + +int kernfs_iop_removexattr(struct dentry *dentry, const char *name) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_iattrs *attrs; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + return simple_xattr_remove(&attrs->xattrs, name); +} + +ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf, + size_t size) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_iattrs *attrs; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + return simple_xattr_get(&attrs->xattrs, name, buf, size); +} + +ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_iattrs *attrs; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + return simple_xattr_list(&attrs->xattrs, buf, size); +} + +static inline void set_default_inode_attr(struct inode *inode, umode_t mode) +{ + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; +} + +static inline void set_inode_attr(struct inode *inode, struct iattr *iattr) +{ + inode->i_uid = iattr->ia_uid; + inode->i_gid = iattr->ia_gid; + inode->i_atime = iattr->ia_atime; + inode->i_mtime = iattr->ia_mtime; + inode->i_ctime = iattr->ia_ctime; +} + +static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) +{ + struct kernfs_iattrs *attrs = kn->iattr; + + inode->i_mode = kn->mode; + if (attrs) { + /* + * kernfs_node has non-default attributes get them from + * persistent copy in kernfs_node. + */ + set_inode_attr(inode, &attrs->ia_iattr); + security_inode_notifysecctx(inode, attrs->ia_secdata, + attrs->ia_secdata_len); + } + + if (kernfs_type(kn) == KERNFS_DIR) + set_nlink(inode, kn->dir.subdirs + 2); +} + +int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct inode *inode = dentry->d_inode; + + mutex_lock(&kernfs_mutex); + kernfs_refresh_inode(kn, inode); + mutex_unlock(&kernfs_mutex); + + generic_fillattr(inode, stat); + return 0; +} + +static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) +{ + kernfs_get(kn); + inode->i_private = kn; + inode->i_mapping->a_ops = &kernfs_aops; + inode->i_mapping->backing_dev_info = &kernfs_bdi; + inode->i_op = &kernfs_iops; + + set_default_inode_attr(inode, kn->mode); + kernfs_refresh_inode(kn, inode); + + /* initialize inode according to type */ + switch (kernfs_type(kn)) { + case KERNFS_DIR: + inode->i_op = &kernfs_dir_iops; + inode->i_fop = &kernfs_dir_fops; + break; + case KERNFS_FILE: + inode->i_size = kn->attr.size; + inode->i_fop = &kernfs_file_fops; + break; + case KERNFS_LINK: + inode->i_op = &kernfs_symlink_iops; + break; + default: + BUG(); + } + + unlock_new_inode(inode); +} + +/** + * kernfs_get_inode - get inode for kernfs_node + * @sb: super block + * @kn: kernfs_node to allocate inode for + * + * Get inode for @kn. If such inode doesn't exist, a new inode is + * allocated and basics are initialized. New inode is returned + * locked. + * + * LOCKING: + * Kernel thread context (may sleep). + * + * RETURNS: + * Pointer to allocated inode on success, NULL on failure. + */ +struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) +{ + struct inode *inode; + + inode = iget_locked(sb, kn->ino); + if (inode && (inode->i_state & I_NEW)) + kernfs_init_inode(kn, inode); + + return inode; +} + +/* + * The kernfs_node serves as both an inode and a directory entry for + * kernfs. To prevent the kernfs inode numbers from being freed + * prematurely we take a reference to kernfs_node from the kernfs inode. A + * super_operations.evict_inode() implementation is needed to drop that + * reference upon inode destruction. + */ +void kernfs_evict_inode(struct inode *inode) +{ + struct kernfs_node *kn = inode->i_private; + + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); + kernfs_put(kn); +} + +int kernfs_iop_permission(struct inode *inode, int mask) +{ + struct kernfs_node *kn; + + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + + kn = inode->i_private; + + mutex_lock(&kernfs_mutex); + kernfs_refresh_inode(kn, inode); + mutex_unlock(&kernfs_mutex); + + return generic_permission(inode, mask); +} diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h new file mode 100644 index 0000000..eb536b7 --- /dev/null +++ b/fs/kernfs/kernfs-internal.h @@ -0,0 +1,122 @@ +/* + * fs/kernfs/kernfs-internal.h - kernfs internal header file + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de> + * + * This file is released under the GPLv2. + */ + +#ifndef __KERNFS_INTERNAL_H +#define __KERNFS_INTERNAL_H + +#include <linux/lockdep.h> +#include <linux/fs.h> +#include <linux/mutex.h> +#include <linux/xattr.h> + +#include <linux/kernfs.h> + +struct kernfs_iattrs { + struct iattr ia_iattr; + void *ia_secdata; + u32 ia_secdata_len; + + struct simple_xattrs xattrs; +}; + +#define KN_DEACTIVATED_BIAS INT_MIN + +/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */ + +/** + * kernfs_root - find out the kernfs_root a kernfs_node belongs to + * @kn: kernfs_node of interest + * + * Return the kernfs_root @kn belongs to. + */ +static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn) +{ + /* if parent exists, it's always a dir; otherwise, @sd is a dir */ + if (kn->parent) + kn = kn->parent; + return kn->dir.root; +} + +/* + * Context structure to be used while adding/removing nodes. + */ +struct kernfs_addrm_cxt { + struct kernfs_node *removed; +}; + +/* + * mount.c + */ +struct kernfs_super_info { + /* + * The root associated with this super_block. Each super_block is + * identified by the root and ns it's associated with. + */ + struct kernfs_root *root; + + /* + * Each sb is associated with one namespace tag, currently the + * network namespace of the task which mounted this kernfs + * instance. If multiple tags become necessary, make the following + * an array and compare kernfs_node tag against every entry. + */ + const void *ns; +}; +#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) + +extern struct kmem_cache *kernfs_node_cache; + +/* + * inode.c + */ +struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn); +void kernfs_evict_inode(struct inode *inode); +int kernfs_iop_permission(struct inode *inode, int mask); +int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr); +int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +int kernfs_iop_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags); +int kernfs_iop_removexattr(struct dentry *dentry, const char *name); +ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf, + size_t size); +ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); +void kernfs_inode_init(void); + +/* + * dir.c + */ +extern struct mutex kernfs_mutex; +extern const struct dentry_operations kernfs_dops; +extern const struct file_operations kernfs_dir_fops; +extern const struct inode_operations kernfs_dir_iops; + +struct kernfs_node *kernfs_get_active(struct kernfs_node *kn); +void kernfs_put_active(struct kernfs_node *kn); +void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt); +int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn); +void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt); +struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, + const char *name, umode_t mode, + unsigned flags); + +/* + * file.c + */ +extern const struct file_operations kernfs_file_fops; + +void kernfs_unmap_bin_file(struct kernfs_node *kn); + +/* + * symlink.c + */ +extern const struct inode_operations kernfs_symlink_iops; + +#endif /* __KERNFS_INTERNAL_H */ diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c new file mode 100644 index 0000000..0d6ce89 --- /dev/null +++ b/fs/kernfs/mount.c @@ -0,0 +1,165 @@ +/* + * fs/kernfs/mount.c - kernfs mount implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. + */ + +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/init.h> +#include <linux/magic.h> +#include <linux/slab.h> +#include <linux/pagemap.h> + +#include "kernfs-internal.h" + +struct kmem_cache *kernfs_node_cache; + +static const struct super_operations kernfs_sops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .evict_inode = kernfs_evict_inode, +}; + +static int kernfs_fill_super(struct super_block *sb) +{ + struct kernfs_super_info *info = kernfs_info(sb); + struct inode *inode; + struct dentry *root; + + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = SYSFS_MAGIC; + sb->s_op = &kernfs_sops; + sb->s_time_gran = 1; + + /* get root inode, initialize and unlock it */ + mutex_lock(&kernfs_mutex); + inode = kernfs_get_inode(sb, info->root->kn); + mutex_unlock(&kernfs_mutex); + if (!inode) { + pr_debug("kernfs: could not get root inode\n"); + return -ENOMEM; + } + + /* instantiate and link root dentry */ + root = d_make_root(inode); + if (!root) { + pr_debug("%s: could not get root dentry!\n", __func__); + return -ENOMEM; + } + kernfs_get(info->root->kn); + root->d_fsdata = info->root->kn; + sb->s_root = root; + sb->s_d_op = &kernfs_dops; + return 0; +} + +static int kernfs_test_super(struct super_block *sb, void *data) +{ + struct kernfs_super_info *sb_info = kernfs_info(sb); + struct kernfs_super_info *info = data; + + return sb_info->root == info->root && sb_info->ns == info->ns; +} + +static int kernfs_set_super(struct super_block *sb, void *data) +{ + int error; + error = set_anon_super(sb, data); + if (!error) + sb->s_fs_info = data; + return error; +} + +/** + * kernfs_super_ns - determine the namespace tag of a kernfs super_block + * @sb: super_block of interest + * + * Return the namespace tag associated with kernfs super_block @sb. + */ +const void *kernfs_super_ns(struct super_block *sb) +{ + struct kernfs_super_info *info = kernfs_info(sb); + + return info->ns; +} + +/** + * kernfs_mount_ns - kernfs mount helper + * @fs_type: file_system_type of the fs being mounted + * @flags: mount flags specified for the mount + * @root: kernfs_root of the hierarchy being mounted + * @ns: optional namespace tag of the mount + * + * This is to be called from each kernfs user's file_system_type->mount() + * implementation, which should pass through the specified @fs_type and + * @flags, and specify the hierarchy and namespace tag to mount via @root + * and @ns, respectively. + * + * The return value can be passed to the vfs layer verbatim. + */ +struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, + struct kernfs_root *root, const void *ns) +{ + struct super_block *sb; + struct kernfs_super_info *info; + int error; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + + info->root = root; + info->ns = ns; + + sb = sget(fs_type, kernfs_test_super, kernfs_set_super, flags, info); + if (IS_ERR(sb) || sb->s_fs_info != info) + kfree(info); + if (IS_ERR(sb)) + return ERR_CAST(sb); + if (!sb->s_root) { + error = kernfs_fill_super(sb); + if (error) { + deactivate_locked_super(sb); + return ERR_PTR(error); + } + sb->s_flags |= MS_ACTIVE; + } + + return dget(sb->s_root); +} + +/** + * kernfs_kill_sb - kill_sb for kernfs + * @sb: super_block being killed + * + * This can be used directly for file_system_type->kill_sb(). If a kernfs + * user needs extra cleanup, it can implement its own kill_sb() and call + * this function at the end. + */ +void kernfs_kill_sb(struct super_block *sb) +{ + struct kernfs_super_info *info = kernfs_info(sb); + struct kernfs_node *root_kn = sb->s_root->d_fsdata; + + /* + * Remove the superblock from fs_supers/s_instances + * so we can't find it, before freeing kernfs_super_info. + */ + kill_anon_super(sb); + kfree(info); + kernfs_put(root_kn); +} + +void __init kernfs_init(void) +{ + kernfs_node_cache = kmem_cache_create("kernfs_node_cache", + sizeof(struct kernfs_node), + 0, SLAB_PANIC, NULL); + kernfs_inode_init(); +} diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c new file mode 100644 index 0000000..4d45705 --- /dev/null +++ b/fs/kernfs/symlink.c @@ -0,0 +1,151 @@ +/* + * fs/kernfs/symlink.c - kernfs symlink implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. + */ + +#include <linux/fs.h> +#include <linux/gfp.h> +#include <linux/namei.h> + +#include "kernfs-internal.h" + +/** + * kernfs_create_link - create a symlink + * @parent: directory to create the symlink in + * @name: name of the symlink + * @target: target node for the symlink to point to + * + * Returns the created node on success, ERR_PTR() value on error. + */ +struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, + const char *name, + struct kernfs_node *target) +{ + struct kernfs_node *kn; + struct kernfs_addrm_cxt acxt; + int error; + + kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK); + if (!kn) + return ERR_PTR(-ENOMEM); + + if (kernfs_ns_enabled(parent)) + kn->ns = target->ns; + kn->symlink.target_kn = target; + kernfs_get(target); /* ref owned by symlink */ + + kernfs_addrm_start(&acxt); + error = kernfs_add_one(&acxt, kn); + kernfs_addrm_finish(&acxt); + + if (!error) + return kn; + + kernfs_put(kn); + return ERR_PTR(error); +} + +static int kernfs_get_target_path(struct kernfs_node *parent, + struct kernfs_node *target, char *path) +{ + struct kernfs_node *base, *kn; + char *s = path; + int len = 0; + + /* go up to the root, stop at the base */ + base = parent; + while (base->parent) { + kn = target->parent; + while (kn->parent && base != kn) + kn = kn->parent; + + if (base == kn) + break; + + strcpy(s, "../"); + s += 3; + base = base->parent; + } + + /* determine end of target string for reverse fillup */ + kn = target; + while (kn->parent && kn != base) { + len += strlen(kn->name) + 1; + kn = kn->parent; + } + + /* check limits */ + if (len < 2) + return -EINVAL; + len--; + if ((s - path) + len > PATH_MAX) + return -ENAMETOOLONG; + + /* reverse fillup of target string from target to base */ + kn = target; + while (kn->parent && kn != base) { + int slen = strlen(kn->name); + + len -= slen; + strncpy(s + len, kn->name, slen); + if (len) + s[--len] = '/'; + + kn = kn->parent; + } + + return 0; +} + +static int kernfs_getlink(struct dentry *dentry, char *path) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *parent = kn->parent; + struct kernfs_node *target = kn->symlink.target_kn; + int error; + + mutex_lock(&kernfs_mutex); + error = kernfs_get_target_path(parent, target, path); + mutex_unlock(&kernfs_mutex); + + return error; +} + +static void *kernfs_iop_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + int error = -ENOMEM; + unsigned long page = get_zeroed_page(GFP_KERNEL); + if (page) { + error = kernfs_getlink(dentry, (char *) page); + if (error < 0) + free_page((unsigned long)page); + } + nd_set_link(nd, error ? ERR_PTR(error) : (char *)page); + return NULL; +} + +static void kernfs_iop_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + char *page = nd_get_link(nd); + if (!IS_ERR(page)) + free_page((unsigned long)page); +} + +const struct inode_operations kernfs_symlink_iops = { + .setxattr = kernfs_iop_setxattr, + .removexattr = kernfs_iop_removexattr, + .getxattr = kernfs_iop_getxattr, + .listxattr = kernfs_iop_listxattr, + .readlink = generic_readlink, + .follow_link = kernfs_iop_follow_link, + .put_link = kernfs_iop_put_link, + .setattr = kernfs_iop_setattr, + .getattr = kernfs_iop_getattr, + .permission = kernfs_iop_permission, +}; diff --git a/fs/namespace.c b/fs/namespace.c index ac2ce8a..22e5367 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2790,6 +2790,8 @@ void __init mnt_init(void) for (u = 0; u < HASH_SIZE; u++) INIT_LIST_HEAD(&mountpoint_hashtable[u]); + kernfs_init(); + err = sysfs_init(); if (err) printk(KERN_WARNING "%s: sysfs_init error: %d\n", @@ -2886,7 +2888,7 @@ bool fs_fully_visible(struct file_system_type *type) struct inode *inode = child->mnt_mountpoint->d_inode; if (!S_ISDIR(inode->i_mode)) goto next; - if (inode->i_nlink != 2) + if (inode->i_nlink > 2) goto next; } visible = true; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 9f6b486..a1a1916 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1440,17 +1440,19 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci, nilfs_clear_logs(&sci->sc_segbufs); - err = nilfs_segctor_extend_segments(sci, nilfs, nadd); - if (unlikely(err)) - return err; - if (sci->sc_stage.flags & NILFS_CF_SUFREED) { err = nilfs_sufile_cancel_freev(nilfs->ns_sufile, sci->sc_freesegs, sci->sc_nfreesegs, NULL); WARN_ON(err); /* do not happen */ + sci->sc_stage.flags &= ~NILFS_CF_SUFREED; } + + err = nilfs_segctor_extend_segments(sci, nilfs, nadd); + if (unlikely(err)) + return err; + nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA); sci->sc_stage = prev_stage; } diff --git a/fs/sysfs/Makefile b/fs/sysfs/Makefile index 8876ac1..6eff6e1 100644 --- a/fs/sysfs/Makefile +++ b/fs/sysfs/Makefile @@ -2,4 +2,4 @@ # Makefile for the sysfs virtual filesystem # -obj-y := inode.o file.o dir.o symlink.o mount.o group.o +obj-y := file.o dir.o symlink.o mount.o group.o diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 5e73d66..ee0d761 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -13,465 +13,31 @@ #undef DEBUG #include <linux/fs.h> -#include <linux/mount.h> -#include <linux/module.h> #include <linux/kobject.h> -#include <linux/namei.h> -#include <linux/idr.h> -#include <linux/completion.h> -#include <linux/mutex.h> #include <linux/slab.h> -#include <linux/security.h> -#include <linux/hash.h> #include "sysfs.h" -DEFINE_MUTEX(sysfs_mutex); DEFINE_SPINLOCK(sysfs_symlink_target_lock); -#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb) - -static DEFINE_SPINLOCK(sysfs_ino_lock); -static DEFINE_IDA(sysfs_ino_ida); - -/** - * sysfs_name_hash - * @name: Null terminated string to hash - * @ns: Namespace tag to hash - * - * Returns 31 bit hash of ns + name (so it fits in an off_t ) - */ -static unsigned int sysfs_name_hash(const char *name, const void *ns) -{ - unsigned long hash = init_name_hash(); - unsigned int len = strlen(name); - while (len--) - hash = partial_name_hash(*name++, hash); - hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31)); - hash &= 0x7fffffffU; - /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ - if (hash < 1) - hash += 2; - if (hash >= INT_MAX) - hash = INT_MAX - 1; - return hash; -} - -static int sysfs_name_compare(unsigned int hash, const char *name, - const void *ns, const struct sysfs_dirent *sd) -{ - if (hash != sd->s_hash) - return hash - sd->s_hash; - if (ns != sd->s_ns) - return ns - sd->s_ns; - return strcmp(name, sd->s_name); -} - -static int sysfs_sd_compare(const struct sysfs_dirent *left, - const struct sysfs_dirent *right) -{ - return sysfs_name_compare(left->s_hash, left->s_name, left->s_ns, - right); -} - -/** - * sysfs_link_sibling - link sysfs_dirent into sibling rbtree - * @sd: sysfs_dirent of interest - * - * Link @sd into its sibling rbtree which starts from - * sd->s_parent->s_dir.children. - * - * Locking: - * mutex_lock(sysfs_mutex) - * - * RETURNS: - * 0 on susccess -EEXIST on failure. - */ -static int sysfs_link_sibling(struct sysfs_dirent *sd) -{ - struct rb_node **node = &sd->s_parent->s_dir.children.rb_node; - struct rb_node *parent = NULL; - - if (sysfs_type(sd) == SYSFS_DIR) - sd->s_parent->s_dir.subdirs++; - - while (*node) { - struct sysfs_dirent *pos; - int result; - - pos = to_sysfs_dirent(*node); - parent = *node; - result = sysfs_sd_compare(sd, pos); - if (result < 0) - node = &pos->s_rb.rb_left; - else if (result > 0) - node = &pos->s_rb.rb_right; - else - return -EEXIST; - } - /* add new node and rebalance the tree */ - rb_link_node(&sd->s_rb, parent, node); - rb_insert_color(&sd->s_rb, &sd->s_parent->s_dir.children); - return 0; -} - -/** - * sysfs_unlink_sibling - unlink sysfs_dirent from sibling rbtree - * @sd: sysfs_dirent of interest - * - * Unlink @sd from its sibling rbtree which starts from - * sd->s_parent->s_dir.children. - * - * Locking: - * mutex_lock(sysfs_mutex) - */ -static void sysfs_unlink_sibling(struct sysfs_dirent *sd) -{ - if (sysfs_type(sd) == SYSFS_DIR) - sd->s_parent->s_dir.subdirs--; - - rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children); -} - -/** - * sysfs_get_active - get an active reference to sysfs_dirent - * @sd: sysfs_dirent to get an active reference to - * - * Get an active reference of @sd. This function is noop if @sd - * is NULL. - * - * RETURNS: - * Pointer to @sd on success, NULL on failure. - */ -struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) -{ - if (unlikely(!sd)) - return NULL; - - if (!atomic_inc_unless_negative(&sd->s_active)) - return NULL; - - if (likely(!sysfs_ignore_lockdep(sd))) - rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_); - return sd; -} - -/** - * sysfs_put_active - put an active reference to sysfs_dirent - * @sd: sysfs_dirent to put an active reference to - * - * Put an active reference to @sd. This function is noop if @sd - * is NULL. - */ -void sysfs_put_active(struct sysfs_dirent *sd) -{ - int v; - - if (unlikely(!sd)) - return; - - if (likely(!sysfs_ignore_lockdep(sd))) - rwsem_release(&sd->dep_map, 1, _RET_IP_); - v = atomic_dec_return(&sd->s_active); - if (likely(v != SD_DEACTIVATED_BIAS)) - return; - - /* atomic_dec_return() is a mb(), we'll always see the updated - * sd->u.completion. - */ - complete(sd->u.completion); -} - -/** - * sysfs_deactivate - deactivate sysfs_dirent - * @sd: sysfs_dirent to deactivate - * - * Deny new active references and drain existing ones. - */ -static void sysfs_deactivate(struct sysfs_dirent *sd) -{ - DECLARE_COMPLETION_ONSTACK(wait); - int v; - - BUG_ON(!(sd->s_flags & SYSFS_FLAG_REMOVED)); - - if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF)) - return; - - sd->u.completion = (void *)&wait; - - rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_); - /* atomic_add_return() is a mb(), put_active() will always see - * the updated sd->u.completion. - */ - v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active); - - if (v != SD_DEACTIVATED_BIAS) { - lock_contended(&sd->dep_map, _RET_IP_); - wait_for_completion(&wait); - } - - lock_acquired(&sd->dep_map, _RET_IP_); - rwsem_release(&sd->dep_map, 1, _RET_IP_); -} - -static int sysfs_alloc_ino(unsigned int *pino) -{ - int ino, rc; - - retry: - spin_lock(&sysfs_ino_lock); - rc = ida_get_new_above(&sysfs_ino_ida, 2, &ino); - spin_unlock(&sysfs_ino_lock); - - if (rc == -EAGAIN) { - if (ida_pre_get(&sysfs_ino_ida, GFP_KERNEL)) - goto retry; - rc = -ENOMEM; - } - - *pino = ino; - return rc; -} - -static void sysfs_free_ino(unsigned int ino) -{ - spin_lock(&sysfs_ino_lock); - ida_remove(&sysfs_ino_ida, ino); - spin_unlock(&sysfs_ino_lock); -} - -void release_sysfs_dirent(struct sysfs_dirent *sd) -{ - struct sysfs_dirent *parent_sd; - - repeat: - /* Moving/renaming is always done while holding reference. - * sd->s_parent won't change beneath us. - */ - parent_sd = sd->s_parent; - - WARN(!(sd->s_flags & SYSFS_FLAG_REMOVED), - "sysfs: free using entry: %s/%s\n", - parent_sd ? parent_sd->s_name : "", sd->s_name); - - if (sysfs_type(sd) == SYSFS_KOBJ_LINK) - sysfs_put(sd->s_symlink.target_sd); - if (sysfs_type(sd) & SYSFS_COPY_NAME) - kfree(sd->s_name); - if (sd->s_iattr && sd->s_iattr->ia_secdata) - security_release_secctx(sd->s_iattr->ia_secdata, - sd->s_iattr->ia_secdata_len); - kfree(sd->s_iattr); - sysfs_free_ino(sd->s_ino); - kmem_cache_free(sysfs_dir_cachep, sd); - - sd = parent_sd; - if (sd && atomic_dec_and_test(&sd->s_count)) - goto repeat; -} - -static int sysfs_dentry_delete(const struct dentry *dentry) -{ - struct sysfs_dirent *sd = dentry->d_fsdata; - return !(sd && !(sd->s_flags & SYSFS_FLAG_REMOVED)); -} - -static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags) -{ - struct sysfs_dirent *sd; - int type; - - if (flags & LOOKUP_RCU) - return -ECHILD; - - sd = dentry->d_fsdata; - mutex_lock(&sysfs_mutex); - - /* The sysfs dirent has been deleted */ - if (sd->s_flags & SYSFS_FLAG_REMOVED) - goto out_bad; - - /* The sysfs dirent has been moved? */ - if (dentry->d_parent->d_fsdata != sd->s_parent) - goto out_bad; - - /* The sysfs dirent has been renamed */ - if (strcmp(dentry->d_name.name, sd->s_name) != 0) - goto out_bad; - - /* The sysfs dirent has been moved to a different namespace */ - type = KOBJ_NS_TYPE_NONE; - if (sd->s_parent) { - type = sysfs_ns_type(sd->s_parent); - if (type != KOBJ_NS_TYPE_NONE && - sysfs_info(dentry->d_sb)->ns[type] != sd->s_ns) - goto out_bad; - } - - mutex_unlock(&sysfs_mutex); -out_valid: - return 1; -out_bad: - /* Remove the dentry from the dcache hashes. - * If this is a deleted dentry we use d_drop instead of d_delete - * so sysfs doesn't need to cope with negative dentries. - * - * If this is a dentry that has simply been renamed we - * use d_drop to remove it from the dcache lookup on its - * old parent. If this dentry persists later when a lookup - * is performed at its new name the dentry will be readded - * to the dcache hashes. - */ - mutex_unlock(&sysfs_mutex); - - /* If we have submounts we must allow the vfs caches - * to lie about the state of the filesystem to prevent - * leaks and other nasty things. - */ - if (check_submounts_and_drop(dentry) != 0) - goto out_valid; - - return 0; -} - -static void sysfs_dentry_release(struct dentry *dentry) -{ - sysfs_put(dentry->d_fsdata); -} - -const struct dentry_operations sysfs_dentry_ops = { - .d_revalidate = sysfs_dentry_revalidate, - .d_delete = sysfs_dentry_delete, - .d_release = sysfs_dentry_release, -}; - -struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) -{ - char *dup_name = NULL; - struct sysfs_dirent *sd; - - if (type & SYSFS_COPY_NAME) { - name = dup_name = kstrdup(name, GFP_KERNEL); - if (!name) - return NULL; - } - - sd = kmem_cache_zalloc(sysfs_dir_cachep, GFP_KERNEL); - if (!sd) - goto err_out1; - - if (sysfs_alloc_ino(&sd->s_ino)) - goto err_out2; - - atomic_set(&sd->s_count, 1); - atomic_set(&sd->s_active, 0); - - sd->s_name = name; - sd->s_mode = mode; - sd->s_flags = type | SYSFS_FLAG_REMOVED; - - return sd; - - err_out2: - kmem_cache_free(sysfs_dir_cachep, sd); - err_out1: - kfree(dup_name); - return NULL; -} - -/** - * sysfs_addrm_start - prepare for sysfs_dirent add/remove - * @acxt: pointer to sysfs_addrm_cxt to be used - * - * This function is called when the caller is about to add or remove - * sysfs_dirent. This function acquires sysfs_mutex. @acxt is used - * to keep and pass context to other addrm functions. - * - * LOCKING: - * Kernel thread context (may sleep). sysfs_mutex is locked on - * return. - */ -void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt) - __acquires(sysfs_mutex) -{ - memset(acxt, 0, sizeof(*acxt)); - - mutex_lock(&sysfs_mutex); -} - -/** - * __sysfs_add_one - add sysfs_dirent to parent without warning - * @acxt: addrm context to use - * @sd: sysfs_dirent to be added - * @parent_sd: the parent sysfs_dirent to add @sd to - * - * Get @parent_sd and set @sd->s_parent to it and increment nlink of - * the parent inode if @sd is a directory and link into the children - * list of the parent. - * - * This function should be called between calls to - * sysfs_addrm_start() and sysfs_addrm_finish() and should be - * passed the same @acxt as passed to sysfs_addrm_start(). - * - * LOCKING: - * Determined by sysfs_addrm_start(). - * - * RETURNS: - * 0 on success, -EEXIST if entry with the given name already - * exists. - */ -int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, - struct sysfs_dirent *parent_sd) -{ - struct sysfs_inode_attrs *ps_iattr; - int ret; - - if (!!sysfs_ns_type(parent_sd) != !!sd->s_ns) { - WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", - sysfs_ns_type(parent_sd) ? "required" : "invalid", - parent_sd->s_name, sd->s_name); - return -EINVAL; - } - - sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns); - sd->s_parent = sysfs_get(parent_sd); - - ret = sysfs_link_sibling(sd); - if (ret) - return ret; - - /* Update timestamps on the parent */ - ps_iattr = parent_sd->s_iattr; - if (ps_iattr) { - struct iattr *ps_iattrs = &ps_iattr->ia_iattr; - ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; - } - - /* Mark the entry added into directory tree */ - sd->s_flags &= ~SYSFS_FLAG_REMOVED; - - return 0; -} - /** * sysfs_pathname - return full path to sysfs dirent - * @sd: sysfs_dirent whose path we want + * @kn: kernfs_node whose path we want * @path: caller allocated buffer of size PATH_MAX * * Gives the name "/" to the sysfs_root entry; any path returned * is relative to wherever sysfs is mounted. */ -static char *sysfs_pathname(struct sysfs_dirent *sd, char *path) +static char *sysfs_pathname(struct kernfs_node *kn, char *path) { - if (sd->s_parent) { - sysfs_pathname(sd->s_parent, path); + if (kn->parent) { + sysfs_pathname(kn->parent, path); strlcat(path, "/", PATH_MAX); } - strlcat(path, sd->s_name, PATH_MAX); + strlcat(path, kn->name, PATH_MAX); return path; } -void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name) +void sysfs_warn_dup(struct kernfs_node *parent, const char *name) { char *path; @@ -489,445 +55,34 @@ void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name) } /** - * sysfs_add_one - add sysfs_dirent to parent - * @acxt: addrm context to use - * @sd: sysfs_dirent to be added - * @parent_sd: the parent sysfs_dirent to add @sd to - * - * Get @parent_sd and set @sd->s_parent to it and increment nlink of - * the parent inode if @sd is a directory and link into the children - * list of the parent. - * - * This function should be called between calls to - * sysfs_addrm_start() and sysfs_addrm_finish() and should be - * passed the same @acxt as passed to sysfs_addrm_start(). - * - * LOCKING: - * Determined by sysfs_addrm_start(). - * - * RETURNS: - * 0 on success, -EEXIST if entry with the given name already - * exists. - */ -int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, - struct sysfs_dirent *parent_sd) -{ - int ret; - - ret = __sysfs_add_one(acxt, sd, parent_sd); - - if (ret == -EEXIST) - sysfs_warn_dup(parent_sd, sd->s_name); - return ret; -} - -/** - * sysfs_remove_one - remove sysfs_dirent from parent - * @acxt: addrm context to use - * @sd: sysfs_dirent to be removed - * - * Mark @sd removed and drop nlink of parent inode if @sd is a - * directory. @sd is unlinked from the children list. - * - * This function should be called between calls to - * sysfs_addrm_start() and sysfs_addrm_finish() and should be - * passed the same @acxt as passed to sysfs_addrm_start(). - * - * LOCKING: - * Determined by sysfs_addrm_start(). - */ -static void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, - struct sysfs_dirent *sd) -{ - struct sysfs_inode_attrs *ps_iattr; - - /* - * Removal can be called multiple times on the same node. Only the - * first invocation is effective and puts the base ref. - */ - if (sd->s_flags & SYSFS_FLAG_REMOVED) - return; - - sysfs_unlink_sibling(sd); - - /* Update timestamps on the parent */ - ps_iattr = sd->s_parent->s_iattr; - if (ps_iattr) { - struct iattr *ps_iattrs = &ps_iattr->ia_iattr; - ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; - } - - sd->s_flags |= SYSFS_FLAG_REMOVED; - sd->u.removed_list = acxt->removed; - acxt->removed = sd; -} - -/** - * sysfs_addrm_finish - finish up sysfs_dirent add/remove - * @acxt: addrm context to finish up - * - * Finish up sysfs_dirent add/remove. Resources acquired by - * sysfs_addrm_start() are released and removed sysfs_dirents are - * cleaned up. - * - * LOCKING: - * sysfs_mutex is released. - */ -void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) - __releases(sysfs_mutex) -{ - /* release resources acquired by sysfs_addrm_start() */ - mutex_unlock(&sysfs_mutex); - - /* kill removed sysfs_dirents */ - while (acxt->removed) { - struct sysfs_dirent *sd = acxt->removed; - - acxt->removed = sd->u.removed_list; - - sysfs_deactivate(sd); - sysfs_unmap_bin_file(sd); - sysfs_put(sd); - } -} - -/** - * sysfs_find_dirent - find sysfs_dirent with the given name - * @parent_sd: sysfs_dirent to search under - * @name: name to look for - * @ns: the namespace tag to use - * - * Look for sysfs_dirent with name @name under @parent_sd. - * - * LOCKING: - * mutex_lock(sysfs_mutex) - * - * RETURNS: - * Pointer to sysfs_dirent if found, NULL if not. - */ -struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, - const unsigned char *name, - const void *ns) -{ - struct rb_node *node = parent_sd->s_dir.children.rb_node; - unsigned int hash; - - if (!!sysfs_ns_type(parent_sd) != !!ns) { - WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", - sysfs_ns_type(parent_sd) ? "required" : "invalid", - parent_sd->s_name, name); - return NULL; - } - - hash = sysfs_name_hash(name, ns); - while (node) { - struct sysfs_dirent *sd; - int result; - - sd = to_sysfs_dirent(node); - result = sysfs_name_compare(hash, name, ns, sd); - if (result < 0) - node = node->rb_left; - else if (result > 0) - node = node->rb_right; - else - return sd; - } - return NULL; -} - -/** - * sysfs_get_dirent_ns - find and get sysfs_dirent with the given name - * @parent_sd: sysfs_dirent to search under - * @name: name to look for - * @ns: the namespace tag to use - * - * Look for sysfs_dirent with name @name under @parent_sd and get - * it if found. - * - * LOCKING: - * Kernel thread context (may sleep). Grabs sysfs_mutex. - * - * RETURNS: - * Pointer to sysfs_dirent if found, NULL if not. - */ -struct sysfs_dirent *sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd, - const unsigned char *name, - const void *ns) -{ - struct sysfs_dirent *sd; - - mutex_lock(&sysfs_mutex); - sd = sysfs_find_dirent(parent_sd, name, ns); - sysfs_get(sd); - mutex_unlock(&sysfs_mutex); - - return sd; -} -EXPORT_SYMBOL_GPL(sysfs_get_dirent_ns); - -static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, - enum kobj_ns_type type, - const char *name, const void *ns, - struct sysfs_dirent **p_sd) -{ - umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; - struct sysfs_addrm_cxt acxt; - struct sysfs_dirent *sd; - int rc; - - /* allocate */ - sd = sysfs_new_dirent(name, mode, SYSFS_DIR); - if (!sd) - return -ENOMEM; - - sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT); - sd->s_ns = ns; - sd->s_dir.kobj = kobj; - - /* link in */ - sysfs_addrm_start(&acxt); - rc = sysfs_add_one(&acxt, sd, parent_sd); - sysfs_addrm_finish(&acxt); - - if (rc == 0) - *p_sd = sd; - else - sysfs_put(sd); - - return rc; -} - -int sysfs_create_subdir(struct kobject *kobj, const char *name, - struct sysfs_dirent **p_sd) -{ - return create_dir(kobj, kobj->sd, - KOBJ_NS_TYPE_NONE, name, NULL, p_sd); -} - -/** - * sysfs_read_ns_type: return associated ns_type - * @kobj: the kobject being queried - * - * Each kobject can be tagged with exactly one namespace type - * (i.e. network or user). Return the ns_type associated with - * this object if any - */ -static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj) -{ - const struct kobj_ns_type_operations *ops; - enum kobj_ns_type type; - - ops = kobj_child_ns_ops(kobj); - if (!ops) - return KOBJ_NS_TYPE_NONE; - - type = ops->type; - BUG_ON(type <= KOBJ_NS_TYPE_NONE); - BUG_ON(type >= KOBJ_NS_TYPES); - BUG_ON(!kobj_ns_type_registered(type)); - - return type; -} - -/** * sysfs_create_dir_ns - create a directory for an object with a namespace tag * @kobj: object we're creating directory for * @ns: the namespace tag to use */ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { - enum kobj_ns_type type; - struct sysfs_dirent *parent_sd, *sd; - int error = 0; + struct kernfs_node *parent, *kn; BUG_ON(!kobj); if (kobj->parent) - parent_sd = kobj->parent->sd; + parent = kobj->parent->sd; else - parent_sd = &sysfs_root; + parent = sysfs_root_kn; - if (!parent_sd) + if (!parent) return -ENOENT; - type = sysfs_read_ns_type(kobj); - - error = create_dir(kobj, parent_sd, type, kobject_name(kobj), ns, &sd); - if (!error) - kobj->sd = sd; - return error; -} - -static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags) -{ - struct dentry *ret = NULL; - struct dentry *parent = dentry->d_parent; - struct sysfs_dirent *parent_sd = parent->d_fsdata; - struct sysfs_dirent *sd; - struct inode *inode; - enum kobj_ns_type type; - const void *ns; - - mutex_lock(&sysfs_mutex); - - type = sysfs_ns_type(parent_sd); - ns = sysfs_info(dir->i_sb)->ns[type]; - - sd = sysfs_find_dirent(parent_sd, dentry->d_name.name, ns); - - /* no such entry */ - if (!sd) { - ret = ERR_PTR(-ENOENT); - goto out_unlock; - } - dentry->d_fsdata = sysfs_get(sd); - - /* attach dentry and inode */ - inode = sysfs_get_inode(dir->i_sb, sd); - if (!inode) { - ret = ERR_PTR(-ENOMEM); - goto out_unlock; - } - - /* instantiate and hash dentry */ - ret = d_materialise_unique(dentry, inode); - out_unlock: - mutex_unlock(&sysfs_mutex); - return ret; -} - -const struct inode_operations sysfs_dir_inode_operations = { - .lookup = sysfs_lookup, - .permission = sysfs_permission, - .setattr = sysfs_setattr, - .getattr = sysfs_getattr, - .setxattr = sysfs_setxattr, -}; - -static struct sysfs_dirent *sysfs_leftmost_descendant(struct sysfs_dirent *pos) -{ - struct sysfs_dirent *last; - - while (true) { - struct rb_node *rbn; - - last = pos; - - if (sysfs_type(pos) != SYSFS_DIR) - break; - - rbn = rb_first(&pos->s_dir.children); - if (!rbn) - break; - - pos = to_sysfs_dirent(rbn); - } - - return last; -} - -/** - * sysfs_next_descendant_post - find the next descendant for post-order walk - * @pos: the current position (%NULL to initiate traversal) - * @root: sysfs_dirent whose descendants to walk - * - * Find the next descendant to visit for post-order traversal of @root's - * descendants. @root is included in the iteration and the last node to be - * visited. - */ -static struct sysfs_dirent *sysfs_next_descendant_post(struct sysfs_dirent *pos, - struct sysfs_dirent *root) -{ - struct rb_node *rbn; - - lockdep_assert_held(&sysfs_mutex); - - /* if first iteration, visit leftmost descendant which may be root */ - if (!pos) - return sysfs_leftmost_descendant(root); - - /* if we visited @root, we're done */ - if (pos == root) - return NULL; - - /* if there's an unvisited sibling, visit its leftmost descendant */ - rbn = rb_next(&pos->s_rb); - if (rbn) - return sysfs_leftmost_descendant(to_sysfs_dirent(rbn)); - - /* no sibling left, visit parent */ - return pos->s_parent; -} - -static void __sysfs_remove(struct sysfs_addrm_cxt *acxt, - struct sysfs_dirent *sd) -{ - struct sysfs_dirent *pos, *next; - - if (!sd) - return; - - pr_debug("sysfs %s: removing\n", sd->s_name); - - next = NULL; - do { - pos = next; - next = sysfs_next_descendant_post(pos, sd); - if (pos) - sysfs_remove_one(acxt, pos); - } while (next); -} - -/** - * sysfs_remove - remove a sysfs_dirent recursively - * @sd: the sysfs_dirent to remove - * - * Remove @sd along with all its subdirectories and files. - */ -void sysfs_remove(struct sysfs_dirent *sd) -{ - struct sysfs_addrm_cxt acxt; - - sysfs_addrm_start(&acxt); - __sysfs_remove(&acxt, sd); - sysfs_addrm_finish(&acxt); -} - -/** - * sysfs_hash_and_remove - find a sysfs_dirent by name and remove it - * @dir_sd: parent of the target - * @name: name of the sysfs_dirent to remove - * @ns: namespace tag of the sysfs_dirent to remove - * - * Look for the sysfs_dirent with @name and @ns under @dir_sd and remove - * it. Returns 0 on success, -ENOENT if such entry doesn't exist. - */ -int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name, - const void *ns) -{ - struct sysfs_addrm_cxt acxt; - struct sysfs_dirent *sd; - - if (!dir_sd) { - WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n", - name); - return -ENOENT; + kn = kernfs_create_dir_ns(parent, kobject_name(kobj), + S_IRWXU | S_IRUGO | S_IXUGO, kobj, ns); + if (IS_ERR(kn)) { + if (PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(parent, kobject_name(kobj)); + return PTR_ERR(kn); } - sysfs_addrm_start(&acxt); - - sd = sysfs_find_dirent(dir_sd, name, ns); - if (sd) - __sysfs_remove(&acxt, sd); - - sysfs_addrm_finish(&acxt); - - if (sd) - return 0; - else - return -ENOENT; + kobj->sd = kn; + return 0; } /** @@ -940,207 +95,47 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name, */ void sysfs_remove_dir(struct kobject *kobj) { - struct sysfs_dirent *sd = kobj->sd; + struct kernfs_node *kn = kobj->sd; /* * In general, kboject owner is responsible for ensuring removal * doesn't race with other operations and sysfs doesn't provide any * protection; however, when @kobj is used as a symlink target, the * symlinking entity usually doesn't own @kobj and thus has no - * control over removal. @kobj->sd may be removed anytime and - * symlink code may end up dereferencing an already freed sd. + * control over removal. @kobj->sd may be removed anytime + * and symlink code may end up dereferencing an already freed node. * - * sysfs_symlink_target_lock synchronizes @kobj->sd disassociation - * against symlink operations so that symlink code can safely - * dereference @kobj->sd. + * sysfs_symlink_target_lock synchronizes @kobj->sd + * disassociation against symlink operations so that symlink code + * can safely dereference @kobj->sd. */ spin_lock(&sysfs_symlink_target_lock); kobj->sd = NULL; spin_unlock(&sysfs_symlink_target_lock); - if (sd) { - WARN_ON_ONCE(sysfs_type(sd) != SYSFS_DIR); - sysfs_remove(sd); + if (kn) { + WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR); + kernfs_remove(kn); } } -int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd, - const char *new_name, const void *new_ns) -{ - int error; - - mutex_lock(&sysfs_mutex); - - error = 0; - if ((sd->s_parent == new_parent_sd) && (sd->s_ns == new_ns) && - (strcmp(sd->s_name, new_name) == 0)) - goto out; /* nothing to rename */ - - error = -EEXIST; - if (sysfs_find_dirent(new_parent_sd, new_name, new_ns)) - goto out; - - /* rename sysfs_dirent */ - if (strcmp(sd->s_name, new_name) != 0) { - error = -ENOMEM; - new_name = kstrdup(new_name, GFP_KERNEL); - if (!new_name) - goto out; - - kfree(sd->s_name); - sd->s_name = new_name; - } - - /* - * Move to the appropriate place in the appropriate directories rbtree. - */ - sysfs_unlink_sibling(sd); - sysfs_get(new_parent_sd); - sysfs_put(sd->s_parent); - sd->s_ns = new_ns; - sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns); - sd->s_parent = new_parent_sd; - sysfs_link_sibling(sd); - - error = 0; - out: - mutex_unlock(&sysfs_mutex); - return error; -} - int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns) { - struct sysfs_dirent *parent_sd = kobj->sd->s_parent; + struct kernfs_node *parent = kobj->sd->parent; - return sysfs_rename(kobj->sd, parent_sd, new_name, new_ns); + return kernfs_rename_ns(kobj->sd, parent, new_name, new_ns); } int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const void *new_ns) { - struct sysfs_dirent *sd = kobj->sd; - struct sysfs_dirent *new_parent_sd; + struct kernfs_node *kn = kobj->sd; + struct kernfs_node *new_parent; - BUG_ON(!sd->s_parent); - new_parent_sd = new_parent_kobj && new_parent_kobj->sd ? - new_parent_kobj->sd : &sysfs_root; + BUG_ON(!kn->parent); + new_parent = new_parent_kobj && new_parent_kobj->sd ? + new_parent_kobj->sd : sysfs_root_kn; - return sysfs_rename(sd, new_parent_sd, sd->s_name, new_ns); + return kernfs_rename_ns(kn, new_parent, kn->name, new_ns); } - -/* Relationship between s_mode and the DT_xxx types */ -static inline unsigned char dt_type(struct sysfs_dirent *sd) -{ - return (sd->s_mode >> 12) & 15; -} - -static int sysfs_dir_release(struct inode *inode, struct file *filp) -{ - sysfs_put(filp->private_data); - return 0; -} - -static struct sysfs_dirent *sysfs_dir_pos(const void *ns, - struct sysfs_dirent *parent_sd, loff_t hash, struct sysfs_dirent *pos) -{ - if (pos) { - int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) && - pos->s_parent == parent_sd && - hash == pos->s_hash; - sysfs_put(pos); - if (!valid) - pos = NULL; - } - if (!pos && (hash > 1) && (hash < INT_MAX)) { - struct rb_node *node = parent_sd->s_dir.children.rb_node; - while (node) { - pos = to_sysfs_dirent(node); - - if (hash < pos->s_hash) - node = node->rb_left; - else if (hash > pos->s_hash) - node = node->rb_right; - else - break; - } - } - /* Skip over entries in the wrong namespace */ - while (pos && pos->s_ns != ns) { - struct rb_node *node = rb_next(&pos->s_rb); - if (!node) - pos = NULL; - else - pos = to_sysfs_dirent(node); - } - return pos; -} - -static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, - struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos) -{ - pos = sysfs_dir_pos(ns, parent_sd, ino, pos); - if (pos) - do { - struct rb_node *node = rb_next(&pos->s_rb); - if (!node) - pos = NULL; - else - pos = to_sysfs_dirent(node); - } while (pos && pos->s_ns != ns); - return pos; -} - -static int sysfs_readdir(struct file *file, struct dir_context *ctx) -{ - struct dentry *dentry = file->f_path.dentry; - struct sysfs_dirent *parent_sd = dentry->d_fsdata; - struct sysfs_dirent *pos = file->private_data; - enum kobj_ns_type type; - const void *ns; - - type = sysfs_ns_type(parent_sd); - ns = sysfs_info(dentry->d_sb)->ns[type]; - - if (!dir_emit_dots(file, ctx)) - return 0; - mutex_lock(&sysfs_mutex); - for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos); - pos; - pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) { - const char *name = pos->s_name; - unsigned int type = dt_type(pos); - int len = strlen(name); - ino_t ino = pos->s_ino; - ctx->pos = pos->s_hash; - file->private_data = sysfs_get(pos); - - mutex_unlock(&sysfs_mutex); - if (!dir_emit(ctx, name, len, ino, type)) - return 0; - mutex_lock(&sysfs_mutex); - } - mutex_unlock(&sysfs_mutex); - file->private_data = NULL; - ctx->pos = INT_MAX; - return 0; -} - -static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence) -{ - struct inode *inode = file_inode(file); - loff_t ret; - - mutex_lock(&inode->i_mutex); - ret = generic_file_llseek(file, offset, whence); - mutex_unlock(&inode->i_mutex); - - return ret; -} - -const struct file_operations sysfs_dir_operations = { - .read = generic_read_dir, - .iterate = sysfs_readdir, - .release = sysfs_dir_release, - .llseek = sysfs_dir_llseek, -}; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 35e7d08..810cf6e 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -14,70 +14,23 @@ #include <linux/kobject.h> #include <linux/kallsyms.h> #include <linux/slab.h> -#include <linux/fsnotify.h> -#include <linux/namei.h> -#include <linux/poll.h> #include <linux/list.h> #include <linux/mutex.h> -#include <linux/limits.h> -#include <linux/uaccess.h> #include <linux/seq_file.h> -#include <linux/mm.h> #include "sysfs.h" +#include "../kernfs/kernfs-internal.h" /* - * There's one sysfs_open_file for each open file and one sysfs_open_dirent - * for each sysfs_dirent with one or more open files. - * - * sysfs_dirent->s_attr.open points to sysfs_open_dirent. s_attr.open is - * protected by sysfs_open_dirent_lock. - * - * filp->private_data points to seq_file whose ->private points to - * sysfs_open_file. sysfs_open_files are chained at - * sysfs_open_dirent->files, which is protected by sysfs_open_file_mutex. - */ -static DEFINE_SPINLOCK(sysfs_open_dirent_lock); -static DEFINE_MUTEX(sysfs_open_file_mutex); - -struct sysfs_open_dirent { - atomic_t refcnt; - atomic_t event; - wait_queue_head_t poll; - struct list_head files; /* goes through sysfs_open_file.list */ -}; - -struct sysfs_open_file { - struct sysfs_dirent *sd; - struct file *file; - struct mutex mutex; - int event; - struct list_head list; - - bool mmapped; - const struct vm_operations_struct *vm_ops; -}; - -static bool sysfs_is_bin(struct sysfs_dirent *sd) -{ - return sysfs_type(sd) == SYSFS_KOBJ_BIN_ATTR; -} - -static struct sysfs_open_file *sysfs_of(struct file *file) -{ - return ((struct seq_file *)file->private_data)->private; -} - -/* - * Determine ktype->sysfs_ops for the given sysfs_dirent. This function + * Determine ktype->sysfs_ops for the given kernfs_node. This function * must be called while holding an active reference. */ -static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd) +static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn) { - struct kobject *kobj = sd->s_parent->s_dir.kobj; + struct kobject *kobj = kn->parent->priv; - if (!sysfs_ignore_lockdep(sd)) - lockdep_assert_held(sd); + if (kn->flags & KERNFS_LOCKDEP) + lockdep_assert_held(kn); return kobj->ktype ? kobj->ktype->sysfs_ops : NULL; } @@ -86,13 +39,13 @@ static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd) * details like buffering and seeking. The following function pipes * sysfs_ops->show() result through seq_file. */ -static int sysfs_seq_show(struct seq_file *sf, void *v) +static int sysfs_kf_seq_show(struct seq_file *sf, void *v) { - struct sysfs_open_file *of = sf->private; - struct kobject *kobj = of->sd->s_parent->s_dir.kobj; - const struct sysfs_ops *ops; - char *buf; + struct kernfs_open_file *of = sf->private; + struct kobject *kobj = of->kn->parent->priv; + const struct sysfs_ops *ops = sysfs_file_ops(of->kn); ssize_t count; + char *buf; /* acquire buffer and ensure that it's >= PAGE_SIZE */ count = seq_get_buf(sf, &buf); @@ -102,34 +55,15 @@ static int sysfs_seq_show(struct seq_file *sf, void *v) } /* - * Need @of->sd for attr and ops, its parent for kobj. @of->mutex - * nests outside active ref and is just to ensure that the ops - * aren't called concurrently for the same open file. + * Invoke show(). Control may reach here via seq file lseek even + * if @ops->show() isn't implemented. */ - mutex_lock(&of->mutex); - if (!sysfs_get_active(of->sd)) { - mutex_unlock(&of->mutex); - return -ENODEV; + if (ops->show) { + count = ops->show(kobj, of->kn->priv, buf); + if (count < 0) + return count; } - of->event = atomic_read(&of->sd->s_attr.open->event); - - /* - * Lookup @ops and invoke show(). Control may reach here via seq - * file lseek even if @ops->show() isn't implemented. - */ - ops = sysfs_file_ops(of->sd); - if (ops->show) - count = ops->show(kobj, of->sd->s_attr.attr, buf); - else - count = 0; - - sysfs_put_active(of->sd); - mutex_unlock(&of->mutex); - - if (count < 0) - return count; - /* * The code works fine with PAGE_SIZE return but it's likely to * indicate truncated result or overflow in normal use cases. @@ -144,726 +78,194 @@ static int sysfs_seq_show(struct seq_file *sf, void *v) return 0; } -/* - * Read method for bin files. As reading a bin file can have side-effects, - * the exact offset and bytes specified in read(2) call should be passed to - * the read callback making it difficult to use seq_file. Implement - * simplistic custom buffering for bin files. - */ -static ssize_t sysfs_bin_read(struct file *file, char __user *userbuf, - size_t bytes, loff_t *off) +static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf, + size_t count, loff_t pos) { - struct sysfs_open_file *of = sysfs_of(file); - struct bin_attribute *battr = of->sd->s_attr.bin_attr; - struct kobject *kobj = of->sd->s_parent->s_dir.kobj; - loff_t size = file_inode(file)->i_size; - int count = min_t(size_t, bytes, PAGE_SIZE); - loff_t offs = *off; - char *buf; + struct bin_attribute *battr = of->kn->priv; + struct kobject *kobj = of->kn->parent->priv; + loff_t size = file_inode(of->file)->i_size; - if (!bytes) + if (!count) return 0; if (size) { - if (offs > size) + if (pos > size) return 0; - if (offs + count > size) - count = size - offs; - } - - buf = kmalloc(count, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - /* need of->sd for battr, its parent for kobj */ - mutex_lock(&of->mutex); - if (!sysfs_get_active(of->sd)) { - count = -ENODEV; - mutex_unlock(&of->mutex); - goto out_free; - } - - if (battr->read) - count = battr->read(file, kobj, battr, buf, offs, count); - else - count = -EIO; - - sysfs_put_active(of->sd); - mutex_unlock(&of->mutex); - - if (count < 0) - goto out_free; - - if (copy_to_user(userbuf, buf, count)) { - count = -EFAULT; - goto out_free; + if (pos + count > size) + count = size - pos; } - pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count); - - *off = offs + count; + if (!battr->read) + return -EIO; - out_free: - kfree(buf); - return count; + return battr->read(of->file, kobj, battr, buf, pos, count); } -/** - * flush_write_buffer - push buffer to kobject - * @of: open file - * @buf: data buffer for file - * @off: file offset to write to - * @count: number of bytes - * - * Get the correct pointers for the kobject and the attribute we're dealing - * with, then call the store() method for it with @buf. - */ -static int flush_write_buffer(struct sysfs_open_file *of, char *buf, loff_t off, - size_t count) +/* kernfs write callback for regular sysfs files */ +static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf, + size_t count, loff_t pos) { - struct kobject *kobj = of->sd->s_parent->s_dir.kobj; - int rc = 0; - - /* - * Need @of->sd for attr and ops, its parent for kobj. @of->mutex - * nests outside active ref and is just to ensure that the ops - * aren't called concurrently for the same open file. - */ - mutex_lock(&of->mutex); - if (!sysfs_get_active(of->sd)) { - mutex_unlock(&of->mutex); - return -ENODEV; - } + const struct sysfs_ops *ops = sysfs_file_ops(of->kn); + struct kobject *kobj = of->kn->parent->priv; - if (sysfs_is_bin(of->sd)) { - struct bin_attribute *battr = of->sd->s_attr.bin_attr; - - rc = -EIO; - if (battr->write) - rc = battr->write(of->file, kobj, battr, buf, off, - count); - } else { - const struct sysfs_ops *ops = sysfs_file_ops(of->sd); - - rc = ops->store(kobj, of->sd->s_attr.attr, buf, count); - } - - sysfs_put_active(of->sd); - mutex_unlock(&of->mutex); + if (!count) + return 0; - return rc; + return ops->store(kobj, of->kn->priv, buf, count); } -/** - * sysfs_write_file - write an attribute - * @file: file pointer - * @user_buf: data to write - * @count: number of bytes - * @ppos: starting offset - * - * Copy data in from userland and pass it to the matching - * sysfs_ops->store() by invoking flush_write_buffer(). - * - * There is no easy way for us to know if userspace is only doing a partial - * write, so we don't support them. We expect the entire buffer to come on - * the first write. Hint: if you're writing a value, first read the file, - * modify only the the value you're changing, then write entire buffer - * back. - */ -static ssize_t sysfs_write_file(struct file *file, const char __user *user_buf, - size_t count, loff_t *ppos) +/* kernfs write callback for bin sysfs files */ +static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf, + size_t count, loff_t pos) { - struct sysfs_open_file *of = sysfs_of(file); - ssize_t len = min_t(size_t, count, PAGE_SIZE); - loff_t size = file_inode(file)->i_size; - char *buf; + struct bin_attribute *battr = of->kn->priv; + struct kobject *kobj = of->kn->parent->priv; + loff_t size = file_inode(of->file)->i_size; - if (sysfs_is_bin(of->sd) && size) { - if (size <= *ppos) + if (size) { + if (size <= pos) return 0; - len = min_t(ssize_t, len, size - *ppos); + count = min_t(ssize_t, count, size - pos); } - - if (!len) + if (!count) return 0; - buf = kmalloc(len + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; + if (!battr->write) + return -EIO; - if (copy_from_user(buf, user_buf, len)) { - len = -EFAULT; - goto out_free; - } - buf[len] = '\0'; /* guarantee string termination */ - - len = flush_write_buffer(of, buf, *ppos, len); - if (len > 0) - *ppos += len; -out_free: - kfree(buf); - return len; -} - -static void sysfs_bin_vma_open(struct vm_area_struct *vma) -{ - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - - if (!of->vm_ops) - return; - - if (!sysfs_get_active(of->sd)) - return; - - if (of->vm_ops->open) - of->vm_ops->open(vma); - - sysfs_put_active(of->sd); + return battr->write(of->file, kobj, battr, buf, pos, count); } -static int sysfs_bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int sysfs_kf_bin_mmap(struct kernfs_open_file *of, + struct vm_area_struct *vma) { - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - int ret; + struct bin_attribute *battr = of->kn->priv; + struct kobject *kobj = of->kn->parent->priv; - if (!of->vm_ops) - return VM_FAULT_SIGBUS; - - if (!sysfs_get_active(of->sd)) - return VM_FAULT_SIGBUS; - - ret = VM_FAULT_SIGBUS; - if (of->vm_ops->fault) - ret = of->vm_ops->fault(vma, vmf); - - sysfs_put_active(of->sd); - return ret; + return battr->mmap(of->file, kobj, battr, vma); } -static int sysfs_bin_page_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) +void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr) { - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - int ret; - - if (!of->vm_ops) - return VM_FAULT_SIGBUS; + struct kernfs_node *kn = kobj->sd, *tmp; - if (!sysfs_get_active(of->sd)) - return VM_FAULT_SIGBUS; - - ret = 0; - if (of->vm_ops->page_mkwrite) - ret = of->vm_ops->page_mkwrite(vma, vmf); + if (kn && dir) + kn = kernfs_find_and_get(kn, dir); else - file_update_time(file); - - sysfs_put_active(of->sd); - return ret; -} - -static int sysfs_bin_access(struct vm_area_struct *vma, unsigned long addr, - void *buf, int len, int write) -{ - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - int ret; - - if (!of->vm_ops) - return -EINVAL; - - if (!sysfs_get_active(of->sd)) - return -EINVAL; - - ret = -EINVAL; - if (of->vm_ops->access) - ret = of->vm_ops->access(vma, addr, buf, len, write); - - sysfs_put_active(of->sd); - return ret; -} - -#ifdef CONFIG_NUMA -static int sysfs_bin_set_policy(struct vm_area_struct *vma, - struct mempolicy *new) -{ - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - int ret; - - if (!of->vm_ops) - return 0; - - if (!sysfs_get_active(of->sd)) - return -EINVAL; - - ret = 0; - if (of->vm_ops->set_policy) - ret = of->vm_ops->set_policy(vma, new); - - sysfs_put_active(of->sd); - return ret; -} - -static struct mempolicy *sysfs_bin_get_policy(struct vm_area_struct *vma, - unsigned long addr) -{ - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - struct mempolicy *pol; - - if (!of->vm_ops) - return vma->vm_policy; - - if (!sysfs_get_active(of->sd)) - return vma->vm_policy; - - pol = vma->vm_policy; - if (of->vm_ops->get_policy) - pol = of->vm_ops->get_policy(vma, addr); - - sysfs_put_active(of->sd); - return pol; -} - -static int sysfs_bin_migrate(struct vm_area_struct *vma, const nodemask_t *from, - const nodemask_t *to, unsigned long flags) -{ - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - int ret; - - if (!of->vm_ops) - return 0; - - if (!sysfs_get_active(of->sd)) - return 0; - - ret = 0; - if (of->vm_ops->migrate) - ret = of->vm_ops->migrate(vma, from, to, flags); - - sysfs_put_active(of->sd); - return ret; -} -#endif - -static const struct vm_operations_struct sysfs_bin_vm_ops = { - .open = sysfs_bin_vma_open, - .fault = sysfs_bin_fault, - .page_mkwrite = sysfs_bin_page_mkwrite, - .access = sysfs_bin_access, -#ifdef CONFIG_NUMA - .set_policy = sysfs_bin_set_policy, - .get_policy = sysfs_bin_get_policy, - .migrate = sysfs_bin_migrate, -#endif -}; - -static int sysfs_bin_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct sysfs_open_file *of = sysfs_of(file); - struct bin_attribute *battr = of->sd->s_attr.bin_attr; - struct kobject *kobj = of->sd->s_parent->s_dir.kobj; - int rc; - - mutex_lock(&of->mutex); - - /* need of->sd for battr, its parent for kobj */ - rc = -ENODEV; - if (!sysfs_get_active(of->sd)) - goto out_unlock; - - if (!battr->mmap) - goto out_put; - - rc = battr->mmap(file, kobj, battr, vma); - if (rc) - goto out_put; - - /* - * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() - * to satisfy versions of X which crash if the mmap fails: that - * substitutes a new vm_file, and we don't then want bin_vm_ops. - */ - if (vma->vm_file != file) - goto out_put; - - rc = -EINVAL; - if (of->mmapped && of->vm_ops != vma->vm_ops) - goto out_put; + kernfs_get(kn); - /* - * It is not possible to successfully wrap close. - * So error if someone is trying to use close. - */ - rc = -EINVAL; - if (vma->vm_ops && vma->vm_ops->close) - goto out_put; - - rc = 0; - of->mmapped = 1; - of->vm_ops = vma->vm_ops; - vma->vm_ops = &sysfs_bin_vm_ops; -out_put: - sysfs_put_active(of->sd); -out_unlock: - mutex_unlock(&of->mutex); - - return rc; -} - -/** - * sysfs_get_open_dirent - get or create sysfs_open_dirent - * @sd: target sysfs_dirent - * @of: sysfs_open_file for this instance of open - * - * If @sd->s_attr.open exists, increment its reference count; - * otherwise, create one. @of is chained to the files list. - * - * LOCKING: - * Kernel thread context (may sleep). - * - * RETURNS: - * 0 on success, -errno on failure. - */ -static int sysfs_get_open_dirent(struct sysfs_dirent *sd, - struct sysfs_open_file *of) -{ - struct sysfs_open_dirent *od, *new_od = NULL; - - retry: - mutex_lock(&sysfs_open_file_mutex); - spin_lock_irq(&sysfs_open_dirent_lock); - - if (!sd->s_attr.open && new_od) { - sd->s_attr.open = new_od; - new_od = NULL; + if (kn && attr) { + tmp = kernfs_find_and_get(kn, attr); + kernfs_put(kn); + kn = tmp; } - od = sd->s_attr.open; - if (od) { - atomic_inc(&od->refcnt); - list_add_tail(&of->list, &od->files); - } - - spin_unlock_irq(&sysfs_open_dirent_lock); - mutex_unlock(&sysfs_open_file_mutex); - - if (od) { - kfree(new_od); - return 0; + if (kn) { + kernfs_notify(kn); + kernfs_put(kn); } +} +EXPORT_SYMBOL_GPL(sysfs_notify); - /* not there, initialize a new one and retry */ - new_od = kmalloc(sizeof(*new_od), GFP_KERNEL); - if (!new_od) - return -ENOMEM; +static const struct kernfs_ops sysfs_file_kfops_empty = { +}; - atomic_set(&new_od->refcnt, 0); - atomic_set(&new_od->event, 1); - init_waitqueue_head(&new_od->poll); - INIT_LIST_HEAD(&new_od->files); - goto retry; -} +static const struct kernfs_ops sysfs_file_kfops_ro = { + .seq_show = sysfs_kf_seq_show, +}; -/** - * sysfs_put_open_dirent - put sysfs_open_dirent - * @sd: target sysfs_dirent - * @of: associated sysfs_open_file - * - * Put @sd->s_attr.open and unlink @of from the files list. If - * reference count reaches zero, disassociate and free it. - * - * LOCKING: - * None. - */ -static void sysfs_put_open_dirent(struct sysfs_dirent *sd, - struct sysfs_open_file *of) -{ - struct sysfs_open_dirent *od = sd->s_attr.open; - unsigned long flags; +static const struct kernfs_ops sysfs_file_kfops_wo = { + .write = sysfs_kf_write, +}; - mutex_lock(&sysfs_open_file_mutex); - spin_lock_irqsave(&sysfs_open_dirent_lock, flags); +static const struct kernfs_ops sysfs_file_kfops_rw = { + .seq_show = sysfs_kf_seq_show, + .write = sysfs_kf_write, +}; - if (of) - list_del(&of->list); +static const struct kernfs_ops sysfs_bin_kfops_ro = { + .read = sysfs_kf_bin_read, +}; - if (atomic_dec_and_test(&od->refcnt)) - sd->s_attr.open = NULL; - else - od = NULL; +static const struct kernfs_ops sysfs_bin_kfops_wo = { + .write = sysfs_kf_bin_write, +}; - spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); - mutex_unlock(&sysfs_open_file_mutex); +static const struct kernfs_ops sysfs_bin_kfops_rw = { + .read = sysfs_kf_bin_read, + .write = sysfs_kf_bin_write, +}; - kfree(od); -} +static const struct kernfs_ops sysfs_bin_kfops_mmap = { + .read = sysfs_kf_bin_read, + .write = sysfs_kf_bin_write, + .mmap = sysfs_kf_bin_mmap, +}; -static int sysfs_open_file(struct inode *inode, struct file *file) +int sysfs_add_file_mode_ns(struct kernfs_node *parent, + const struct attribute *attr, bool is_bin, + umode_t mode, const void *ns) { - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; - struct sysfs_open_file *of; - bool has_read, has_write; - int error = -EACCES; - - /* need attr_sd for attr and ops, its parent for kobj */ - if (!sysfs_get_active(attr_sd)) - return -ENODEV; + struct lock_class_key *key = NULL; + const struct kernfs_ops *ops; + struct kernfs_node *kn; + loff_t size; - if (sysfs_is_bin(attr_sd)) { - struct bin_attribute *battr = attr_sd->s_attr.bin_attr; - - has_read = battr->read || battr->mmap; - has_write = battr->write || battr->mmap; - } else { - const struct sysfs_ops *ops = sysfs_file_ops(attr_sd); + if (!is_bin) { + struct kobject *kobj = parent->priv; + const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops; /* every kobject with an attribute needs a ktype assigned */ - if (WARN(!ops, KERN_ERR + if (WARN(!sysfs_ops, KERN_ERR "missing sysfs attribute operations for kobject: %s\n", kobject_name(kobj))) - goto err_out; - - has_read = ops->show; - has_write = ops->store; - } - - /* check perms and supported operations */ - if ((file->f_mode & FMODE_WRITE) && - (!(inode->i_mode & S_IWUGO) || !has_write)) - goto err_out; - - if ((file->f_mode & FMODE_READ) && - (!(inode->i_mode & S_IRUGO) || !has_read)) - goto err_out; - - /* allocate a sysfs_open_file for the file */ - error = -ENOMEM; - of = kzalloc(sizeof(struct sysfs_open_file), GFP_KERNEL); - if (!of) - goto err_out; - - /* - * The following is done to give a different lockdep key to - * @of->mutex for files which implement mmap. This is a rather - * crude way to avoid false positive lockdep warning around - * mm->mmap_sem - mmap nests @of->mutex under mm->mmap_sem and - * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under - * which mm->mmap_sem nests, while holding @of->mutex. As each - * open file has a separate mutex, it's okay as long as those don't - * happen on the same file. At this point, we can't easily give - * each file a separate locking class. Let's differentiate on - * whether the file is bin or not for now. - */ - if (sysfs_is_bin(attr_sd)) - mutex_init(&of->mutex); - else - mutex_init(&of->mutex); - - of->sd = attr_sd; - of->file = file; - - /* - * Always instantiate seq_file even if read access doesn't use - * seq_file or is not requested. This unifies private data access - * and readable regular files are the vast majority anyway. - */ - if (sysfs_is_bin(attr_sd)) - error = single_open(file, NULL, of); - else - error = single_open(file, sysfs_seq_show, of); - if (error) - goto err_free; - - /* seq_file clears PWRITE unconditionally, restore it if WRITE */ - if (file->f_mode & FMODE_WRITE) - file->f_mode |= FMODE_PWRITE; - - /* make sure we have open dirent struct */ - error = sysfs_get_open_dirent(attr_sd, of); - if (error) - goto err_close; - - /* open succeeded, put active references */ - sysfs_put_active(attr_sd); - return 0; - -err_close: - single_release(inode, file); -err_free: - kfree(of); -err_out: - sysfs_put_active(attr_sd); - return error; -} - -static int sysfs_release(struct inode *inode, struct file *filp) -{ - struct sysfs_dirent *sd = filp->f_path.dentry->d_fsdata; - struct sysfs_open_file *of = sysfs_of(filp); - - sysfs_put_open_dirent(sd, of); - single_release(inode, filp); - kfree(of); - - return 0; -} - -void sysfs_unmap_bin_file(struct sysfs_dirent *sd) -{ - struct sysfs_open_dirent *od; - struct sysfs_open_file *of; - - if (!sysfs_is_bin(sd)) - return; - - spin_lock_irq(&sysfs_open_dirent_lock); - od = sd->s_attr.open; - if (od) - atomic_inc(&od->refcnt); - spin_unlock_irq(&sysfs_open_dirent_lock); - if (!od) - return; - - mutex_lock(&sysfs_open_file_mutex); - list_for_each_entry(of, &od->files, list) { - struct inode *inode = file_inode(of->file); - unmap_mapping_range(inode->i_mapping, 0, 0, 1); + return -EINVAL; + + if (sysfs_ops->show && sysfs_ops->store) + ops = &sysfs_file_kfops_rw; + else if (sysfs_ops->show) + ops = &sysfs_file_kfops_ro; + else if (sysfs_ops->store) + ops = &sysfs_file_kfops_wo; + else + ops = &sysfs_file_kfops_empty; + + size = PAGE_SIZE; + } else { + struct bin_attribute *battr = (void *)attr; + + if (battr->mmap) + ops = &sysfs_bin_kfops_mmap; + else if (battr->read && battr->write) + ops = &sysfs_bin_kfops_rw; + else if (battr->read) + ops = &sysfs_bin_kfops_ro; + else if (battr->write) + ops = &sysfs_bin_kfops_wo; + else + ops = &sysfs_file_kfops_empty; + + size = battr->size; } - mutex_unlock(&sysfs_open_file_mutex); - - sysfs_put_open_dirent(sd, NULL); -} - -/* Sysfs attribute files are pollable. The idea is that you read - * the content and then you use 'poll' or 'select' to wait for - * the content to change. When the content changes (assuming the - * manager for the kobject supports notification), poll will - * return POLLERR|POLLPRI, and select will return the fd whether - * it is waiting for read, write, or exceptions. - * Once poll/select indicates that the value has changed, you - * need to close and re-open the file, or seek to 0 and read again. - * Reminder: this only works for attributes which actively support - * it, and it is not possible to test an attribute from userspace - * to see if it supports poll (Neither 'poll' nor 'select' return - * an appropriate error code). When in doubt, set a suitable timeout value. - */ -static unsigned int sysfs_poll(struct file *filp, poll_table *wait) -{ - struct sysfs_open_file *of = sysfs_of(filp); - struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata; - struct sysfs_open_dirent *od = attr_sd->s_attr.open; - - /* need parent for the kobj, grab both */ - if (!sysfs_get_active(attr_sd)) - goto trigger; - - poll_wait(filp, &od->poll, wait); - sysfs_put_active(attr_sd); - - if (of->event != atomic_read(&od->event)) - goto trigger; - - return DEFAULT_POLLMASK; - - trigger: - return DEFAULT_POLLMASK|POLLERR|POLLPRI; -} - -void sysfs_notify_dirent(struct sysfs_dirent *sd) -{ - struct sysfs_open_dirent *od; - unsigned long flags; - - spin_lock_irqsave(&sysfs_open_dirent_lock, flags); - - if (!WARN_ON(sysfs_type(sd) != SYSFS_KOBJ_ATTR)) { - od = sd->s_attr.open; - if (od) { - atomic_inc(&od->event); - wake_up_interruptible(&od->poll); - } +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (!attr->ignore_lockdep) + key = attr->key ?: (struct lock_class_key *)&attr->skey; +#endif + kn = __kernfs_create_file(parent, attr->name, mode, size, ops, + (void *)attr, ns, true, key); + if (IS_ERR(kn)) { + if (PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(parent, attr->name); + return PTR_ERR(kn); } - - spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); -} -EXPORT_SYMBOL_GPL(sysfs_notify_dirent); - -void sysfs_notify(struct kobject *k, const char *dir, const char *attr) -{ - struct sysfs_dirent *sd = k->sd; - - mutex_lock(&sysfs_mutex); - - if (sd && dir) - sd = sysfs_find_dirent(sd, dir, NULL); - if (sd && attr) - sd = sysfs_find_dirent(sd, attr, NULL); - if (sd) - sysfs_notify_dirent(sd); - - mutex_unlock(&sysfs_mutex); -} -EXPORT_SYMBOL_GPL(sysfs_notify); - -const struct file_operations sysfs_file_operations = { - .read = seq_read, - .write = sysfs_write_file, - .llseek = generic_file_llseek, - .open = sysfs_open_file, - .release = sysfs_release, - .poll = sysfs_poll, -}; - -const struct file_operations sysfs_bin_operations = { - .read = sysfs_bin_read, - .write = sysfs_write_file, - .llseek = generic_file_llseek, - .mmap = sysfs_bin_mmap, - .open = sysfs_open_file, - .release = sysfs_release, - .poll = sysfs_poll, -}; - -int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, - const struct attribute *attr, int type, - umode_t amode, const void *ns) -{ - umode_t mode = (amode & S_IALLUGO) | S_IFREG; - struct sysfs_addrm_cxt acxt; - struct sysfs_dirent *sd; - int rc; - - sd = sysfs_new_dirent(attr->name, mode, type); - if (!sd) - return -ENOMEM; - - sd->s_ns = ns; - sd->s_attr.attr = (void *)attr; - sysfs_dirent_init_lockdep(sd); - - sysfs_addrm_start(&acxt); - rc = sysfs_add_one(&acxt, sd, dir_sd); - sysfs_addrm_finish(&acxt); - - if (rc) - sysfs_put(sd); - - return rc; + return 0; } - -int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, - int type) +int sysfs_add_file(struct kernfs_node *parent, const struct attribute *attr, + bool is_bin) { - return sysfs_add_file_mode_ns(dir_sd, attr, type, attr->mode, NULL); + return sysfs_add_file_mode_ns(parent, attr, is_bin, attr->mode, NULL); } /** @@ -877,8 +279,7 @@ int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, { BUG_ON(!kobj || !kobj->sd || !attr); - return sysfs_add_file_mode_ns(kobj->sd, attr, SYSFS_KOBJ_ATTR, - attr->mode, ns); + return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode, ns); } EXPORT_SYMBOL_GPL(sysfs_create_file_ns); @@ -906,19 +307,21 @@ EXPORT_SYMBOL_GPL(sysfs_create_files); int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; int error; - if (group) - dir_sd = sysfs_get_dirent(kobj->sd, group); - else - dir_sd = sysfs_get(kobj->sd); + if (group) { + parent = kernfs_find_and_get(kobj->sd, group); + } else { + parent = kobj->sd; + kernfs_get(parent); + } - if (!dir_sd) + if (!parent) return -ENOENT; - error = sysfs_add_file(dir_sd, attr, SYSFS_KOBJ_ATTR); - sysfs_put(dir_sd); + error = sysfs_add_file(parent, attr, false); + kernfs_put(parent); return error; } @@ -934,23 +337,20 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group); int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode) { - struct sysfs_dirent *sd; + struct kernfs_node *kn; struct iattr newattrs; int rc; - mutex_lock(&sysfs_mutex); - - rc = -ENOENT; - sd = sysfs_find_dirent(kobj->sd, attr->name, NULL); - if (!sd) - goto out; + kn = kernfs_find_and_get(kobj->sd, attr->name); + if (!kn) + return -ENOENT; - newattrs.ia_mode = (mode & S_IALLUGO) | (sd->s_mode & ~S_IALLUGO); + newattrs.ia_mode = (mode & S_IALLUGO) | (kn->mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE; - rc = sysfs_sd_setattr(sd, &newattrs); - out: - mutex_unlock(&sysfs_mutex); + rc = kernfs_setattr(kn, &newattrs); + + kernfs_put(kn); return rc; } EXPORT_SYMBOL_GPL(sysfs_chmod_file); @@ -966,9 +366,9 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file); void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { - struct sysfs_dirent *dir_sd = kobj->sd; + struct kernfs_node *parent = kobj->sd; - sysfs_hash_and_remove(dir_sd, attr->name, ns); + kernfs_remove_by_name_ns(parent, attr->name, ns); } EXPORT_SYMBOL_GPL(sysfs_remove_file_ns); @@ -989,15 +389,18 @@ EXPORT_SYMBOL_GPL(sysfs_remove_files); void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; - if (group) - dir_sd = sysfs_get_dirent(kobj->sd, group); - else - dir_sd = sysfs_get(kobj->sd); - if (dir_sd) { - sysfs_hash_and_remove(dir_sd, attr->name, NULL); - sysfs_put(dir_sd); + if (group) { + parent = kernfs_find_and_get(kobj->sd, group); + } else { + parent = kobj->sd; + kernfs_get(parent); + } + + if (parent) { + kernfs_remove_by_name(parent, attr->name); + kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group); @@ -1012,7 +415,7 @@ int sysfs_create_bin_file(struct kobject *kobj, { BUG_ON(!kobj || !kobj->sd || !attr); - return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR); + return sysfs_add_file(kobj->sd, &attr->attr, true); } EXPORT_SYMBOL_GPL(sysfs_create_bin_file); @@ -1024,7 +427,7 @@ EXPORT_SYMBOL_GPL(sysfs_create_bin_file); void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { - sysfs_hash_and_remove(kobj->sd, attr->attr.name, NULL); + kernfs_remove_by_name(kobj->sd, attr->attr.name); } EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 1898a10..6b57938 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -18,7 +18,7 @@ #include "sysfs.h" -static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, +static void remove_files(struct kernfs_node *parent, struct kobject *kobj, const struct attribute_group *grp) { struct attribute *const *attr; @@ -26,13 +26,13 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, if (grp->attrs) for (attr = grp->attrs; *attr; attr++) - sysfs_hash_and_remove(dir_sd, (*attr)->name, NULL); + kernfs_remove_by_name(parent, (*attr)->name); if (grp->bin_attrs) for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) sysfs_remove_bin_file(kobj, *bin_attr); } -static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, +static int create_files(struct kernfs_node *parent, struct kobject *kobj, const struct attribute_group *grp, int update) { struct attribute *const *attr; @@ -49,22 +49,20 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, * re-adding (if required) the file. */ if (update) - sysfs_hash_and_remove(dir_sd, (*attr)->name, - NULL); + kernfs_remove_by_name(parent, (*attr)->name); if (grp->is_visible) { mode = grp->is_visible(kobj, *attr, i); if (!mode) continue; } - error = sysfs_add_file_mode_ns(dir_sd, *attr, - SYSFS_KOBJ_ATTR, + error = sysfs_add_file_mode_ns(parent, *attr, false, (*attr)->mode | mode, NULL); if (unlikely(error)) break; } if (error) { - remove_files(dir_sd, kobj, grp); + remove_files(parent, kobj, grp); goto exit; } } @@ -78,7 +76,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, break; } if (error) - remove_files(dir_sd, kobj, grp); + remove_files(parent, kobj, grp); } exit: return error; @@ -88,7 +86,7 @@ exit: static int internal_create_group(struct kobject *kobj, int update, const struct attribute_group *grp) { - struct sysfs_dirent *sd; + struct kernfs_node *kn; int error; BUG_ON(!kobj || (!update && !kobj->sd)); @@ -102,18 +100,22 @@ static int internal_create_group(struct kobject *kobj, int update, return -EINVAL; } if (grp->name) { - error = sysfs_create_subdir(kobj, grp->name, &sd); - if (error) - return error; + kn = kernfs_create_dir(kobj->sd, grp->name, + S_IRWXU | S_IRUGO | S_IXUGO, kobj); + if (IS_ERR(kn)) { + if (PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(kobj->sd, grp->name); + return PTR_ERR(kn); + } } else - sd = kobj->sd; - sysfs_get(sd); - error = create_files(sd, kobj, grp, update); + kn = kobj->sd; + kernfs_get(kn); + error = create_files(kn, kobj, grp, update); if (error) { if (grp->name) - sysfs_remove(sd); + kernfs_remove(kn); } - sysfs_put(sd); + kernfs_put(kn); return error; } @@ -203,25 +205,27 @@ EXPORT_SYMBOL_GPL(sysfs_update_group); void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp) { - struct sysfs_dirent *dir_sd = kobj->sd; - struct sysfs_dirent *sd; + struct kernfs_node *parent = kobj->sd; + struct kernfs_node *kn; if (grp->name) { - sd = sysfs_get_dirent(dir_sd, grp->name); - if (!sd) { - WARN(!sd, KERN_WARNING + kn = kernfs_find_and_get(parent, grp->name); + if (!kn) { + WARN(!kn, KERN_WARNING "sysfs group %p not found for kobject '%s'\n", grp, kobject_name(kobj)); return; } - } else - sd = sysfs_get(dir_sd); + } else { + kn = parent; + kernfs_get(kn); + } - remove_files(sd, kobj, grp); + remove_files(kn, kobj, grp); if (grp->name) - sysfs_remove(sd); + kernfs_remove(kn); - sysfs_put(sd); + kernfs_put(kn); } EXPORT_SYMBOL_GPL(sysfs_remove_group); @@ -257,22 +261,22 @@ EXPORT_SYMBOL_GPL(sysfs_remove_groups); int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; int error = 0; struct attribute *const *attr; int i; - dir_sd = sysfs_get_dirent(kobj->sd, grp->name); - if (!dir_sd) + parent = kernfs_find_and_get(kobj->sd, grp->name); + if (!parent) return -ENOENT; for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr)) - error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR); + error = sysfs_add_file(parent, *attr, false); if (error) { while (--i >= 0) - sysfs_hash_and_remove(dir_sd, (*--attr)->name, NULL); + kernfs_remove_by_name(parent, (*--attr)->name); } - sysfs_put(dir_sd); + kernfs_put(parent); return error; } @@ -286,14 +290,14 @@ EXPORT_SYMBOL_GPL(sysfs_merge_group); void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; struct attribute *const *attr; - dir_sd = sysfs_get_dirent(kobj->sd, grp->name); - if (dir_sd) { + parent = kernfs_find_and_get(kobj->sd, grp->name); + if (parent) { for (attr = grp->attrs; *attr; ++attr) - sysfs_hash_and_remove(dir_sd, (*attr)->name, NULL); - sysfs_put(dir_sd); + kernfs_remove_by_name(parent, (*attr)->name); + kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_unmerge_group); @@ -308,15 +312,15 @@ EXPORT_SYMBOL_GPL(sysfs_unmerge_group); int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; int error = 0; - dir_sd = sysfs_get_dirent(kobj->sd, group_name); - if (!dir_sd) + parent = kernfs_find_and_get(kobj->sd, group_name); + if (!parent) return -ENOENT; - error = sysfs_create_link_sd(dir_sd, target, link_name); - sysfs_put(dir_sd); + error = sysfs_create_link_sd(parent, target, link_name); + kernfs_put(parent); return error; } @@ -331,12 +335,12 @@ EXPORT_SYMBOL_GPL(sysfs_add_link_to_group); void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; - dir_sd = sysfs_get_dirent(kobj->sd, group_name); - if (dir_sd) { - sysfs_hash_and_remove(dir_sd, link_name, NULL); - sysfs_put(dir_sd); + parent = kernfs_find_and_get(kobj->sd, group_name); + if (parent) { + kernfs_remove_by_name(parent, link_name); + kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c deleted file mode 100644 index 1750f79..0000000 --- a/fs/sysfs/inode.c +++ /dev/null @@ -1,331 +0,0 @@ -/* - * fs/sysfs/inode.c - basic sysfs inode and dentry operations - * - * Copyright (c) 2001-3 Patrick Mochel - * Copyright (c) 2007 SUSE Linux Products GmbH - * Copyright (c) 2007 Tejun Heo <teheo@suse.de> - * - * This file is released under the GPLv2. - * - * Please see Documentation/filesystems/sysfs.txt for more information. - */ - -#undef DEBUG - -#include <linux/pagemap.h> -#include <linux/namei.h> -#include <linux/backing-dev.h> -#include <linux/capability.h> -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/sysfs.h> -#include <linux/xattr.h> -#include <linux/security.h> -#include "sysfs.h" - -static const struct address_space_operations sysfs_aops = { - .readpage = simple_readpage, - .write_begin = simple_write_begin, - .write_end = simple_write_end, -}; - -static struct backing_dev_info sysfs_backing_dev_info = { - .name = "sysfs", - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - -static const struct inode_operations sysfs_inode_operations = { - .permission = sysfs_permission, - .setattr = sysfs_setattr, - .getattr = sysfs_getattr, - .setxattr = sysfs_setxattr, -}; - -int __init sysfs_inode_init(void) -{ - return bdi_init(&sysfs_backing_dev_info); -} - -static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) -{ - struct sysfs_inode_attrs *attrs; - struct iattr *iattrs; - - attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL); - if (!attrs) - return NULL; - iattrs = &attrs->ia_iattr; - - /* assign default attributes */ - iattrs->ia_mode = sd->s_mode; - iattrs->ia_uid = GLOBAL_ROOT_UID; - iattrs->ia_gid = GLOBAL_ROOT_GID; - iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; - - return attrs; -} - -int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr) -{ - struct sysfs_inode_attrs *sd_attrs; - struct iattr *iattrs; - unsigned int ia_valid = iattr->ia_valid; - - sd_attrs = sd->s_iattr; - - if (!sd_attrs) { - /* setting attributes for the first time, allocate now */ - sd_attrs = sysfs_init_inode_attrs(sd); - if (!sd_attrs) - return -ENOMEM; - sd->s_iattr = sd_attrs; - } - /* attributes were changed at least once in past */ - iattrs = &sd_attrs->ia_iattr; - - if (ia_valid & ATTR_UID) - iattrs->ia_uid = iattr->ia_uid; - if (ia_valid & ATTR_GID) - iattrs->ia_gid = iattr->ia_gid; - if (ia_valid & ATTR_ATIME) - iattrs->ia_atime = iattr->ia_atime; - if (ia_valid & ATTR_MTIME) - iattrs->ia_mtime = iattr->ia_mtime; - if (ia_valid & ATTR_CTIME) - iattrs->ia_ctime = iattr->ia_ctime; - if (ia_valid & ATTR_MODE) { - umode_t mode = iattr->ia_mode; - iattrs->ia_mode = sd->s_mode = mode; - } - return 0; -} - -int sysfs_setattr(struct dentry *dentry, struct iattr *iattr) -{ - struct inode *inode = dentry->d_inode; - struct sysfs_dirent *sd = dentry->d_fsdata; - int error; - - if (!sd) - return -EINVAL; - - mutex_lock(&sysfs_mutex); - error = inode_change_ok(inode, iattr); - if (error) - goto out; - - error = sysfs_sd_setattr(sd, iattr); - if (error) - goto out; - - /* this ignores size changes */ - setattr_copy(inode, iattr); - -out: - mutex_unlock(&sysfs_mutex); - return error; -} - -static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata, - u32 *secdata_len) -{ - struct sysfs_inode_attrs *iattrs; - void *old_secdata; - size_t old_secdata_len; - - if (!sd->s_iattr) { - sd->s_iattr = sysfs_init_inode_attrs(sd); - if (!sd->s_iattr) - return -ENOMEM; - } - - iattrs = sd->s_iattr; - old_secdata = iattrs->ia_secdata; - old_secdata_len = iattrs->ia_secdata_len; - - iattrs->ia_secdata = *secdata; - iattrs->ia_secdata_len = *secdata_len; - - *secdata = old_secdata; - *secdata_len = old_secdata_len; - return 0; -} - -int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags) -{ - struct sysfs_dirent *sd = dentry->d_fsdata; - void *secdata; - int error; - u32 secdata_len = 0; - - if (!sd) - return -EINVAL; - - if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { - const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; - error = security_inode_setsecurity(dentry->d_inode, suffix, - value, size, flags); - if (error) - goto out; - error = security_inode_getsecctx(dentry->d_inode, - &secdata, &secdata_len); - if (error) - goto out; - - mutex_lock(&sysfs_mutex); - error = sysfs_sd_setsecdata(sd, &secdata, &secdata_len); - mutex_unlock(&sysfs_mutex); - - if (secdata) - security_release_secctx(secdata, secdata_len); - } else - return -EINVAL; -out: - return error; -} - -static inline void set_default_inode_attr(struct inode *inode, umode_t mode) -{ - inode->i_mode = mode; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; -} - -static inline void set_inode_attr(struct inode *inode, struct iattr *iattr) -{ - inode->i_uid = iattr->ia_uid; - inode->i_gid = iattr->ia_gid; - inode->i_atime = iattr->ia_atime; - inode->i_mtime = iattr->ia_mtime; - inode->i_ctime = iattr->ia_ctime; -} - -static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) -{ - struct sysfs_inode_attrs *iattrs = sd->s_iattr; - - inode->i_mode = sd->s_mode; - if (iattrs) { - /* sysfs_dirent has non-default attributes - * get them from persistent copy in sysfs_dirent - */ - set_inode_attr(inode, &iattrs->ia_iattr); - security_inode_notifysecctx(inode, - iattrs->ia_secdata, - iattrs->ia_secdata_len); - } - - if (sysfs_type(sd) == SYSFS_DIR) - set_nlink(inode, sd->s_dir.subdirs + 2); -} - -int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) -{ - struct sysfs_dirent *sd = dentry->d_fsdata; - struct inode *inode = dentry->d_inode; - - mutex_lock(&sysfs_mutex); - sysfs_refresh_inode(sd, inode); - mutex_unlock(&sysfs_mutex); - - generic_fillattr(inode, stat); - return 0; -} - -static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) -{ - struct bin_attribute *bin_attr; - - inode->i_private = sysfs_get(sd); - inode->i_mapping->a_ops = &sysfs_aops; - inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; - inode->i_op = &sysfs_inode_operations; - - set_default_inode_attr(inode, sd->s_mode); - sysfs_refresh_inode(sd, inode); - - /* initialize inode according to type */ - switch (sysfs_type(sd)) { - case SYSFS_DIR: - inode->i_op = &sysfs_dir_inode_operations; - inode->i_fop = &sysfs_dir_operations; - break; - case SYSFS_KOBJ_ATTR: - inode->i_size = PAGE_SIZE; - inode->i_fop = &sysfs_file_operations; - break; - case SYSFS_KOBJ_BIN_ATTR: - bin_attr = sd->s_attr.bin_attr; - inode->i_size = bin_attr->size; - inode->i_fop = &sysfs_bin_operations; - break; - case SYSFS_KOBJ_LINK: - inode->i_op = &sysfs_symlink_inode_operations; - break; - default: - BUG(); - } - - unlock_new_inode(inode); -} - -/** - * sysfs_get_inode - get inode for sysfs_dirent - * @sb: super block - * @sd: sysfs_dirent to allocate inode for - * - * Get inode for @sd. If such inode doesn't exist, a new inode - * is allocated and basics are initialized. New inode is - * returned locked. - * - * LOCKING: - * Kernel thread context (may sleep). - * - * RETURNS: - * Pointer to allocated inode on success, NULL on failure. - */ -struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd) -{ - struct inode *inode; - - inode = iget_locked(sb, sd->s_ino); - if (inode && (inode->i_state & I_NEW)) - sysfs_init_inode(sd, inode); - - return inode; -} - -/* - * The sysfs_dirent serves as both an inode and a directory entry for sysfs. - * To prevent the sysfs inode numbers from being freed prematurely we take a - * reference to sysfs_dirent from the sysfs inode. A - * super_operations.evict_inode() implementation is needed to drop that - * reference upon inode destruction. - */ -void sysfs_evict_inode(struct inode *inode) -{ - struct sysfs_dirent *sd = inode->i_private; - - truncate_inode_pages(&inode->i_data, 0); - clear_inode(inode); - sysfs_put(sd); -} - -int sysfs_permission(struct inode *inode, int mask) -{ - struct sysfs_dirent *sd; - - if (mask & MAY_NOT_BLOCK) - return -ECHILD; - - sd = inode->i_private; - - mutex_lock(&sysfs_mutex); - sysfs_refresh_inode(sd, inode); - mutex_unlock(&sysfs_mutex); - - return generic_permission(inode, mask); -} diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 834ec2c..6211230 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -14,146 +14,41 @@ #include <linux/fs.h> #include <linux/mount.h> -#include <linux/pagemap.h> #include <linux/init.h> -#include <linux/module.h> -#include <linux/magic.h> -#include <linux/slab.h> #include <linux/user_namespace.h> #include "sysfs.h" - -static struct vfsmount *sysfs_mnt; -struct kmem_cache *sysfs_dir_cachep; - -static const struct super_operations sysfs_ops = { - .statfs = simple_statfs, - .drop_inode = generic_delete_inode, - .evict_inode = sysfs_evict_inode, -}; - -struct sysfs_dirent sysfs_root = { - .s_name = "", - .s_count = ATOMIC_INIT(1), - .s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT), - .s_mode = S_IFDIR | S_IRUGO | S_IXUGO, - .s_ino = 1, -}; - -static int sysfs_fill_super(struct super_block *sb, void *data, int silent) -{ - struct inode *inode; - struct dentry *root; - - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = SYSFS_MAGIC; - sb->s_op = &sysfs_ops; - sb->s_time_gran = 1; - - /* get root inode, initialize and unlock it */ - mutex_lock(&sysfs_mutex); - inode = sysfs_get_inode(sb, &sysfs_root); - mutex_unlock(&sysfs_mutex); - if (!inode) { - pr_debug("sysfs: could not get root inode\n"); - return -ENOMEM; - } - - /* instantiate and link root dentry */ - root = d_make_root(inode); - if (!root) { - pr_debug("%s: could not get root dentry!\n", __func__); - return -ENOMEM; - } - root->d_fsdata = &sysfs_root; - sb->s_root = root; - sb->s_d_op = &sysfs_dentry_ops; - return 0; -} - -static int sysfs_test_super(struct super_block *sb, void *data) -{ - struct sysfs_super_info *sb_info = sysfs_info(sb); - struct sysfs_super_info *info = data; - enum kobj_ns_type type; - int found = 1; - - for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) { - if (sb_info->ns[type] != info->ns[type]) - found = 0; - } - return found; -} - -static int sysfs_set_super(struct super_block *sb, void *data) -{ - int error; - error = set_anon_super(sb, data); - if (!error) - sb->s_fs_info = data; - return error; -} - -static void free_sysfs_super_info(struct sysfs_super_info *info) -{ - int type; - for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) - kobj_ns_drop(type, info->ns[type]); - kfree(info); -} +static struct kernfs_root *sysfs_root; +struct kernfs_node *sysfs_root_kn; static struct dentry *sysfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - struct sysfs_super_info *info; - enum kobj_ns_type type; - struct super_block *sb; - int error; + struct dentry *root; + void *ns; if (!(flags & MS_KERNMOUNT)) { if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) return ERR_PTR(-EPERM); - for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) { - if (!kobj_ns_current_may_mount(type)) - return ERR_PTR(-EPERM); - } - } - - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) - return ERR_PTR(-ENOMEM); - - for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) - info->ns[type] = kobj_ns_grab_current(type); - - sb = sget(fs_type, sysfs_test_super, sysfs_set_super, flags, info); - if (IS_ERR(sb) || sb->s_fs_info != info) - free_sysfs_super_info(info); - if (IS_ERR(sb)) - return ERR_CAST(sb); - if (!sb->s_root) { - error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); - if (error) { - deactivate_locked_super(sb); - return ERR_PTR(error); - } - sb->s_flags |= MS_ACTIVE; + if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET)) + return ERR_PTR(-EPERM); } - return dget(sb->s_root); + ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); + root = kernfs_mount_ns(fs_type, flags, sysfs_root, ns); + if (IS_ERR(root)) + kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); + return root; } static void sysfs_kill_sb(struct super_block *sb) { - struct sysfs_super_info *info = sysfs_info(sb); - /* Remove the superblock from fs_supers/s_instances - * so we can't find it, before freeing sysfs_super_info. - */ - kill_anon_super(sb); - free_sysfs_super_info(info); + void *ns = (void *)kernfs_super_ns(sb); + + kernfs_kill_sb(sb); + kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); } static struct file_system_type sysfs_fs_type = { @@ -165,48 +60,19 @@ static struct file_system_type sysfs_fs_type = { int __init sysfs_init(void) { - int err = -ENOMEM; + int err; - sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache", - sizeof(struct sysfs_dirent), - 0, 0, NULL); - if (!sysfs_dir_cachep) - goto out; + sysfs_root = kernfs_create_root(NULL, NULL); + if (IS_ERR(sysfs_root)) + return PTR_ERR(sysfs_root); - err = sysfs_inode_init(); - if (err) - goto out_err; + sysfs_root_kn = sysfs_root->kn; err = register_filesystem(&sysfs_fs_type); - if (!err) { - sysfs_mnt = kern_mount(&sysfs_fs_type); - if (IS_ERR(sysfs_mnt)) { - printk(KERN_ERR "sysfs: could not mount!\n"); - err = PTR_ERR(sysfs_mnt); - sysfs_mnt = NULL; - unregister_filesystem(&sysfs_fs_type); - goto out_err; - } - } else - goto out_err; -out: - return err; -out_err: - kmem_cache_destroy(sysfs_dir_cachep); - sysfs_dir_cachep = NULL; - goto out; -} - -#undef sysfs_get -struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd) -{ - return __sysfs_get(sd); -} -EXPORT_SYMBOL_GPL(sysfs_get); + if (err) { + kernfs_destroy_root(sysfs_root); + return err; + } -#undef sysfs_put -void sysfs_put(struct sysfs_dirent *sd) -{ - __sysfs_put(sd); + return 0; } -EXPORT_SYMBOL_GPL(sysfs_put); diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 3ae3f1b..aecb15f 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -11,109 +11,73 @@ */ #include <linux/fs.h> -#include <linux/gfp.h> -#include <linux/mount.h> #include <linux/module.h> #include <linux/kobject.h> -#include <linux/namei.h> #include <linux/mutex.h> #include <linux/security.h> #include "sysfs.h" -static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd, - struct kobject *target, +static int sysfs_do_create_link_sd(struct kernfs_node *parent, + struct kobject *target_kobj, const char *name, int warn) { - struct sysfs_dirent *target_sd = NULL; - struct sysfs_dirent *sd = NULL; - struct sysfs_addrm_cxt acxt; - enum kobj_ns_type ns_type; - int error; + struct kernfs_node *kn, *target = NULL; - BUG_ON(!name || !parent_sd); + BUG_ON(!name || !parent); /* - * We don't own @target and it may be removed at any time. + * We don't own @target_kobj and it may be removed at any time. * Synchronize using sysfs_symlink_target_lock. See * sysfs_remove_dir() for details. */ spin_lock(&sysfs_symlink_target_lock); - if (target->sd) - target_sd = sysfs_get(target->sd); + if (target_kobj->sd) { + target = target_kobj->sd; + kernfs_get(target); + } spin_unlock(&sysfs_symlink_target_lock); - error = -ENOENT; - if (!target_sd) - goto out_put; - - error = -ENOMEM; - sd = sysfs_new_dirent(name, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK); - if (!sd) - goto out_put; + if (!target) + return -ENOENT; - ns_type = sysfs_ns_type(parent_sd); - if (ns_type) - sd->s_ns = target_sd->s_ns; - sd->s_symlink.target_sd = target_sd; - target_sd = NULL; /* reference is now owned by the symlink */ - - sysfs_addrm_start(&acxt); - /* Symlinks must be between directories with the same ns_type */ - if (!ns_type || - (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) { - if (warn) - error = sysfs_add_one(&acxt, sd, parent_sd); - else - error = __sysfs_add_one(&acxt, sd, parent_sd); - } else { - error = -EINVAL; - WARN(1, KERN_WARNING - "sysfs: symlink across ns_types %s/%s -> %s/%s\n", - parent_sd->s_name, - sd->s_name, - sd->s_symlink.target_sd->s_parent->s_name, - sd->s_symlink.target_sd->s_name); - } - sysfs_addrm_finish(&acxt); + kn = kernfs_create_link(parent, name, target); + kernfs_put(target); - if (error) - goto out_put; + if (!IS_ERR(kn)) + return 0; - return 0; - - out_put: - sysfs_put(target_sd); - sysfs_put(sd); - return error; + if (warn && PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(parent, name); + return PTR_ERR(kn); } /** * sysfs_create_link_sd - create symlink to a given object. - * @sd: directory we're creating the link in. + * @kn: directory we're creating the link in. * @target: object we're pointing to. * @name: name of the symlink. */ -int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, +int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target, const char *name) { - return sysfs_do_create_link_sd(sd, target, name, 1); + return sysfs_do_create_link_sd(kn, target, name, 1); } static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, const char *name, int warn) { - struct sysfs_dirent *parent_sd = NULL; + struct kernfs_node *parent = NULL; if (!kobj) - parent_sd = &sysfs_root; + parent = sysfs_root_kn; else - parent_sd = kobj->sd; + parent = kobj->sd; - if (!parent_sd) + if (!parent) return -EFAULT; - return sysfs_do_create_link_sd(parent_sd, target, name, warn); + return sysfs_do_create_link_sd(parent, target, name, warn); } /** @@ -164,10 +128,10 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, * sysfs_remove_dir() for details. */ spin_lock(&sysfs_symlink_target_lock); - if (targ->sd && sysfs_ns_type(kobj->sd)) - ns = targ->sd->s_ns; + if (targ->sd && kernfs_ns_enabled(kobj->sd)) + ns = targ->sd->ns; spin_unlock(&sysfs_symlink_target_lock); - sysfs_hash_and_remove(kobj->sd, name, ns); + kernfs_remove_by_name_ns(kobj->sd, name, ns); } /** @@ -177,14 +141,14 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, */ void sysfs_remove_link(struct kobject *kobj, const char *name) { - struct sysfs_dirent *parent_sd = NULL; + struct kernfs_node *parent = NULL; if (!kobj) - parent_sd = &sysfs_root; + parent = sysfs_root_kn; else - parent_sd = kobj->sd; + parent = kobj->sd; - sysfs_hash_and_remove(parent_sd, name, NULL); + kernfs_remove_by_name(parent, name); } EXPORT_SYMBOL_GPL(sysfs_remove_link); @@ -201,130 +165,33 @@ EXPORT_SYMBOL_GPL(sysfs_remove_link); int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ, const char *old, const char *new, const void *new_ns) { - struct sysfs_dirent *parent_sd, *sd = NULL; + struct kernfs_node *parent, *kn = NULL; const void *old_ns = NULL; int result; if (!kobj) - parent_sd = &sysfs_root; + parent = sysfs_root_kn; else - parent_sd = kobj->sd; + parent = kobj->sd; if (targ->sd) - old_ns = targ->sd->s_ns; + old_ns = targ->sd->ns; result = -ENOENT; - sd = sysfs_get_dirent_ns(parent_sd, old, old_ns); - if (!sd) + kn = kernfs_find_and_get_ns(parent, old, old_ns); + if (!kn) goto out; result = -EINVAL; - if (sysfs_type(sd) != SYSFS_KOBJ_LINK) + if (kernfs_type(kn) != KERNFS_LINK) goto out; - if (sd->s_symlink.target_sd->s_dir.kobj != targ) + if (kn->symlink.target_kn->priv != targ) goto out; - result = sysfs_rename(sd, parent_sd, new, new_ns); + result = kernfs_rename_ns(kn, parent, new, new_ns); out: - sysfs_put(sd); + kernfs_put(kn); return result; } EXPORT_SYMBOL_GPL(sysfs_rename_link_ns); - -static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, - struct sysfs_dirent *target_sd, char *path) -{ - struct sysfs_dirent *base, *sd; - char *s = path; - int len = 0; - - /* go up to the root, stop at the base */ - base = parent_sd; - while (base->s_parent) { - sd = target_sd->s_parent; - while (sd->s_parent && base != sd) - sd = sd->s_parent; - - if (base == sd) - break; - - strcpy(s, "../"); - s += 3; - base = base->s_parent; - } - - /* determine end of target string for reverse fillup */ - sd = target_sd; - while (sd->s_parent && sd != base) { - len += strlen(sd->s_name) + 1; - sd = sd->s_parent; - } - - /* check limits */ - if (len < 2) - return -EINVAL; - len--; - if ((s - path) + len > PATH_MAX) - return -ENAMETOOLONG; - - /* reverse fillup of target string from target to base */ - sd = target_sd; - while (sd->s_parent && sd != base) { - int slen = strlen(sd->s_name); - - len -= slen; - strncpy(s + len, sd->s_name, slen); - if (len) - s[--len] = '/'; - - sd = sd->s_parent; - } - - return 0; -} - -static int sysfs_getlink(struct dentry *dentry, char *path) -{ - struct sysfs_dirent *sd = dentry->d_fsdata; - struct sysfs_dirent *parent_sd = sd->s_parent; - struct sysfs_dirent *target_sd = sd->s_symlink.target_sd; - int error; - - mutex_lock(&sysfs_mutex); - error = sysfs_get_target_path(parent_sd, target_sd, path); - mutex_unlock(&sysfs_mutex); - - return error; -} - -static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - int error = -ENOMEM; - unsigned long page = get_zeroed_page(GFP_KERNEL); - if (page) { - error = sysfs_getlink(dentry, (char *) page); - if (error < 0) - free_page((unsigned long)page); - } - nd_set_link(nd, error ? ERR_PTR(error) : (char *)page); - return NULL; -} - -static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, - void *cookie) -{ - char *page = nd_get_link(nd); - if (!IS_ERR(page)) - free_page((unsigned long)page); -} - -const struct inode_operations sysfs_symlink_inode_operations = { - .setxattr = sysfs_setxattr, - .readlink = generic_readlink, - .follow_link = sysfs_follow_link, - .put_link = sysfs_put_link, - .setattr = sysfs_setattr, - .getattr = sysfs_getattr, - .permission = sysfs_permission, -}; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 0af09fb..0e2f1cc 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -8,248 +8,36 @@ * This file is released under the GPLv2. */ -#include <linux/lockdep.h> -#include <linux/kobject_ns.h> -#include <linux/fs.h> -#include <linux/rbtree.h> +#ifndef __SYSFS_INTERNAL_H +#define __SYSFS_INTERNAL_H -struct sysfs_open_dirent; - -/* type-specific structures for sysfs_dirent->s_* union members */ -struct sysfs_elem_dir { - struct kobject *kobj; - - unsigned long subdirs; - /* children rbtree starts here and goes through sd->s_rb */ - struct rb_root children; -}; - -struct sysfs_elem_symlink { - struct sysfs_dirent *target_sd; -}; - -struct sysfs_elem_attr { - union { - struct attribute *attr; - struct bin_attribute *bin_attr; - }; - struct sysfs_open_dirent *open; -}; - -struct sysfs_inode_attrs { - struct iattr ia_iattr; - void *ia_secdata; - u32 ia_secdata_len; -}; - -/* - * sysfs_dirent - the building block of sysfs hierarchy. Each and - * every sysfs node is represented by single sysfs_dirent. - * - * As long as s_count reference is held, the sysfs_dirent itself is - * accessible. Dereferencing s_elem or any other outer entity - * requires s_active reference. - */ -struct sysfs_dirent { - atomic_t s_count; - atomic_t s_active; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif - struct sysfs_dirent *s_parent; - const char *s_name; - - struct rb_node s_rb; - - union { - struct completion *completion; - struct sysfs_dirent *removed_list; - } u; - - const void *s_ns; /* namespace tag */ - unsigned int s_hash; /* ns + name hash */ - union { - struct sysfs_elem_dir s_dir; - struct sysfs_elem_symlink s_symlink; - struct sysfs_elem_attr s_attr; - }; - - unsigned short s_flags; - umode_t s_mode; - unsigned int s_ino; - struct sysfs_inode_attrs *s_iattr; -}; - -#define SD_DEACTIVATED_BIAS INT_MIN - -#define SYSFS_TYPE_MASK 0x00ff -#define SYSFS_DIR 0x0001 -#define SYSFS_KOBJ_ATTR 0x0002 -#define SYSFS_KOBJ_BIN_ATTR 0x0004 -#define SYSFS_KOBJ_LINK 0x0008 -#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) -#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR) - -/* identify any namespace tag on sysfs_dirents */ -#define SYSFS_NS_TYPE_MASK 0xf00 -#define SYSFS_NS_TYPE_SHIFT 8 - -#define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK) -#define SYSFS_FLAG_REMOVED 0x02000 - -static inline unsigned int sysfs_type(struct sysfs_dirent *sd) -{ - return sd->s_flags & SYSFS_TYPE_MASK; -} - -/* - * Return any namespace tags on this dirent. - * enum kobj_ns_type is defined in linux/kobject.h - */ -static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd) -{ - return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT; -} - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - -#define sysfs_dirent_init_lockdep(sd) \ -do { \ - struct attribute *attr = sd->s_attr.attr; \ - struct lock_class_key *key = attr->key; \ - if (!key) \ - key = &attr->skey; \ - \ - lockdep_init_map(&sd->dep_map, "s_active", key, 0); \ -} while (0) - -/* Test for attributes that want to ignore lockdep for read-locking */ -static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd) -{ - int type = sysfs_type(sd); - - return (type == SYSFS_KOBJ_ATTR || type == SYSFS_KOBJ_BIN_ATTR) && - sd->s_attr.attr->ignore_lockdep; -} - -#else - -#define sysfs_dirent_init_lockdep(sd) do {} while (0) - -static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd) -{ - return true; -} - -#endif - -/* - * Context structure to be used while adding/removing nodes. - */ -struct sysfs_addrm_cxt { - struct sysfs_dirent *removed; -}; +#include <linux/sysfs.h> /* * mount.c */ - -/* - * Each sb is associated with a set of namespace tags (i.e. - * the network namespace of the task which mounted this sysfs - * instance). - */ -struct sysfs_super_info { - void *ns[KOBJ_NS_TYPES]; -}; -#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info)) -extern struct sysfs_dirent sysfs_root; -extern struct kmem_cache *sysfs_dir_cachep; +extern struct kernfs_node *sysfs_root_kn; /* * dir.c */ -extern struct mutex sysfs_mutex; extern spinlock_t sysfs_symlink_target_lock; -extern const struct dentry_operations sysfs_dentry_ops; - -extern const struct file_operations sysfs_dir_operations; -extern const struct inode_operations sysfs_dir_inode_operations; -struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd); -void sysfs_put_active(struct sysfs_dirent *sd); -void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt); -void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name); -int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, - struct sysfs_dirent *parent_sd); -int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, - struct sysfs_dirent *parent_sd); -void sysfs_remove(struct sysfs_dirent *sd); -int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name, - const void *ns); -void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); - -struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, - const unsigned char *name, - const void *ns); -struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type); - -void release_sysfs_dirent(struct sysfs_dirent *sd); - -int sysfs_create_subdir(struct kobject *kobj, const char *name, - struct sysfs_dirent **p_sd); - -int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd, - const char *new_name, const void *new_ns); - -static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) -{ - if (sd) { - WARN_ON(!atomic_read(&sd->s_count)); - atomic_inc(&sd->s_count); - } - return sd; -} -#define sysfs_get(sd) __sysfs_get(sd) - -static inline void __sysfs_put(struct sysfs_dirent *sd) -{ - if (sd && atomic_dec_and_test(&sd->s_count)) - release_sysfs_dirent(sd); -} -#define sysfs_put(sd) __sysfs_put(sd) - -/* - * inode.c - */ -struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd); -void sysfs_evict_inode(struct inode *inode); -int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr); -int sysfs_permission(struct inode *inode, int mask); -int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); -int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat); -int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags); -int sysfs_inode_init(void); +void sysfs_warn_dup(struct kernfs_node *parent, const char *name); /* * file.c */ -extern const struct file_operations sysfs_file_operations; -extern const struct file_operations sysfs_bin_operations; - -int sysfs_add_file(struct sysfs_dirent *dir_sd, - const struct attribute *attr, int type); - -int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, - const struct attribute *attr, int type, +int sysfs_add_file(struct kernfs_node *parent, + const struct attribute *attr, bool is_bin); +int sysfs_add_file_mode_ns(struct kernfs_node *parent, + const struct attribute *attr, bool is_bin, umode_t amode, const void *ns); -void sysfs_unmap_bin_file(struct sysfs_dirent *sd); /* * symlink.c */ -extern const struct inode_operations sysfs_symlink_inode_operations; -int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, +int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target, const char *name); + +#endif /* __SYSFS_INTERNAL_H */ diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c index 739e0a52..5549d69 100644 --- a/fs/xfs/xfs_attr_remote.c +++ b/fs/xfs/xfs_attr_remote.c @@ -110,7 +110,7 @@ xfs_attr3_rmt_verify( if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt)) return false; if (be32_to_cpu(rmt->rm_offset) + - be32_to_cpu(rmt->rm_bytes) >= XATTR_SIZE_MAX) + be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX) return false; if (rmt->rm_owner == 0) return false; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 1394106..82e0dab 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -287,6 +287,7 @@ xfs_bmapi_allocate( INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker); queue_work(xfs_alloc_wq, &args->work); wait_for_completion(&done); + destroy_work_on_stack(&args->work); return args->result; } |