From 74d92abc6143b124db03f0d341f02bde72fba6f5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 22 Apr 2008 19:46:10 -0400 Subject: [PATCH] fix file and descriptor handling in perfmon Races galore... General rule: as soon as it's in descriptor table, it's over; another thread might have started IO on it/dup2() it elsewhere/dup2() something *over* it/etc. fd_install() is the very last step one should take - it's a point of no return. Besides, the damn thing leaked on failure exits... Signed-off-by: Al Viro diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 7fbb51e..c1ad27d 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -867,7 +867,7 @@ pfm_rvfree(void *mem, unsigned long size) } static pfm_context_t * -pfm_context_alloc(void) +pfm_context_alloc(int ctx_flags) { pfm_context_t *ctx; @@ -878,6 +878,46 @@ pfm_context_alloc(void) ctx = kzalloc(sizeof(pfm_context_t), GFP_KERNEL); if (ctx) { DPRINT(("alloc ctx @%p\n", ctx)); + + /* + * init context protection lock + */ + spin_lock_init(&ctx->ctx_lock); + + /* + * context is unloaded + */ + ctx->ctx_state = PFM_CTX_UNLOADED; + + /* + * initialization of context's flags + */ + ctx->ctx_fl_block = (ctx_flags & PFM_FL_NOTIFY_BLOCK) ? 1 : 0; + ctx->ctx_fl_system = (ctx_flags & PFM_FL_SYSTEM_WIDE) ? 1: 0; + ctx->ctx_fl_no_msg = (ctx_flags & PFM_FL_OVFL_NO_MSG) ? 1: 0; + /* + * will move to set properties + * ctx->ctx_fl_excl_idle = (ctx_flags & PFM_FL_EXCL_IDLE) ? 1: 0; + */ + + /* + * init restart semaphore to locked + */ + init_completion(&ctx->ctx_restart_done); + + /* + * activation is used in SMP only + */ + ctx->ctx_last_activation = PFM_INVALID_ACTIVATION; + SET_LAST_CPU(ctx, -1); + + /* + * initialize notification message queue + */ + ctx->ctx_msgq_head = ctx->ctx_msgq_tail = 0; + init_waitqueue_head(&ctx->ctx_msgq_wait); + init_waitqueue_head(&ctx->ctx_zombieq); + } return ctx; } @@ -2165,28 +2205,21 @@ static struct dentry_operations pfmfs_dentry_operations = { }; -static int -pfm_alloc_fd(struct file **cfile) +static struct file * +pfm_alloc_file(pfm_context_t *ctx) { - int fd, ret = 0; - struct file *file = NULL; - struct inode * inode; + struct file *file; + struct inode *inode; + struct dentry *dentry; char name[32]; struct qstr this; - fd = get_unused_fd(); - if (fd < 0) return -ENFILE; - - ret = -ENFILE; - - file = get_empty_filp(); - if (!file) goto out; - /* * allocate a new inode */ inode = new_inode(pfmfs_mnt->mnt_sb); - if (!inode) goto out; + if (!inode) + return ERR_PTR(-ENOMEM); DPRINT(("new inode ino=%ld @%p\n", inode->i_ino, inode)); @@ -2199,59 +2232,28 @@ pfm_alloc_fd(struct file **cfile) this.len = strlen(name); this.hash = inode->i_ino; - ret = -ENOMEM; - /* * allocate a new dcache entry */ - file->f_path.dentry = d_alloc(pfmfs_mnt->mnt_sb->s_root, &this); - if (!file->f_path.dentry) goto out; + dentry = d_alloc(pfmfs_mnt->mnt_sb->s_root, &this); + if (!dentry) { + iput(inode); + return ERR_PTR(-ENOMEM); + } - file->f_path.dentry->d_op = &pfmfs_dentry_operations; + dentry->d_op = &pfmfs_dentry_operations; + d_add(dentry, inode); - d_add(file->f_path.dentry, inode); - file->f_path.mnt = mntget(pfmfs_mnt); - file->f_mapping = inode->i_mapping; + file = alloc_file(pfmfs_mnt, dentry, FMODE_READ, &pfm_file_ops); + if (!file) { + dput(dentry); + return ERR_PTR(-ENFILE); + } - file->f_op = &pfm_file_ops; - file->f_mode = FMODE_READ; file->f_flags = O_RDONLY; - file->f_pos = 0; - - /* - * may have to delay until context is attached? - */ - fd_install(fd, file); - - /* - * the file structure we will use - */ - *cfile = file; - - return fd; -out: - if (file) put_filp(file); - put_unused_fd(fd); - return ret; -} - -static void -pfm_free_fd(int fd, struct file *file) -{ - struct files_struct *files = current->files; - struct fdtable *fdt; + file->private_data = ctx; - /* - * there ie no fd_uninstall(), so we do it here - */ - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - rcu_assign_pointer(fdt->fd[fd], NULL); - spin_unlock(&files->file_lock); - - if (file) - put_filp(file); - put_unused_fd(fd); + return file; } static int @@ -2475,6 +2477,7 @@ pfm_setup_buffer_fmt(struct task_struct *task, struct file *filp, pfm_context_t /* link buffer format and context */ ctx->ctx_buf_fmt = fmt; + ctx->ctx_fl_is_sampling = 1; /* assume record() is defined */ /* * check if buffer format wants to use perfmon buffer allocation/mapping service @@ -2669,78 +2672,45 @@ pfm_context_create(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg { pfarg_context_t *req = (pfarg_context_t *)arg; struct file *filp; + struct path path; int ctx_flags; + int fd; int ret; /* let's check the arguments first */ ret = pfarg_is_sane(current, req); - if (ret < 0) return ret; + if (ret < 0) + return ret; ctx_flags = req->ctx_flags; ret = -ENOMEM; - ctx = pfm_context_alloc(); - if (!ctx) goto error; + fd = get_unused_fd(); + if (fd < 0) + return fd; - ret = pfm_alloc_fd(&filp); - if (ret < 0) goto error_file; + ctx = pfm_context_alloc(ctx_flags); + if (!ctx) + goto error; - req->ctx_fd = ctx->ctx_fd = ret; + filp = pfm_alloc_file(ctx); + if (IS_ERR(filp)) { + ret = PTR_ERR(filp); + goto error_file; + } - /* - * attach context to file - */ - filp->private_data = ctx; + req->ctx_fd = ctx->ctx_fd = fd; /* * does the user want to sample? */ if (pfm_uuid_cmp(req->ctx_smpl_buf_id, pfm_null_uuid)) { ret = pfm_setup_buffer_fmt(current, filp, ctx, ctx_flags, 0, req); - if (ret) goto buffer_error; + if (ret) + goto buffer_error; } - /* - * init context protection lock - */ - spin_lock_init(&ctx->ctx_lock); - - /* - * context is unloaded - */ - ctx->ctx_state = PFM_CTX_UNLOADED; - - /* - * initialization of context's flags - */ - ctx->ctx_fl_block = (ctx_flags & PFM_FL_NOTIFY_BLOCK) ? 1 : 0; - ctx->ctx_fl_system = (ctx_flags & PFM_FL_SYSTEM_WIDE) ? 1: 0; - ctx->ctx_fl_is_sampling = ctx->ctx_buf_fmt ? 1 : 0; /* assume record() is defined */ - ctx->ctx_fl_no_msg = (ctx_flags & PFM_FL_OVFL_NO_MSG) ? 1: 0; - /* - * will move to set properties - * ctx->ctx_fl_excl_idle = (ctx_flags & PFM_FL_EXCL_IDLE) ? 1: 0; - */ - - /* - * init restart semaphore to locked - */ - init_completion(&ctx->ctx_restart_done); - - /* - * activation is used in SMP only - */ - ctx->ctx_last_activation = PFM_INVALID_ACTIVATION; - SET_LAST_CPU(ctx, -1); - - /* - * initialize notification message queue - */ - ctx->ctx_msgq_head = ctx->ctx_msgq_tail = 0; - init_waitqueue_head(&ctx->ctx_msgq_wait); - init_waitqueue_head(&ctx->ctx_zombieq); - DPRINT(("ctx=%p flags=0x%x system=%d notify_block=%d excl_idle=%d no_msg=%d ctx_fd=%d \n", ctx, ctx_flags, @@ -2755,10 +2725,14 @@ pfm_context_create(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg */ pfm_reset_pmu_state(ctx); + fd_install(fd, filp); + return 0; buffer_error: - pfm_free_fd(ctx->ctx_fd, filp); + path = filp->f_path; + put_filp(filp); + path_put(&path); if (ctx->ctx_buf_fmt) { pfm_buf_fmt_exit(ctx->ctx_buf_fmt, current, NULL, regs); @@ -2767,6 +2741,7 @@ error_file: pfm_context_free(ctx); error: + put_unused_fd(fd); return ret; } -- cgit v0.10.2 From bf7da7bcfb38409b4cdea34b0905bdf344f1b36d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 23 Apr 2008 00:49:25 -0400 Subject: [PATCH] remove horrors with irix tty ioctls handling Existing code in there (get_tty(), etc.) is both severely racy *and* pointless: ioctls in question have Linux equivalents and there's no need to play silly buggers in irix_ioctl() - just need to replace arguments and, in case of TIOCGSID, deal with API differences - Linux one expects pid_t __user * while Irix one does unsigned long __user *. BFD... Signed-off-by: Al Viro diff --git a/arch/mips/kernel/irixioctl.c b/arch/mips/kernel/irixioctl.c index 2bde200d..b39bdba 100644 --- a/arch/mips/kernel/irixioctl.c +++ b/arch/mips/kernel/irixioctl.c @@ -27,33 +27,6 @@ struct irix_termios { cc_t c_cc[NCCS]; }; -extern void start_tty(struct tty_struct *tty); -static struct tty_struct *get_tty(int fd) -{ - struct file *filp; - struct tty_struct *ttyp = NULL; - - rcu_read_lock(); - filp = fcheck(fd); - if(filp && filp->private_data) { - ttyp = (struct tty_struct *) filp->private_data; - - if(ttyp->magic != TTY_MAGIC) - ttyp =NULL; - } - rcu_read_unlock(); - return ttyp; -} - -static struct tty_struct *get_real_tty(struct tty_struct *tp) -{ - if (tp->driver->type == TTY_DRIVER_TYPE_PTY && - tp->driver->subtype == PTY_TYPE_MASTER) - return tp->link; - else - return tp; -} - asmlinkage int irix_ioctl(int fd, unsigned long cmd, unsigned long arg) { struct tty_struct *tp, *rtp; @@ -146,34 +119,24 @@ asmlinkage int irix_ioctl(int fd, unsigned long cmd, unsigned long arg) error = sys_ioctl(fd, TIOCNOTTY, arg); break; - case 0x00007416: + case 0x00007416: { + pid_t pid; #ifdef DEBUG_IOCTLS printk("TIOCGSID, %08lx) ", arg); #endif - tp = get_tty(fd); - if(!tp) { - error = -EINVAL; - break; - } - rtp = get_real_tty(tp); -#ifdef DEBUG_IOCTLS - printk("rtp->session=%d ", rtp->session); -#endif - error = put_user(rtp->session, (unsigned long __user *) arg); + old_fs = get_fs(); set_fs(get_ds()); + error = sys_ioctl(fd, TIOCGSID, (unsigned long)&pid); + set_fs(old_fs); + if (!error) + error = put_user(pid, (unsigned long __user *) arg); break; - + } case 0x746e: /* TIOCSTART, same effect as hitting ^Q */ #ifdef DEBUG_IOCTLS printk("TIOCSTART, %08lx) ", arg); #endif - tp = get_tty(fd); - if(!tp) { - error = -EINVAL; - break; - } - rtp = get_real_tty(tp); - start_tty(rtp); + error = sys_ioctl(fd, TCXONC, TCOON); break; case 0x20006968: -- cgit v0.10.2 From a2dcb44c3c5a8151d2d9f6ac8ad0789efcdbe184 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 23 Apr 2008 14:05:15 -0400 Subject: [PATCH] make osf_select() use core_sys_select() ... instead of open-coding it Signed-off-by: Al Viro diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 9fee37e..32ca1b9 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -981,27 +981,18 @@ asmlinkage int osf_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval32 __user *tvp) { - fd_set_bits fds; - char *bits; - size_t size; - long timeout; - int ret = -EINVAL; - struct fdtable *fdt; - int max_fds; - - timeout = MAX_SCHEDULE_TIMEOUT; + s64 timeout = MAX_SCHEDULE_TIMEOUT; if (tvp) { time_t sec, usec; if (!access_ok(VERIFY_READ, tvp, sizeof(*tvp)) || __get_user(sec, &tvp->tv_sec) || __get_user(usec, &tvp->tv_usec)) { - ret = -EFAULT; - goto out_nofds; + return -EFAULT; } if (sec < 0 || usec < 0) - goto out_nofds; + return -EINVAL; if ((unsigned long) sec < MAX_SELECT_SECONDS) { timeout = (usec + 1000000/HZ - 1) / (1000000/HZ); @@ -1009,60 +1000,8 @@ osf_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, } } - rcu_read_lock(); - fdt = files_fdtable(current->files); - max_fds = fdt->max_fds; - rcu_read_unlock(); - if (n < 0 || n > max_fds) - goto out_nofds; - - /* - * We need 6 bitmaps (in/out/ex for both incoming and outgoing), - * since we used fdset we need to allocate memory in units of - * long-words. - */ - ret = -ENOMEM; - size = FDS_BYTES(n); - bits = kmalloc(6 * size, GFP_KERNEL); - if (!bits) - goto out_nofds; - fds.in = (unsigned long *) bits; - fds.out = (unsigned long *) (bits + size); - fds.ex = (unsigned long *) (bits + 2*size); - fds.res_in = (unsigned long *) (bits + 3*size); - fds.res_out = (unsigned long *) (bits + 4*size); - fds.res_ex = (unsigned long *) (bits + 5*size); - - if ((ret = get_fd_set(n, inp->fds_bits, fds.in)) || - (ret = get_fd_set(n, outp->fds_bits, fds.out)) || - (ret = get_fd_set(n, exp->fds_bits, fds.ex))) - goto out; - zero_fd_set(n, fds.res_in); - zero_fd_set(n, fds.res_out); - zero_fd_set(n, fds.res_ex); - - ret = do_select(n, &fds, &timeout); - /* OSF does not copy back the remaining time. */ - - if (ret < 0) - goto out; - if (!ret) { - ret = -ERESTARTNOHAND; - if (signal_pending(current)) - goto out; - ret = 0; - } - - if (set_fd_set(n, inp->fds_bits, fds.res_in) || - set_fd_set(n, outp->fds_bits, fds.res_out) || - set_fd_set(n, exp->fds_bits, fds.res_ex)) - ret = -EFAULT; - - out: - kfree(bits); - out_nofds: - return ret; + return core_sys_select(n, inp, outp, exp, &timeout); } struct rusage32 { diff --git a/fs/select.c b/fs/select.c index 2c29214..80d7038 100644 --- a/fs/select.c +++ b/fs/select.c @@ -298,7 +298,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout) #define MAX_SELECT_SECONDS \ ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1) -static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, +int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s64 *timeout) { fd_set_bits fds; diff --git a/include/linux/poll.h b/include/linux/poll.h index 16d813b..ef45382 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -117,6 +117,8 @@ void zero_fd_set(unsigned long nr, unsigned long *fdset) extern int do_select(int n, fd_set_bits *fds, s64 *timeout); extern int do_sys_poll(struct pollfd __user * ufds, unsigned int nfds, s64 *timeout); +extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, + fd_set __user *exp, s64 *timeout); #endif /* KERNEL */ -- cgit v0.10.2 From 9f3acc3140444a900ab280de942291959f0f615d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 24 Apr 2008 07:44:08 -0400 Subject: [PATCH] split linux/file.h Initial splitoff of the low-level stuff; taken to fdtable.h Signed-off-by: Al Viro diff --git a/arch/mips/kernel/kspd.c b/arch/mips/kernel/kspd.c index 998c4ef..ceb62dc 100644 --- a/arch/mips/kernel/kspd.c +++ b/arch/mips/kernel/kspd.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c index b962c3a..af116aa 100644 --- a/arch/powerpc/platforms/cell/spufs/coredump.c +++ b/arch/powerpc/platforms/cell/spufs/coredump.c @@ -22,6 +22,7 @@ #include #include +#include #include #include #include diff --git a/drivers/char/tty_audit.c b/drivers/char/tty_audit.c index 6342b05..3582f43 100644 --- a/drivers/char/tty_audit.c +++ b/drivers/char/tty_audit.c @@ -11,6 +11,7 @@ #include #include +#include #include struct tty_audit_buf { diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index 1d298c2..49c1a22 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -78,6 +78,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/compat.c b/fs/compat.c index 139dc93..332a869 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/dnotify.c b/fs/dnotify.c index eaecc4c..676073b 100644 --- a/fs/dnotify.c +++ b/fs/dnotify.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include int dir_notify_enable __read_mostly = 1; diff --git a/fs/exec.c b/fs/exec.c index 9f9f931..aeaa979 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -24,6 +24,7 @@ #include #include +#include #include #include #include diff --git a/fs/fcntl.c b/fs/fcntl.c index 3f3ac63..bfd7765 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/file.c b/fs/file.c index 5110acb..f6fbcb4 100644 --- a/fs/file.c +++ b/fs/file.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/file_table.c b/fs/file_table.c index 7a0a9b8..8308422 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/locks.c b/fs/locks.c index 44d9a6a..663c069 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -116,6 +116,7 @@ #include #include +#include #include #include #include diff --git a/fs/open.c b/fs/open.c index 7af1f05..a145008 100644 --- a/fs/open.c +++ b/fs/open.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/proc/array.c b/fs/proc/array.c index c135cbd..dca997a 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/proc/base.c b/fs/proc/base.c index fcf02f2..808cbdc 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/select.c b/fs/select.c index 80d7038..8dda969 100644 --- a/fs/select.c +++ b/fs/select.c @@ -21,6 +21,7 @@ #include #include /* for STICKY_TIMEOUTS */ #include +#include #include #include diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h new file mode 100644 index 0000000..a118f3c --- /dev/null +++ b/include/linux/fdtable.h @@ -0,0 +1,99 @@ +/* + * descriptor table internals; you almost certainly want file.h instead. + */ + +#ifndef __LINUX_FDTABLE_H +#define __LINUX_FDTABLE_H + +#include +#include +#include +#include +#include +#include + +/* + * The default fd array needs to be at least BITS_PER_LONG, + * as this is the granularity returned by copy_fdset(). + */ +#define NR_OPEN_DEFAULT BITS_PER_LONG + +/* + * The embedded_fd_set is a small fd_set, + * suitable for most tasks (which open <= BITS_PER_LONG files) + */ +struct embedded_fd_set { + unsigned long fds_bits[1]; +}; + +struct fdtable { + unsigned int max_fds; + struct file ** fd; /* current fd array */ + fd_set *close_on_exec; + fd_set *open_fds; + struct rcu_head rcu; + struct fdtable *next; +}; + +/* + * Open file table structure + */ +struct files_struct { + /* + * read mostly part + */ + atomic_t count; + struct fdtable *fdt; + struct fdtable fdtab; + /* + * written part on a separate cache line in SMP + */ + spinlock_t file_lock ____cacheline_aligned_in_smp; + int next_fd; + struct embedded_fd_set close_on_exec_init; + struct embedded_fd_set open_fds_init; + struct file * fd_array[NR_OPEN_DEFAULT]; +}; + +#define files_fdtable(files) (rcu_dereference((files)->fdt)) + +extern struct kmem_cache *filp_cachep; + +struct file_operations; +struct vfsmount; +struct dentry; + +extern int expand_files(struct files_struct *, int nr); +extern void free_fdtable_rcu(struct rcu_head *rcu); +extern void __init files_defer_init(void); + +static inline void free_fdtable(struct fdtable *fdt) +{ + call_rcu(&fdt->rcu, free_fdtable_rcu); +} + +static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd) +{ + struct file * file = NULL; + struct fdtable *fdt = files_fdtable(files); + + if (fd < fdt->max_fds) + file = rcu_dereference(fdt->fd[fd]); + return file; +} + +/* + * Check whether the specified fd has an open file. + */ +#define fcheck(fd) fcheck_files(current->files, fd) + +struct task_struct; + +struct files_struct *get_files_struct(struct task_struct *); +void put_files_struct(struct files_struct *fs); +void reset_files_struct(struct files_struct *); +int unshare_files(struct files_struct **); + +extern struct kmem_cache *files_cachep; + +#endif /* __LINUX_FDTABLE_H */ diff --git a/include/linux/file.h b/include/linux/file.h index 69baf5a4..27c64bd 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -5,59 +5,11 @@ #ifndef __LINUX_FILE_H #define __LINUX_FILE_H -#include -#include #include -#include -#include #include +#include -/* - * The default fd array needs to be at least BITS_PER_LONG, - * as this is the granularity returned by copy_fdset(). - */ -#define NR_OPEN_DEFAULT BITS_PER_LONG - -/* - * The embedded_fd_set is a small fd_set, - * suitable for most tasks (which open <= BITS_PER_LONG files) - */ -struct embedded_fd_set { - unsigned long fds_bits[1]; -}; - -struct fdtable { - unsigned int max_fds; - struct file ** fd; /* current fd array */ - fd_set *close_on_exec; - fd_set *open_fds; - struct rcu_head rcu; - struct fdtable *next; -}; - -/* - * Open file table structure - */ -struct files_struct { - /* - * read mostly part - */ - atomic_t count; - struct fdtable *fdt; - struct fdtable fdtab; - /* - * written part on a separate cache line in SMP - */ - spinlock_t file_lock ____cacheline_aligned_in_smp; - int next_fd; - struct embedded_fd_set close_on_exec_init; - struct embedded_fd_set open_fds_init; - struct file * fd_array[NR_OPEN_DEFAULT]; -}; - -#define files_fdtable(files) (rcu_dereference((files)->fdt)) - -extern struct kmem_cache *filp_cachep; +struct file; extern void __fput(struct file *); extern void fput(struct file *); @@ -85,41 +37,7 @@ extern void put_filp(struct file *); extern int get_unused_fd(void); extern int get_unused_fd_flags(int flags); extern void put_unused_fd(unsigned int fd); -struct kmem_cache; - -extern int expand_files(struct files_struct *, int nr); -extern void free_fdtable_rcu(struct rcu_head *rcu); -extern void __init files_defer_init(void); - -static inline void free_fdtable(struct fdtable *fdt) -{ - call_rcu(&fdt->rcu, free_fdtable_rcu); -} - -static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd) -{ - struct file * file = NULL; - struct fdtable *fdt = files_fdtable(files); - - if (fd < fdt->max_fds) - file = rcu_dereference(fdt->fd[fd]); - return file; -} - -/* - * Check whether the specified fd has an open file. - */ -#define fcheck(fd) fcheck_files(current->files, fd) extern void fd_install(unsigned int fd, struct file *file); -struct task_struct; - -struct files_struct *get_files_struct(struct task_struct *); -void put_files_struct(struct files_struct *fs); -void reset_files_struct(struct files_struct *); -int unshare_files(struct files_struct **); - -extern struct kmem_cache *files_cachep; - #endif /* __LINUX_FILE_H */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index bf6b8a6..b24c287 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -1,7 +1,7 @@ #ifndef _LINUX__INIT_TASK_H #define _LINUX__INIT_TASK_H -#include +#include #include #include #include diff --git a/kernel/exit.c b/kernel/exit.c index d3ad546..1510f78 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 2bb675a..933e60e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/kmod.c b/kernel/kmod.c index e276404..8df97d3 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 1b50a6e..1c864c0 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include -- cgit v0.10.2 From 2030a42cecd4dd1985a2ab03e25f3cd6106a5ca8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 23 Feb 2008 06:46:49 -0500 Subject: [PATCH] sanitize anon_inode_getfd() a) none of the callers even looks at inode or file returned by anon_inode_getfd() b) any caller that would try to look at those would be racy, since by the time it returns we might have raced with close() from another thread and that file would be pining for fjords. Signed-off-by: Al Viro diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index f42be06..977ef20 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -57,9 +57,6 @@ static struct dentry_operations anon_inodefs_dentry_operations = { * anonymous inode, and a dentry that describe the "class" * of the file * - * @pfd: [out] pointer to the file descriptor - * @dpinode: [out] pointer to the inode - * @pfile: [out] pointer to the file struct * @name: [in] name of the "class" of the new file * @fops [in] file operations for the new file * @priv [in] private data for the new file (will be file's private_data) @@ -68,10 +65,9 @@ static struct dentry_operations anon_inodefs_dentry_operations = { * that do not need to have a full-fledged inode in order to operate correctly. * All the files created with anon_inode_getfd() will share a single inode, * hence saving memory and avoiding code duplication for the file/inode/dentry - * setup. + * setup. Returns new descriptor or -error. */ -int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile, - const char *name, const struct file_operations *fops, +int anon_inode_getfd(const char *name, const struct file_operations *fops, void *priv) { struct qstr this; @@ -125,10 +121,7 @@ int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile, fd_install(fd, file); - *pfd = fd; - *pinode = anon_inode_inode; - *pfile = file; - return 0; + return fd; err_dput: dput(dentry); diff --git a/fs/eventfd.c b/fs/eventfd.c index a9f130c..343942d 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -200,10 +200,8 @@ struct file *eventfd_fget(int fd) asmlinkage long sys_eventfd(unsigned int count) { - int error, fd; + int fd; struct eventfd_ctx *ctx; - struct file *file; - struct inode *inode; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -216,12 +214,9 @@ asmlinkage long sys_eventfd(unsigned int count) * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. */ - error = anon_inode_getfd(&fd, &inode, &file, "[eventfd]", - &eventfd_fops, ctx); - if (!error) - return fd; - - kfree(ctx); - return error; + fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx); + if (fd < 0) + kfree(ctx); + return fd; } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 221086f..990c01d 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1050,8 +1050,6 @@ asmlinkage long sys_epoll_create(int size) { int error, fd = -1; struct eventpoll *ep; - struct inode *inode; - struct file *file; DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n", current, size)); @@ -1061,29 +1059,24 @@ asmlinkage long sys_epoll_create(int size) * structure ( "struct eventpoll" ). */ error = -EINVAL; - if (size <= 0 || (error = ep_alloc(&ep)) != 0) + if (size <= 0 || (error = ep_alloc(&ep)) < 0) { + fd = error; goto error_return; + } /* * Creates all the items needed to setup an eventpoll file. That is, - * a file structure, and inode and a free file descriptor. + * a file structure and a free file descriptor. */ - error = anon_inode_getfd(&fd, &inode, &file, "[eventpoll]", - &eventpoll_fops, ep); - if (error) - goto error_free; + fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep); + if (fd < 0) + ep_free(ep); +error_return: DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", current, size, fd)); return fd; - -error_free: - ep_free(ep); -error_return: - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", - current, size, error)); - return error; } /* diff --git a/fs/signalfd.c b/fs/signalfd.c index 8ead0db..6197256 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -207,11 +207,8 @@ static const struct file_operations signalfd_fops = { asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask) { - int error; sigset_t sigmask; struct signalfd_ctx *ctx; - struct file *file; - struct inode *inode; if (sizemask != sizeof(sigset_t) || copy_from_user(&sigmask, user_mask, sizeof(sigmask))) @@ -230,12 +227,11 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. */ - error = anon_inode_getfd(&ufd, &inode, &file, "[signalfd]", - &signalfd_fops, ctx); - if (error) - goto err_fdalloc; + ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx); + if (ufd < 0) + kfree(ctx); } else { - file = fget(ufd); + struct file *file = fget(ufd); if (!file) return -EBADF; ctx = file->private_data; @@ -252,9 +248,4 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas } return ufd; - -err_fdalloc: - kfree(ctx); - return error; } - diff --git a/fs/timerfd.c b/fs/timerfd.c index 5400524..d87d354 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -181,10 +181,8 @@ static struct file *timerfd_fget(int fd) asmlinkage long sys_timerfd_create(int clockid, int flags) { - int error, ufd; + int ufd; struct timerfd_ctx *ctx; - struct file *file; - struct inode *inode; if (flags) return -EINVAL; @@ -200,12 +198,9 @@ asmlinkage long sys_timerfd_create(int clockid, int flags) ctx->clockid = clockid; hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); - error = anon_inode_getfd(&ufd, &inode, &file, "[timerfd]", - &timerfd_fops, ctx); - if (error) { + ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx); + if (ufd < 0) kfree(ctx); - return error; - } return ufd; } diff --git a/include/linux/anon_inodes.h b/include/linux/anon_inodes.h index b2e1ba3..6129e58 100644 --- a/include/linux/anon_inodes.h +++ b/include/linux/anon_inodes.h @@ -8,8 +8,7 @@ #ifndef _LINUX_ANON_INODES_H #define _LINUX_ANON_INODES_H -int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile, - const char *name, const struct file_operations *fops, +int anon_inode_getfd(const char *name, const struct file_operations *fops, void *priv); #endif /* _LINUX_ANON_INODES_H */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c82cf15..e89338e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -834,16 +834,9 @@ static const struct file_operations kvm_vcpu_fops = { */ static int create_vcpu_fd(struct kvm_vcpu *vcpu) { - int fd, r; - struct inode *inode; - struct file *file; - - r = anon_inode_getfd(&fd, &inode, &file, - "kvm-vcpu", &kvm_vcpu_fops, vcpu); - if (r) { + int fd = anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu); + if (fd < 0) kvm_put_kvm(vcpu->kvm); - return r; - } return fd; } @@ -1168,19 +1161,15 @@ static const struct file_operations kvm_vm_fops = { static int kvm_dev_ioctl_create_vm(void) { - int fd, r; - struct inode *inode; - struct file *file; + int fd; struct kvm *kvm; kvm = kvm_create_vm(); if (IS_ERR(kvm)) return PTR_ERR(kvm); - r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm); - if (r) { + fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm); + if (fd < 0) kvm_put_kvm(kvm); - return r; - } return fd; } -- cgit v0.10.2 From 5c598b3428c372a1209597cee99a70da20625876 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 27 Apr 2008 20:04:15 -0400 Subject: [PATCH] fix sysctl_nr_open bugs * if luser with root sets it to something that is not a multiple of BITS_PER_LONG, the system is screwed. * if it gets decreased at the wrong time, we can get expand_files() returning success and _not_ increasing the size of table as asked. Signed-off-by: Al Viro diff --git a/fs/file.c b/fs/file.c index f6fbcb4..4c6f0ea 100644 --- a/fs/file.c +++ b/fs/file.c @@ -150,8 +150,16 @@ static struct fdtable * alloc_fdtable(unsigned int nr) nr /= (1024 / sizeof(struct file *)); nr = roundup_pow_of_two(nr + 1); nr *= (1024 / sizeof(struct file *)); - if (nr > sysctl_nr_open) - nr = sysctl_nr_open; + /* + * Note that this can drive nr *below* what we had passed if sysctl_nr_open + * had been set lower between the check in expand_files() and here. Deal + * with that in caller, it's cheaper that way. + * + * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise + * bitmaps handling below becomes unpleasant, to put it mildly... + */ + if (unlikely(nr > sysctl_nr_open)) + nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL); if (!fdt) @@ -200,6 +208,16 @@ static int expand_fdtable(struct files_struct *files, int nr) if (!new_fdt) return -ENOMEM; /* + * extremely unlikely race - sysctl_nr_open decreased between the check in + * caller and alloc_fdtable(). Cheaper to catch it here... + */ + if (unlikely(new_fdt->max_fds <= nr)) { + free_fdarr(new_fdt); + free_fdset(new_fdt); + kfree(new_fdt); + return -EMFILE; + } + /* * Check again since another task may have expanded the fd table while * we dropped the lock */ -- cgit v0.10.2