Diffstat (limited to 'ipc')
-rw-r--r--  ipc/Makefile         |   2
-rw-r--r--  ipc/ipc_sysctl.c     |  68
-rw-r--r--  ipc/ipcns_notifier.c |  82
-rw-r--r--  ipc/mqueue.c         |   3
-rw-r--r--  ipc/msg.c            | 239
-rw-r--r--  ipc/namespace.c      |  26
-rw-r--r--  ipc/sem.c            | 159
-rw-r--r--  ipc/shm.c            | 186
-rw-r--r--  ipc/util.c           | 131
-rw-r--r--  ipc/util.h           |   6
10 files changed, 533 insertions(+), 369 deletions(-)
diff --git a/ipc/Makefile b/ipc/Makefile
index 5fc5e33..65c3843 100644
--- a/ipc/Makefile
+++ b/ipc/Makefile
@@ -3,7 +3,7 @@
 #
 
 obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o
-obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o
+obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o ipcns_notifier.o
 obj-$(CONFIG_SYSVIPC_SYSCTL) += ipc_sysctl.o
 obj_mq-$(CONFIG_COMPAT) += compat_mq.o
 obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y)
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 7f4235b..d349746 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -15,6 +15,8 @@
 #include <linux/sysctl.h>
 #include <linux/uaccess.h>
 #include <linux/ipc_namespace.h>
+#include <linux/msg.h>
+#include "util.h"
 
 static void *get_ipc(ctl_table *table)
 {
@@ -24,6 +26,27 @@ static void *get_ipc(ctl_table *table)
 	return which;
 }
 
+/*
+ * Routine that is called when a tunable has successfully been changed by
+ * hand and it has a callback routine registered on the ipc namespace notifier
+ * chain: we don't want such tunables to be recomputed anymore upon memory
+ * add/remove or ipc namespace creation/removal.
+ * They can come back to a recomputable state by being set to a <0 value.
+ */
+static void tunable_set_callback(int val)
+{
+	if (val >= 0)
+		unregister_ipcns_notifier(current->nsproxy->ipc_ns);
+	else {
+		/*
+		 * Re-enable automatic recomputing only if not already
+		 * enabled.
+		 */
+		recompute_msgmni(current->nsproxy->ipc_ns);
+		cond_register_ipcns_notifier(current->nsproxy->ipc_ns);
+	}
+}
+
 #ifdef CONFIG_PROC_FS
 static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
 	void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -35,6 +58,24 @@ static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
 	return proc_dointvec(&ipc_table, write, filp, buffer, lenp, ppos);
 }
 
+static int proc_ipc_callback_dointvec(ctl_table *table, int write,
+	struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table ipc_table;
+	size_t lenp_bef = *lenp;
+	int rc;
+
+	memcpy(&ipc_table, table, sizeof(ipc_table));
+	ipc_table.data = get_ipc(table);
+
+	rc = proc_dointvec(&ipc_table, write, filp, buffer, lenp, ppos);
+
+	if (write && !rc && lenp_bef == *lenp)
+		tunable_set_callback(*((int *)(ipc_table.data)));
+
+	return rc;
+}
+
 static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
 	struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -49,6 +90,7 @@ static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
 #else
 #define proc_ipc_doulongvec_minmax NULL
 #define proc_ipc_dointvec NULL
+#define proc_ipc_callback_dointvec NULL
 #endif
 
 #ifdef CONFIG_SYSCTL_SYSCALL
@@ -90,8 +132,30 @@ static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
 	}
 	return 1;
 }
+
+static int sysctl_ipc_registered_data(ctl_table *table, int __user *name,
+		int nlen, void __user *oldval, size_t __user *oldlenp,
+		void __user *newval, size_t newlen)
+{
+	int rc;
+
+	rc = sysctl_ipc_data(table, name, nlen, oldval, oldlenp, newval,
+		newlen);
+
+	if (newval && newlen && rc > 0) {
+		/*
+		 * Tunable has successfully been changed from userland
+		 */
+		int *data = get_ipc(table);
+
+		tunable_set_callback(*data);
+	}
+
+	return rc;
+}
 #else
 #define sysctl_ipc_data NULL
+#define sysctl_ipc_registered_data NULL
 #endif
 
 static struct ctl_table ipc_kern_table[] = {
@@ -137,8 +201,8 @@ static struct ctl_table ipc_kern_table[] = {
 		.data		= &init_ipc_ns.msg_ctlmni,
 		.maxlen		= sizeof (init_ipc_ns.msg_ctlmni),
 		.mode		= 0644,
-		.proc_handler	= proc_ipc_dointvec,
-		.strategy	= sysctl_ipc_data,
+		.proc_handler	= proc_ipc_callback_dointvec,
+		.strategy	= sysctl_ipc_registered_data,
 	},
 	{
 		.ctl_name	= KERN_MSGMNB,
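Taken together, proc_ipc_callback_dointvec() and sysctl_ipc_registered_data() make msgmni a self-unregistering tunable: writing a value >= 0 pins it and drops the namespace from the ipcns notifier chain, while writing a negative value recomputes it and re-registers the namespace. A minimal user-space sketch of that workflow, assuming the standard /proc/sys/kernel/msgmni path (error handling elided, run as root):

	/* Sketch: pin msgmni by hand, then give it back to auto-tuning. */
	#include <stdio.h>

	static void set_msgmni(const char *val)
	{
		FILE *f = fopen("/proc/sys/kernel/msgmni", "w");

		if (f) {
			fprintf(f, "%s\n", val);
			fclose(f);
		}
	}

	int main(void)
	{
		set_msgmni("2048");	/* >= 0: pinned, notifier unregistered */
		set_msgmni("-1");	/* < 0: recomputed, notifier re-registered */
		return 0;
	}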
diff --git a/ipc/ipcns_notifier.c b/ipc/ipcns_notifier.c
new file mode 100644
index 0000000..70ff091
--- /dev/null
+++ b/ipc/ipcns_notifier.c
@@ -0,0 +1,82 @@
+/*
+ * linux/ipc/ipcns_notifier.c
+ * Copyright (C) 2007 BULL SA. Nadia Derbey
+ *
+ * Notification mechanism for ipc namespaces:
+ * The callback routine registered in the memory chain invokes the ipcns
+ * notifier chain with the IPCNS_MEMCHANGED event.
+ * Each callback routine registered in the ipcns namespace recomputes msgmni
+ * for the owning namespace.
+ */
+
+#include <linux/msg.h>
+#include <linux/rcupdate.h>
+#include <linux/notifier.h>
+#include <linux/nsproxy.h>
+#include <linux/ipc_namespace.h>
+
+#include "util.h"
+
+
+
+static BLOCKING_NOTIFIER_HEAD(ipcns_chain);
+
+
+static int ipcns_callback(struct notifier_block *self,
+				unsigned long action, void *arg)
+{
+	struct ipc_namespace *ns;
+
+	switch (action) {
+	case IPCNS_MEMCHANGED:		/* amount of lowmem has changed */
+	case IPCNS_CREATED:
+	case IPCNS_REMOVED:
+		/*
+		 * It's time to recompute msgmni
+		 */
+		ns = container_of(self, struct ipc_namespace, ipcns_nb);
+		/*
+		 * No need to get a reference on the ns: the 1st job of
+		 * free_ipc_ns() is to unregister the callback routine.
+		 * blocking_notifier_chain_unregister takes the wr lock to do
+		 * it.
+		 * When this callback routine is called the rd lock is held by
+		 * blocking_notifier_call_chain.
+		 * So the ipc ns cannot be freed while we are here.
+		 */
+		recompute_msgmni(ns);
+		break;
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+int register_ipcns_notifier(struct ipc_namespace *ns)
+{
+	memset(&ns->ipcns_nb, 0, sizeof(ns->ipcns_nb));
+	ns->ipcns_nb.notifier_call = ipcns_callback;
+	ns->ipcns_nb.priority = IPCNS_CALLBACK_PRI;
+	return blocking_notifier_chain_register(&ipcns_chain, &ns->ipcns_nb);
+}
+
+int cond_register_ipcns_notifier(struct ipc_namespace *ns)
+{
+	memset(&ns->ipcns_nb, 0, sizeof(ns->ipcns_nb));
+	ns->ipcns_nb.notifier_call = ipcns_callback;
+	ns->ipcns_nb.priority = IPCNS_CALLBACK_PRI;
+	return blocking_notifier_chain_cond_register(&ipcns_chain,
+							&ns->ipcns_nb);
+}
+
+int unregister_ipcns_notifier(struct ipc_namespace *ns)
+{
+	return blocking_notifier_chain_unregister(&ipcns_chain,
+						&ns->ipcns_nb);
+}
+
+int ipcns_notify(unsigned long val)
+{
+	return blocking_notifier_call_chain(&ipcns_chain, val, NULL);
+}
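The new file relies on the embedded-notifier_block pattern: each ipc_namespace carries its own notifier_block, and the callback recovers the owning namespace with container_of(). An illustrative kernel-style sketch of that same pattern with a hypothetical foo_ns structure (not part of the patch):

	#include <linux/kernel.h>
	#include <linux/notifier.h>

	struct foo_ns {				/* hypothetical ipc_namespace analogue */
		int tunable;
		struct notifier_block nb;	/* one embedded block per instance */
	};

	static BLOCKING_NOTIFIER_HEAD(foo_chain);

	static int foo_callback(struct notifier_block *self,
				unsigned long action, void *arg)
	{
		/* Recover the owner from the embedded notifier_block. */
		struct foo_ns *ns = container_of(self, struct foo_ns, nb);

		ns->tunable++;			/* stand-in for recompute_msgmni() */
		return NOTIFY_OK;
	}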
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 94fd3b0..b3b69fd 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -673,7 +673,7 @@ asmlinkage long sys_mq_open(const char __user *u_name, int oflag, mode_t mode,
 	if (IS_ERR(name = getname(u_name)))
 		return PTR_ERR(name);
 
-	fd = get_unused_fd();
+	fd = get_unused_fd_flags(O_CLOEXEC);
 	if (fd < 0)
 		goto out_putname;
 
@@ -709,7 +709,6 @@ asmlinkage long sys_mq_open(const char __user *u_name, int oflag, mode_t mode,
 		goto out_putfd;
 	}
 
-	set_close_on_exec(fd, 1);
 	fd_install(fd, filp);
 	goto out_upsem;
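The mqueue.c hunks replace the racy two-step setup (allocate the fd, then mark it close-on-exec) with a descriptor that is born with O_CLOEXEC, closing the window in which a concurrent fork()/exec() could leak it. A small user-space check, assuming a Linux mqd_t is a plain file descriptor (compile with -lrt):

	#include <fcntl.h>
	#include <mqueue.h>
	#include <stdio.h>

	int main(void)
	{
		mqd_t q = mq_open("/cloexec_demo", O_CREAT | O_RDWR, 0600, NULL);

		if (q == (mqd_t)-1)
			return 1;
		/* With the patch, FD_CLOEXEC is already set at creation. */
		printf("FD_CLOEXEC: %d\n", fcntl(q, F_GETFD) & FD_CLOEXEC);
		mq_close(q);
		mq_unlink("/cloexec_demo");
		return 0;
	}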
diff --git a/ipc/msg.c b/ipc/msg.c
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -27,6 +27,7 @@
 #include <linux/msg.h>
 #include <linux/spinlock.h>
 #include <linux/init.h>
+#include <linux/mm.h>
 #include <linux/proc_fs.h>
 #include <linux/list.h>
 #include <linux/security.h>
@@ -70,7 +71,6 @@ struct msg_sender {
 #define msg_ids(ns)	((ns)->ids[IPC_MSG_IDS])
 
 #define msg_unlock(msq)	ipc_unlock(&(msq)->q_perm)
-#define msg_buildid(id, seq)	ipc_buildid(id, seq)
 
 static void freeque(struct ipc_namespace *, struct kern_ipc_perm *);
 static int newque(struct ipc_namespace *, struct ipc_params *);
@@ -78,11 +78,49 @@ static int newque(struct ipc_namespace *, struct ipc_params *);
 static int sysvipc_msg_proc_show(struct seq_file *s, void *it);
 #endif
 
+/*
+ * Scale msgmni with the available lowmem size: the memory dedicated to msg
+ * queues should occupy at most 1/MSG_MEM_SCALE of lowmem.
+ * Also take into account the number of nsproxies created so far.
+ * This should be done staying within the (MSGMNI , IPCMNI/nr_ipc_ns) range.
+ */
+void recompute_msgmni(struct ipc_namespace *ns)
+{
+	struct sysinfo i;
+	unsigned long allowed;
+	int nb_ns;
+
+	si_meminfo(&i);
+	allowed = (((i.totalram - i.totalhigh) / MSG_MEM_SCALE) * i.mem_unit)
+		/ MSGMNB;
+	nb_ns = atomic_read(&nr_ipc_ns);
+	allowed /= nb_ns;
+
+	if (allowed < MSGMNI) {
+		ns->msg_ctlmni = MSGMNI;
+		goto out_callback;
+	}
+
+	if (allowed > IPCMNI / nb_ns) {
+		ns->msg_ctlmni = IPCMNI / nb_ns;
+		goto out_callback;
+	}
+
+	ns->msg_ctlmni = allowed;
+
+out_callback:
+
+	printk(KERN_INFO "msgmni has been set to %d for ipc namespace %p\n",
+		ns->msg_ctlmni, ns);
+}
+
 void msg_init_ns(struct ipc_namespace *ns)
 {
 	ns->msg_ctlmax = MSGMAX;
 	ns->msg_ctlmnb = MSGMNB;
-	ns->msg_ctlmni = MSGMNI;
+
+	recompute_msgmni(ns);
+
 	atomic_set(&ns->msg_bytes, 0);
 	atomic_set(&ns->msg_hdrs, 0);
 	ipc_init_ids(&ns->ids[IPC_MSG_IDS]);
@@ -104,21 +142,6 @@ void __init msg_init(void)
 }
 
 /*
- * This routine is called in the paths where the rw_mutex is held to protect
- * access to the idr tree.
- */
-static inline struct msg_queue *msg_lock_check_down(struct ipc_namespace *ns,
-						int id)
-{
-	struct kern_ipc_perm *ipcp = ipc_lock_check_down(&msg_ids(ns), id);
-
-	if (IS_ERR(ipcp))
-		return (struct msg_queue *)ipcp;
-
-	return container_of(ipcp, struct msg_queue, q_perm);
-}
-
-/*
  * msg_lock_(check_) routines are called in the paths where the rw_mutex
  * is not held.
  */
@@ -186,7 +209,6 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params)
 		return id;
 	}
 
-	msq->q_perm.id = msg_buildid(id, msq->q_perm.seq);
 	msq->q_stime = msq->q_rtime = 0;
 	msq->q_ctime = get_seconds();
 	msq->q_cbytes = msq->q_qnum = 0;
@@ -324,19 +346,19 @@ copy_msqid_to_user(void __user *buf, struct msqid64_ds *in, int version)
 		out.msg_rtime	= in->msg_rtime;
 		out.msg_ctime	= in->msg_ctime;
 
-		if (in->msg_cbytes > USHRT_MAX)
-			out.msg_cbytes	= USHRT_MAX;
+		if (in->msg_cbytes > USHORT_MAX)
+			out.msg_cbytes	= USHORT_MAX;
 		else
 			out.msg_cbytes	= in->msg_cbytes;
 		out.msg_lcbytes	= in->msg_cbytes;
 
-		if (in->msg_qnum > USHRT_MAX)
-			out.msg_qnum	= USHRT_MAX;
+		if (in->msg_qnum > USHORT_MAX)
+			out.msg_qnum	= USHORT_MAX;
 		else
 			out.msg_qnum	= in->msg_qnum;
 
-		if (in->msg_qbytes > USHRT_MAX)
-			out.msg_qbytes	= USHRT_MAX;
+		if (in->msg_qbytes > USHORT_MAX)
+			out.msg_qbytes	= USHORT_MAX;
 		else
 			out.msg_qbytes	= in->msg_qbytes;
 		out.msg_lqbytes	= in->msg_qbytes;
@@ -351,31 +373,14 @@ copy_msqid_to_user(void __user *buf, struct msqid64_ds *in, int version)
 	}
 }
 
-struct msq_setbuf {
-	unsigned long	qbytes;
-	uid_t		uid;
-	gid_t		gid;
-	mode_t		mode;
-};
-
 static inline unsigned long
-copy_msqid_from_user(struct msq_setbuf *out, void __user *buf, int version)
+copy_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version)
 {
 	switch(version) {
 	case IPC_64:
-	{
-		struct msqid64_ds tbuf;
-
-		if (copy_from_user(&tbuf, buf, sizeof(tbuf)))
+		if (copy_from_user(out, buf, sizeof(*out)))
 			return -EFAULT;
-
-		out->qbytes	= tbuf.msg_qbytes;
-		out->uid	= tbuf.msg_perm.uid;
-		out->gid	= tbuf.msg_perm.gid;
-		out->mode	= tbuf.msg_perm.mode;
-
 		return 0;
-	}
 	case IPC_OLD:
 	{
 		struct msqid_ds tbuf_old;
@@ -383,14 +388,14 @@ copy_msqid_from_user(struct msq_setbuf *out, void __user *buf, int version)
 		if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
 			return -EFAULT;
 
-		out->uid	= tbuf_old.msg_perm.uid;
-		out->gid	= tbuf_old.msg_perm.gid;
-		out->mode	= tbuf_old.msg_perm.mode;
+		out->msg_perm.uid	= tbuf_old.msg_perm.uid;
+		out->msg_perm.gid	= tbuf_old.msg_perm.gid;
+		out->msg_perm.mode	= tbuf_old.msg_perm.mode;
 
 		if (tbuf_old.msg_qbytes == 0)
-			out->qbytes	= tbuf_old.msg_lqbytes;
+			out->msg_qbytes	= tbuf_old.msg_lqbytes;
 		else
-			out->qbytes	= tbuf_old.msg_qbytes;
+			out->msg_qbytes	= tbuf_old.msg_qbytes;
 
 		return 0;
 	}
@@ -399,10 +404,71 @@ copy_msqid_from_user(struct msq_setbuf *out, void __
 	}
 }
 
-asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf)
+/*
+ * This function handles some msgctl commands which require the rw_mutex
+ * to be held in write mode.
+ * NOTE: no locks must be held, the rw_mutex is taken inside this function.
+ */
+static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
+		       struct msqid_ds __user *buf, int version)
 {
 	struct kern_ipc_perm *ipcp;
-	struct msq_setbuf uninitialized_var(setbuf);
+	struct msqid64_ds msqid64;
+	struct msg_queue *msq;
+	int err;
+
+	if (cmd == IPC_SET) {
+		if (copy_msqid_from_user(&msqid64, buf, version))
+			return -EFAULT;
+	}
+
+	ipcp = ipcctl_pre_down(&msg_ids(ns), msqid, cmd,
+			       &msqid64.msg_perm, msqid64.msg_qbytes);
+	if (IS_ERR(ipcp))
+		return PTR_ERR(ipcp);
+
+	msq = container_of(ipcp, struct msg_queue, q_perm);
+
+	err = security_msg_queue_msgctl(msq, cmd);
+	if (err)
+		goto out_unlock;
+
+	switch (cmd) {
+	case IPC_RMID:
+		freeque(ns, ipcp);
+		goto out_up;
+	case IPC_SET:
+		if (msqid64.msg_qbytes > ns->msg_ctlmnb &&
+		    !capable(CAP_SYS_RESOURCE)) {
+			err = -EPERM;
+			goto out_unlock;
+		}
+
+		msq->q_qbytes = msqid64.msg_qbytes;
+
+		ipc_update_perm(&msqid64.msg_perm, ipcp);
+		msq->q_ctime = get_seconds();
+		/* sleeping receivers might be excluded by
+		 * stricter permissions.
+		 */
+		expunge_all(msq, -EAGAIN);
+		/* sleeping senders might be able to send
+		 * due to a larger queue size.
+		 */
+		ss_wakeup(&msq->q_senders, 0);
+		break;
+	default:
+		err = -EINVAL;
+	}
+out_unlock:
+	msg_unlock(msq);
+out_up:
+	up_write(&msg_ids(ns).rw_mutex);
+	return err;
+}
+
+asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf)
+{
 	struct msg_queue *msq;
 	int err, version;
 	struct ipc_namespace *ns;
@@ -498,82 +564,13 @@ asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf)
 		return success_return;
 	}
 	case IPC_SET:
-		if (!buf)
-			return -EFAULT;
-		if (copy_msqid_from_user(&setbuf, buf, version))
-			return -EFAULT;
-		break;
 	case IPC_RMID:
-		break;
+		err = msgctl_down(ns, msqid, cmd, buf, version);
+		return err;
 	default:
 		return -EINVAL;
 	}
 
-	down_write(&msg_ids(ns).rw_mutex);
-	msq = msg_lock_check_down(ns, msqid);
-	if (IS_ERR(msq)) {
-		err = PTR_ERR(msq);
-		goto out_up;
-	}
-
-	ipcp = &msq->q_perm;
-
-	err = audit_ipc_obj(ipcp);
-	if (err)
-		goto out_unlock_up;
-	if (cmd == IPC_SET) {
-		err = audit_ipc_set_perm(setbuf.qbytes, setbuf.uid, setbuf.gid,
-					 setbuf.mode);
-		if (err)
-			goto out_unlock_up;
-	}
-
-	err = -EPERM;
-	if (current->euid != ipcp->cuid &&
-	    current->euid != ipcp->uid && !capable(CAP_SYS_ADMIN))
-		/* We _could_ check for CAP_CHOWN above, but we don't */
-		goto out_unlock_up;
-
-	err = security_msg_queue_msgctl(msq, cmd);
-	if (err)
-		goto out_unlock_up;
-
-	switch (cmd) {
-	case IPC_SET:
-	{
-		err = -EPERM;
-		if (setbuf.qbytes > ns->msg_ctlmnb && !capable(CAP_SYS_RESOURCE))
-			goto out_unlock_up;
-
-		msq->q_qbytes = setbuf.qbytes;
-
-		ipcp->uid = setbuf.uid;
-		ipcp->gid = setbuf.gid;
-		ipcp->mode = (ipcp->mode & ~S_IRWXUGO) |
-			(S_IRWXUGO & setbuf.mode);
-		msq->q_ctime = get_seconds();
-		/* sleeping receivers might be excluded by
-		 * stricter permissions.
-		 */
-		expunge_all(msq, -EAGAIN);
-		/* sleeping senders might be able to send
-		 * due to a larger queue size.
-		 */
-		ss_wakeup(&msq->q_senders, 0);
-		msg_unlock(msq);
-		break;
-	}
-	case IPC_RMID:
-		freeque(ns, &msq->q_perm);
-		break;
-	}
-	err = 0;
-out_up:
-	up_write(&msg_ids(ns).rw_mutex);
-	return err;
-out_unlock_up:
-	msg_unlock(msq);
-	goto out_up;
 out_unlock:
 	msg_unlock(msq);
 	return err;
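For a feel of the numbers recompute_msgmni() produces: with the constants this tree appears to use (MSG_MEM_SCALE 32 and MSGMNB 16384, with MSGMNI 16 and IPCMNI 32768 as the clamp bounds; values assumed here, see include/linux/msg.h and include/linux/ipc.h), 512 MiB of lowmem and a single namespace yield (512 MiB / 32) / 16384 = 1024 message queue ids. A stand-alone sketch of the same clamp:

	/* Worked example of the clamp in recompute_msgmni(); the four
	 * constants below are assumptions taken from the same tree. */
	#include <stdio.h>

	#define MSG_MEM_SCALE	32
	#define MSGMNB		16384
	#define MSGMNI		16
	#define IPCMNI		32768

	static int msgmni_for(unsigned long lowmem_bytes, int nr_ipc_ns)
	{
		unsigned long allowed = (lowmem_bytes / MSG_MEM_SCALE) / MSGMNB;

		allowed /= nr_ipc_ns;
		if (allowed < MSGMNI)
			return MSGMNI;			/* never below the old static default */
		if (allowed > IPCMNI / nr_ipc_ns)
			return IPCMNI / nr_ipc_ns;	/* never above the id-space share */
		return allowed;
	}

	int main(void)
	{
		/* 512 MiB of lowmem, 1 namespace -> (512M/32)/16384 = 1024 */
		printf("%d\n", msgmni_for(512UL << 20, 1));
		return 0;
	}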
diff --git a/ipc/namespace.c b/ipc/namespace.c
index 1b96765..9171d94 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -20,10 +20,20 @@ static struct ipc_namespace *clone_ipc_ns(struct ipc_namespace *old_ns)
 	if (ns == NULL)
 		return ERR_PTR(-ENOMEM);
 
+	atomic_inc(&nr_ipc_ns);
+
 	sem_init_ns(ns);
 	msg_init_ns(ns);
 	shm_init_ns(ns);
 
+	/*
+	 * msgmni has already been computed for the new ipc ns.
+	 * Thus, do the ipcns creation notification before registering that
+	 * new ipcns in the chain.
+	 */
+	ipcns_notify(IPCNS_CREATED);
+	register_ipcns_notifier(ns);
+
 	kref_init(&ns->kref);
 	return ns;
 }
@@ -79,8 +89,24 @@ void free_ipc_ns(struct kref *kref)
 	struct ipc_namespace *ns;
 
 	ns = container_of(kref, struct ipc_namespace, kref);
+	/*
+	 * Unregistering the hotplug notifier at the beginning guarantees
+	 * that the ipc namespace won't be freed while we are inside the
+	 * callback routine. Since the blocking_notifier_chain_XXX routines
+	 * hold a rw lock on the notifier list, unregister_ipcns_notifier()
+	 * won't take the rw lock before blocking_notifier_call_chain() has
+	 * released the rd lock.
+	 */
+	unregister_ipcns_notifier(ns);
 	sem_exit_ns(ns);
 	msg_exit_ns(ns);
 	shm_exit_ns(ns);
 	kfree(ns);
+	atomic_dec(&nr_ipc_ns);
+
+	/*
+	 * Do the ipcns removal notification after decrementing nr_ipc_ns in
+	 * order to have a correct value when recomputing msgmni.
+	 */
+	ipcns_notify(IPCNS_REMOVED);
 }
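clone_ipc_ns() above means every unshare(CLONE_NEWIPC) recomputes msgmni for the new namespace, and the IPCNS_CREATED event shrinks every other namespace's share, since the range is divided by nr_ipc_ns. A user-space sketch that makes the effect visible (needs CAP_SYS_ADMIN; the /proc read goes through get_ipc(), so it reflects the caller's namespace):

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		FILE *f;
		int msgmni;

		if (unshare(CLONE_NEWIPC))	/* needs CAP_SYS_ADMIN */
			return 1;

		f = fopen("/proc/sys/kernel/msgmni", "r");
		if (f && fscanf(f, "%d", &msgmni) == 1)
			printf("msgmni in new ipc ns: %d\n", msgmni);
		if (f)
			fclose(f);
		return 0;
	}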
diff --git a/ipc/sem.c b/ipc/sem.c
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -91,7 +91,6 @@
 
 #define sem_unlock(sma)	ipc_unlock(&(sma)->sem_perm)
 #define sem_checkid(sma, semid)	ipc_checkid(&sma->sem_perm, semid)
-#define sem_buildid(id, seq)	ipc_buildid(id, seq)
 
 static int newary(struct ipc_namespace *, struct ipc_params *);
 static void freeary(struct ipc_namespace *, struct kern_ipc_perm *);
@@ -142,21 +141,6 @@ void __init sem_init (void)
 }
 
 /*
- * This routine is called in the paths where the rw_mutex is held to protect
- * access to the idr tree.
- */
-static inline struct sem_array *sem_lock_check_down(struct ipc_namespace *ns,
-						int id)
-{
-	struct kern_ipc_perm *ipcp = ipc_lock_check_down(&sem_ids(ns), id);
-
-	if (IS_ERR(ipcp))
-		return (struct sem_array *)ipcp;
-
-	return container_of(ipcp, struct sem_array, sem_perm);
-}
-
-/*
  * sem_lock_(check_) routines are called in the paths where the rw_mutex
  * is not held.
  */
@@ -181,6 +165,25 @@ static inline struct sem_array *sem_lock_check(struct ipc_namespace *ns,
 	return container_of(ipcp, struct sem_array, sem_perm);
 }
 
+static inline void sem_lock_and_putref(struct sem_array *sma)
+{
+	ipc_lock_by_ptr(&sma->sem_perm);
+	ipc_rcu_putref(sma);
+}
+
+static inline void sem_getref_and_unlock(struct sem_array *sma)
+{
+	ipc_rcu_getref(sma);
+	ipc_unlock(&(sma)->sem_perm);
+}
+
+static inline void sem_putref(struct sem_array *sma)
+{
+	ipc_lock_by_ptr(&sma->sem_perm);
+	ipc_rcu_putref(sma);
+	ipc_unlock(&(sma)->sem_perm);
+}
+
 static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
 {
 	ipc_rmid(&sem_ids(ns), &s->sem_perm);
@@ -268,7 +271,6 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
 	}
 	ns->used_sems += nsems;
 
-	sma->sem_perm.id = sem_buildid(id, sma->sem_perm.seq);
 	sma->sem_base = (struct sem *) &sma[1];
 	/* sma->sem_pending = NULL; */
 	sma->sem_pending_last = &sma->sem_pending;
@@ -700,19 +702,15 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
 		int i;
 
 		if(nsems > SEMMSL_FAST) {
-			ipc_rcu_getref(sma);
-			sem_unlock(sma);
+			sem_getref_and_unlock(sma);
 
 			sem_io = ipc_alloc(sizeof(ushort)*nsems);
 			if(sem_io == NULL) {
-				ipc_lock_by_ptr(&sma->sem_perm);
-				ipc_rcu_putref(sma);
-				sem_unlock(sma);
+				sem_putref(sma);
 				return -ENOMEM;
 			}
 
-			ipc_lock_by_ptr(&sma->sem_perm);
-			ipc_rcu_putref(sma);
+			sem_lock_and_putref(sma);
 			if (sma->sem_perm.deleted) {
 				sem_unlock(sma);
 				err = -EIDRM;
@@ -733,38 +731,30 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
 		int i;
 		struct sem_undo *un;
 
-		ipc_rcu_getref(sma);
-		sem_unlock(sma);
+		sem_getref_and_unlock(sma);
 
 		if(nsems > SEMMSL_FAST) {
 			sem_io = ipc_alloc(sizeof(ushort)*nsems);
 			if(sem_io == NULL) {
-				ipc_lock_by_ptr(&sma->sem_perm);
-				ipc_rcu_putref(sma);
-				sem_unlock(sma);
+				sem_putref(sma);
 				return -ENOMEM;
 			}
 		}
 
 		if (copy_from_user (sem_io, arg.array, nsems*sizeof(ushort))) {
-			ipc_lock_by_ptr(&sma->sem_perm);
-			ipc_rcu_putref(sma);
-			sem_unlock(sma);
+			sem_putref(sma);
 			err = -EFAULT;
 			goto out_free;
 		}
 
 		for (i = 0; i < nsems; i++) {
 			if (sem_io[i] > SEMVMX) {
-				ipc_lock_by_ptr(&sma->sem_perm);
-				ipc_rcu_putref(sma);
-				sem_unlock(sma);
+				sem_putref(sma);
 				err = -ERANGE;
 				goto out_free;
 			}
 		}
-		ipc_lock_by_ptr(&sma->sem_perm);
-		ipc_rcu_putref(sma);
+		sem_lock_and_putref(sma);
 		if (sma->sem_perm.deleted) {
 			sem_unlock(sma);
 			err = -EIDRM;
@@ -830,28 +820,14 @@ out_free:
 	return err;
 }
 
-struct sem_setbuf {
-	uid_t	uid;
-	gid_t	gid;
-	mode_t	mode;
-};
-
-static inline unsigned long copy_semid_from_user(struct sem_setbuf *out, void __user *buf, int version)
+static inline unsigned long
+copy_semid_from_user(struct semid64_ds *out, void __user *buf, int version)
 {
 	switch(version) {
 	case IPC_64:
-	{
-		struct semid64_ds tbuf;
-
-		if(copy_from_user(&tbuf, buf, sizeof(tbuf)))
+		if (copy_from_user(out, buf, sizeof(*out)))
 			return -EFAULT;
-
-		out->uid	= tbuf.sem_perm.uid;
-		out->gid	= tbuf.sem_perm.gid;
-		out->mode	= tbuf.sem_perm.mode;
-
 		return 0;
-	}
 	case IPC_OLD:
 	{
 		struct semid_ds tbuf_old;
@@ -859,9 +835,9 @@ static inline unsigned long copy_semid_from_user(struct sem_setbuf *out, void __
 		if(copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
 			return -EFAULT;
 
-		out->uid	= tbuf_old.sem_perm.uid;
-		out->gid	= tbuf_old.sem_perm.gid;
-		out->mode	= tbuf_old.sem_perm.mode;
+		out->sem_perm.uid	= tbuf_old.sem_perm.uid;
+		out->sem_perm.gid	= tbuf_old.sem_perm.gid;
+		out->sem_perm.mode	= tbuf_old.sem_perm.mode;
 
 		return 0;
 	}
@@ -870,38 +846,29 @@ static inline unsigned long copy_semid_from_user(struct sem_setbuf *out, void __
 	}
 }
 
-static int semctl_down(struct ipc_namespace *ns, int semid, int semnum,
-		int cmd, int version, union semun arg)
+/*
+ * This function handles some semctl commands which require the rw_mutex
+ * to be held in write mode.
+ * NOTE: no locks must be held, the rw_mutex is taken inside this function.
+ */
+static int semctl_down(struct ipc_namespace *ns, int semid,
+		       int cmd, int version, union semun arg)
 {
 	struct sem_array *sma;
 	int err;
-	struct sem_setbuf uninitialized_var(setbuf);
+	struct semid64_ds semid64;
 	struct kern_ipc_perm *ipcp;
 
 	if(cmd == IPC_SET) {
-		if(copy_semid_from_user (&setbuf, arg.buf, version))
+		if (copy_semid_from_user(&semid64, arg.buf, version))
 			return -EFAULT;
 	}
-	sma = sem_lock_check_down(ns, semid);
-	if (IS_ERR(sma))
-		return PTR_ERR(sma);
 
-	ipcp = &sma->sem_perm;
-
-	err = audit_ipc_obj(ipcp);
-	if (err)
-		goto out_unlock;
+	ipcp = ipcctl_pre_down(&sem_ids(ns), semid, cmd, &semid64.sem_perm, 0);
+	if (IS_ERR(ipcp))
+		return PTR_ERR(ipcp);
 
-	if (cmd == IPC_SET) {
-		err = audit_ipc_set_perm(0, setbuf.uid, setbuf.gid, setbuf.mode);
-		if (err)
-			goto out_unlock;
-	}
-	if (current->euid != ipcp->cuid &&
-	    current->euid != ipcp->uid && !capable(CAP_SYS_ADMIN)) {
-		err=-EPERM;
-		goto out_unlock;
-	}
+	sma = container_of(ipcp, struct sem_array, sem_perm);
 
 	err = security_sem_semctl(sma, cmd);
 	if (err)
@@ -910,26 +877,19 @@ static int semctl_down(struct ipc_namespace *ns, int semid, int semnum,
 	switch(cmd){
 	case IPC_RMID:
 		freeary(ns, ipcp);
-		err = 0;
-		break;
+		goto out_up;
 	case IPC_SET:
-		ipcp->uid = setbuf.uid;
-		ipcp->gid = setbuf.gid;
-		ipcp->mode = (ipcp->mode & ~S_IRWXUGO)
-				| (setbuf.mode & S_IRWXUGO);
+		ipc_update_perm(&semid64.sem_perm, ipcp);
 		sma->sem_ctime = get_seconds();
-		sem_unlock(sma);
-		err = 0;
 		break;
 	default:
-		sem_unlock(sma);
 		err = -EINVAL;
-		break;
 	}
-	return err;
 
 out_unlock:
 	sem_unlock(sma);
+out_up:
+	up_write(&sem_ids(ns).rw_mutex);
 	return err;
 }
 
@@ -963,9 +923,7 @@ asmlinkage long sys_semctl (int semid, int semnum, int cmd, union semun arg)
 		return err;
 	case IPC_RMID:
 	case IPC_SET:
-		down_write(&sem_ids(ns).rw_mutex);
-		err = semctl_down(ns,semid,semnum,cmd,version,arg);
-		up_write(&sem_ids(ns).rw_mutex);
+		err = semctl_down(ns, semid, cmd, version, arg);
 		return err;
 	default:
 		return -EINVAL;
@@ -1044,14 +1002,11 @@ static struct sem_undo *find_undo(struct ipc_namespace *ns, int semid)
 		return ERR_PTR(PTR_ERR(sma));
 
 	nsems = sma->sem_nsems;
-	ipc_rcu_getref(sma);
-	sem_unlock(sma);
+	sem_getref_and_unlock(sma);
 
 	new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL);
 	if (!new) {
-		ipc_lock_by_ptr(&sma->sem_perm);
-		ipc_rcu_putref(sma);
-		sem_unlock(sma);
+		sem_putref(sma);
 		return ERR_PTR(-ENOMEM);
 	}
 	new->semadj = (short *) &new[1];
@@ -1062,13 +1017,10 @@ static struct sem_undo *find_undo(struct ipc_namespace *ns, int semid)
 	if (un) {
 		spin_unlock(&ulp->lock);
 		kfree(new);
-		ipc_lock_by_ptr(&sma->sem_perm);
-		ipc_rcu_putref(sma);
-		sem_unlock(sma);
+		sem_putref(sma);
 		goto out;
 	}
-	ipc_lock_by_ptr(&sma->sem_perm);
-	ipc_rcu_putref(sma);
+	sem_lock_and_putref(sma);
 	if (sma->sem_perm.deleted) {
 		sem_unlock(sma);
 		spin_unlock(&ulp->lock);
@@ -1298,6 +1250,7 @@ void exit_sem(struct task_struct *tsk)
 	undo_list = tsk->sysvsem.undo_list;
 	if (!undo_list)
 		return;
+	tsk->sysvsem.undo_list = NULL;
 
 	if (!atomic_dec_and_test(&undo_list->refcnt))
 		return;
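With semctl_down() rewritten this way, IPC_SET funnels through ipcctl_pre_down() and ipc_update_perm(), so only uid, gid and the nine S_IRWXUGO mode bits of the caller-supplied buffer take effect. A user-space sketch of that path (defining union semun is the caller's job, per semctl(2)):

	#include <stdio.h>
	#include <sys/ipc.h>
	#include <sys/sem.h>

	union semun {			/* caller-defined, per semctl(2) */
		int val;
		struct semid_ds *buf;
		unsigned short *array;
	};

	int main(void)
	{
		int id = semget(IPC_PRIVATE, 1, 0600);
		struct semid_ds ds;
		union semun arg = { .buf = &ds };

		if (id < 0 || semctl(id, 0, IPC_STAT, arg) < 0)
			return 1;
		ds.sem_perm.mode = 0640;	/* only S_IRWXUGO bits take effect */
		if (semctl(id, 0, IPC_SET, arg) < 0)
			return 1;
		semctl(id, 0, IPC_RMID);	/* clean up */
		return 0;
	}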
diff --git a/ipc/shm.c b/ipc/shm.c
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -60,7 +60,6 @@ static struct vm_operations_struct shm_vm_ops;
 
 #define shm_unlock(shp)	\
 	ipc_unlock(&(shp)->shm_perm)
-#define shm_buildid(id, seq)	ipc_buildid(id, seq)
 
 static int newseg(struct ipc_namespace *, struct ipc_params *);
 static void shm_open(struct vm_area_struct *vma);
@@ -127,18 +126,6 @@ static inline struct shmid_kernel *shm_lock_down(struct ipc_namespace *ns,
 	return container_of(ipcp, struct shmid_kernel, shm_perm);
 }
 
-static inline struct shmid_kernel *shm_lock_check_down(
-						struct ipc_namespace *ns,
-						int id)
-{
-	struct kern_ipc_perm *ipcp = ipc_lock_check_down(&shm_ids(ns), id);
-
-	if (IS_ERR(ipcp))
-		return (struct shmid_kernel *)ipcp;
-
-	return container_of(ipcp, struct shmid_kernel, shm_perm);
-}
-
 /*
  * shm_lock_(check_) routines are called in the paths where the rw_mutex
  * is not held.
@@ -169,12 +156,6 @@ static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
 	ipc_rmid(&shm_ids(ns), &s->shm_perm);
 }
 
-static inline int shm_addid(struct ipc_namespace *ns, struct shmid_kernel *shp)
-{
-	return ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
-}
-
-
 
 /* This is called by fork, once for every shm attach. */
 static void shm_open(struct vm_area_struct *vma)
@@ -416,7 +397,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 	if (IS_ERR(file))
 		goto no_file;
 
-	id = shm_addid(ns, shp);
+	id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
 	if (id < 0) {
 		error = id;
 		goto no_id;
@@ -428,7 +409,6 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 	shp->shm_ctim = get_seconds();
 	shp->shm_segsz = size;
 	shp->shm_nattch = 0;
-	shp->shm_perm.id = shm_buildid(id, shp->shm_perm.seq);
 	shp->shm_file = file;
 	/*
 	 * shmid gets reported as "inode#" in /proc/pid/maps.
@@ -519,28 +499,14 @@ static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_
 	}
 }
 
-struct shm_setbuf {
-	uid_t	uid;
-	gid_t	gid;
-	mode_t	mode;
-};
-
-static inline unsigned long copy_shmid_from_user(struct shm_setbuf *out, void __user *buf, int version)
+static inline unsigned long
+copy_shmid_from_user(struct shmid64_ds *out, void __user *buf, int version)
 {
 	switch(version) {
 	case IPC_64:
-	{
-		struct shmid64_ds tbuf;
-
-		if (copy_from_user(&tbuf, buf, sizeof(tbuf)))
+		if (copy_from_user(out, buf, sizeof(*out)))
 			return -EFAULT;
-
-		out->uid	= tbuf.shm_perm.uid;
-		out->gid	= tbuf.shm_perm.gid;
-		out->mode	= tbuf.shm_perm.mode;
-
 		return 0;
-	}
 	case IPC_OLD:
 	{
 		struct shmid_ds tbuf_old;
@@ -548,9 +514,9 @@ static inline unsigned long copy_shmid_from_user(struct shm_setbuf *out, void __
 		if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
 			return -EFAULT;
 
-		out->uid	= tbuf_old.shm_perm.uid;
-		out->gid	= tbuf_old.shm_perm.gid;
-		out->mode	= tbuf_old.shm_perm.mode;
+		out->shm_perm.uid	= tbuf_old.shm_perm.uid;
+		out->shm_perm.gid	= tbuf_old.shm_perm.gid;
+		out->shm_perm.mode	= tbuf_old.shm_perm.mode;
 
 		return 0;
 	}
@@ -624,9 +590,53 @@ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss,
 	}
 }
 
-asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf)
+/*
+ * This function handles some shmctl commands which require the rw_mutex
+ * to be held in write mode.
+ * NOTE: no locks must be held, the rw_mutex is taken inside this function.
+ */
+static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd,
+		       struct shmid_ds __user *buf, int version)
+{
+	struct kern_ipc_perm *ipcp;
+	struct shmid64_ds shmid64;
+	struct shmid_kernel *shp;
+	int err;
+
+	if (cmd == IPC_SET) {
+		if (copy_shmid_from_user(&shmid64, buf, version))
+			return -EFAULT;
+	}
+
+	ipcp = ipcctl_pre_down(&shm_ids(ns), shmid, cmd, &shmid64.shm_perm, 0);
+	if (IS_ERR(ipcp))
+		return PTR_ERR(ipcp);
+
+	shp = container_of(ipcp, struct shmid_kernel, shm_perm);
+
+	err = security_shm_shmctl(shp, cmd);
+	if (err)
+		goto out_unlock;
+	switch (cmd) {
+	case IPC_RMID:
+		do_shm_rmid(ns, ipcp);
+		goto out_up;
+	case IPC_SET:
+		ipc_update_perm(&shmid64.shm_perm, ipcp);
+		shp->shm_ctim = get_seconds();
+		break;
+	default:
+		err = -EINVAL;
+	}
+out_unlock:
+	shm_unlock(shp);
+out_up:
+	up_write(&shm_ids(ns).rw_mutex);
+	return err;
+}
+
+asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf)
 {
-	struct shm_setbuf setbuf;
 	struct shmid_kernel *shp;
 	int err, version;
 	struct ipc_namespace *ns;
@@ -783,97 +793,13 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf)
 		goto out;
 	}
 	case IPC_RMID:
-	{
-		/*
-		 * We cannot simply remove the file. The SVID states
-		 * that the block remains until the last person
-		 * detaches from it, then is deleted. A shmat() on
-		 * an RMID segment is legal in older Linux and if
-		 * we change it apps break...
-		 *
-		 * Instead we set a destroyed flag, and then blow
-		 * the name away when the usage hits zero.
-		 */
-		down_write(&shm_ids(ns).rw_mutex);
-		shp = shm_lock_check_down(ns, shmid);
-		if (IS_ERR(shp)) {
-			err = PTR_ERR(shp);
-			goto out_up;
-		}
-
-		err = audit_ipc_obj(&(shp->shm_perm));
-		if (err)
-			goto out_unlock_up;
-
-		if (current->euid != shp->shm_perm.uid &&
-		    current->euid != shp->shm_perm.cuid &&
-		    !capable(CAP_SYS_ADMIN)) {
-			err=-EPERM;
-			goto out_unlock_up;
-		}
-
-		err = security_shm_shmctl(shp, cmd);
-		if (err)
-			goto out_unlock_up;
-
-		do_shm_rmid(ns, &shp->shm_perm);
-		up_write(&shm_ids(ns).rw_mutex);
-		goto out;
-	}
-
 	case IPC_SET:
-	{
-		if (!buf) {
-			err = -EFAULT;
-			goto out;
-		}
-
-		if (copy_shmid_from_user (&setbuf, buf, version)) {
-			err = -EFAULT;
-			goto out;
-		}
-		down_write(&shm_ids(ns).rw_mutex);
-		shp = shm_lock_check_down(ns, shmid);
-		if (IS_ERR(shp)) {
-			err = PTR_ERR(shp);
-			goto out_up;
-		}
-		err = audit_ipc_obj(&(shp->shm_perm));
-		if (err)
-			goto out_unlock_up;
-		err = audit_ipc_set_perm(0, setbuf.uid, setbuf.gid, setbuf.mode);
-		if (err)
-			goto out_unlock_up;
-		err=-EPERM;
-		if (current->euid != shp->shm_perm.uid &&
-		    current->euid != shp->shm_perm.cuid &&
-		    !capable(CAP_SYS_ADMIN)) {
-			goto out_unlock_up;
-		}
-
-		err = security_shm_shmctl(shp, cmd);
-		if (err)
-			goto out_unlock_up;
-
-		shp->shm_perm.uid = setbuf.uid;
-		shp->shm_perm.gid = setbuf.gid;
-		shp->shm_perm.mode = (shp->shm_perm.mode & ~S_IRWXUGO)
-			| (setbuf.mode & S_IRWXUGO);
-		shp->shm_ctim = get_seconds();
-		break;
-	}
-
+		err = shmctl_down(ns, shmid, cmd, buf, version);
+		return err;
 	default:
-		err = -EINVAL;
-		goto out;
+		return -EINVAL;
 	}
 
-	err = 0;
-out_unlock_up:
-	shm_unlock(shp);
-out_up:
-	up_write(&shm_ids(ns).rw_mutex);
-	goto out;
 out_unlock:
 	shm_unlock(shp);
 out:
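The big block deleted from sys_shmctl() carried the SVID rule that shmctl_down()'s do_shm_rmid() still honours: IPC_RMID only marks a segment destroyed, and it survives until the last detach. A user-space sketch of those semantics:

	#include <stdio.h>
	#include <string.h>
	#include <sys/ipc.h>
	#include <sys/shm.h>

	int main(void)
	{
		int id = shmget(IPC_PRIVATE, 4096, 0600);
		char *p = shmat(id, NULL, 0);

		if (id < 0 || p == (char *)-1)
			return 1;
		shmctl(id, IPC_RMID, NULL);	/* segment is only marked destroyed */
		strcpy(p, "still mapped");	/* ...so this access is still valid */
		puts(p);
		shmdt(p);			/* the last detach actually frees it */
		return 0;
	}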
diff --git a/ipc/util.c b/ipc/util.c
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -33,6 +33,7 @@
 #include <linux/audit.h>
 #include <linux/nsproxy.h>
 #include <linux/rwsem.h>
+#include <linux/memory.h>
 #include <linux/ipc_namespace.h>
 
 #include <asm/unistd.h>
@@ -52,11 +53,57 @@ struct ipc_namespace init_ipc_ns = {
 	},
 };
 
+atomic_t nr_ipc_ns = ATOMIC_INIT(1);
+
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+
+static void ipc_memory_notifier(struct work_struct *work)
+{
+	ipcns_notify(IPCNS_MEMCHANGED);
+}
+
+static DECLARE_WORK(ipc_memory_wq, ipc_memory_notifier);
+
+
+static int ipc_memory_callback(struct notifier_block *self,
+				unsigned long action, void *arg)
+{
+	switch (action) {
+	case MEM_ONLINE:    /* memory successfully brought online */
+	case MEM_OFFLINE:   /* or offline: it's time to recompute msgmni */
+		/*
+		 * This is done by invoking the ipcns notifier chain with the
+		 * IPC_MEMCHANGED event.
+		 * In order not to keep the lock on the hotplug memory chain
+		 * for too long, queue a work item that will, when waken up,
+		 * activate the ipcns notification chain.
+		 * No need to keep several ipc work items on the queue.
+		 */
+		if (!work_pending(&ipc_memory_wq))
+			schedule_work(&ipc_memory_wq);
+		break;
+	case MEM_GOING_ONLINE:
+	case MEM_GOING_OFFLINE:
+	case MEM_CANCEL_ONLINE:
+	case MEM_CANCEL_OFFLINE:
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
 /**
  *	ipc_init	-	initialise IPC subsystem
  *
  *	The various system5 IPC resources (semaphores, messages and shared
  *	memory) are initialised
+ *	A callback routine is registered into the memory hotplug notifier
+ *	chain: since msgmni scales to lowmem this callback routine will be
+ *	called upon successful memory add / remove to recompute msmgni.
  */
 
 static int __init ipc_init(void)
@@ -64,6 +111,8 @@ static int __init ipc_init(void)
 	sem_init();
 	msg_init();
 	shm_init();
+	hotplug_memory_notifier(ipc_memory_callback, IPC_CALLBACK_PRI);
+	register_ipcns_notifier(&init_ipc_ns);
 	return 0;
 }
 __initcall(ipc_init);
@@ -84,8 +133,8 @@ void ipc_init_ids(struct ipc_ids *ids)
 	ids->seq = 0;
 	{
 		int seq_limit = INT_MAX/SEQ_MULTIPLIER;
-		if(seq_limit > USHRT_MAX)
-			ids->seq_max = USHRT_MAX;
+		if (seq_limit > USHORT_MAX)
+			ids->seq_max = USHORT_MAX;
 		else
 			ids->seq_max = seq_limit;
 	}
@@ -116,13 +165,12 @@ void __init ipc_init_proc_interface(const char *path, const char *header,
 	iface->ids	= ids;
 	iface->show	= show;
 
-	pde = create_proc_entry(path,
-				S_IRUGO,        /* world readable */
-				NULL            /* parent dir */);
-	if (pde) {
-		pde->data = iface;
-		pde->proc_fops = &sysvipc_proc_fops;
-	} else {
+	pde = proc_create_data(path,
+			       S_IRUGO,        /* world readable */
+			       NULL,           /* parent dir */
+			       &sysvipc_proc_fops,
+			       iface);
+	if (!pde) {
 		kfree(iface);
 	}
 }
@@ -231,6 +279,7 @@ int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size)
 	if(ids->seq > ids->seq_max)
 		ids->seq = 0;
 
+	new->id = ipc_buildid(id, new->seq);
 	spin_lock_init(&new->lock);
 	new->deleted = 0;
 	rcu_read_lock();
@@ -761,6 +810,70 @@ int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
 		return ipcget_public(ns, ids, ops, params);
 }
 
+/**
+ * ipc_update_perm	-	update the permissions of an IPC.
+ * @in:  the permission given as input.
+ * @out: the permission of the ipc to set.
+ */
+void ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out)
+{
+	out->uid = in->uid;
+	out->gid = in->gid;
+	out->mode = (out->mode & ~S_IRWXUGO)
+		| (in->mode & S_IRWXUGO);
+}
+
+/**
+ * ipcctl_pre_down - retrieve an ipc and check permissions for some IPC_XXX cmd
+ * @ids:  the table of ids where to look for the ipc
+ * @id:   the id of the ipc to retrieve
+ * @cmd:  the cmd to check
+ * @perm: the permission to set
+ * @extra_perm: one extra permission parameter used by msq
+ *
+ * This function does some common audit and permissions check for some IPC_XXX
+ * cmd and is called from semctl_down, shmctl_down and msgctl_down.
+ * It must be called without any lock held and
+ *  - retrieves the ipc with the given id in the given table.
+ *  - performs some audit and permission check, depending on the given cmd
+ *  - returns the ipc with both ipc and rw_mutex locks held in case of success
+ *    or an err-code without any lock held otherwise.
+ */
+struct kern_ipc_perm *ipcctl_pre_down(struct ipc_ids *ids, int id, int cmd,
+				      struct ipc64_perm *perm, int extra_perm)
+{
+	struct kern_ipc_perm *ipcp;
+	int err;
+
+	down_write(&ids->rw_mutex);
+	ipcp = ipc_lock_check_down(ids, id);
+	if (IS_ERR(ipcp)) {
+		err = PTR_ERR(ipcp);
+		goto out_up;
+	}
+
+	err = audit_ipc_obj(ipcp);
+	if (err)
+		goto out_unlock;
+
+	if (cmd == IPC_SET) {
+		err = audit_ipc_set_perm(extra_perm, perm->uid,
+					 perm->gid, perm->mode);
+		if (err)
+			goto out_unlock;
+	}
+	if (current->euid == ipcp->cuid ||
+	    current->euid == ipcp->uid  || capable(CAP_SYS_ADMIN))
+		return ipcp;
+
+	err = -EPERM;
+out_unlock:
+	ipc_unlock(ipcp);
+out_up:
+	up_write(&ids->rw_mutex);
+	return ERR_PTR(err);
+}
+
 #ifdef __ARCH_WANT_IPC_PARSE_VERSION
diff --git a/ipc/util.h b/ipc/util.h
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -12,7 +12,6 @@
 
 #include <linux/err.h>
 
-#define USHRT_MAX 0xffff
 #define SEQ_MULTIPLIER	(IPCMNI)
 
 void sem_init (void);
@@ -112,6 +111,9 @@ struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int);
 
 void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out);
 void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out);
+void ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out);
+struct kern_ipc_perm *ipcctl_pre_down(struct ipc_ids *ids, int id, int cmd,
+				      struct ipc64_perm *perm, int extra_perm);
 
 #if defined(__ia64__) || defined(__x86_64__) || defined(__hppa__) || defined(__XTENSA__)
   /* On IA-64, we always use the "64-bit version" of the IPC structures.  */
@@ -124,6 +126,8 @@ extern void free_msg(struct msg_msg *msg);
 extern struct msg_msg *load_msg(const void __user *src, int len);
 extern int store_msg(void __user *dest, struct msg_msg *msg, int len);
 
+extern void recompute_msgmni(struct ipc_namespace *);
+
 static inline int ipc_buildid(int id, int seq)
 {
 	return SEQ_MULTIPLIER * seq + id;
 }
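With ipc_addid() now stamping new->id centrally (see the ipc/util.c hunk above), ipc_buildid() is the single place where a user-visible id is composed from the idr slot and the per-slot sequence number. A stand-alone sketch of the arithmetic, assuming this tree's IPCMNI value of 32768:

	#include <stdio.h>

	#define IPCMNI		32768		/* assumed from include/linux/ipc.h */
	#define SEQ_MULTIPLIER	(IPCMNI)

	static int ipc_buildid(int id, int seq)
	{
		return SEQ_MULTIPLIER * seq + id;
	}

	int main(void)
	{
		int id = ipc_buildid(5, 3);	/* idr slot 5, third reuse of it */

		printf("id=%d slot=%d seq=%d\n", id, id % SEQ_MULTIPLIER,
		       id / SEQ_MULTIPLIER);	/* -> id=98309 slot=5 seq=3 */
		return 0;
	}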