diff options
Diffstat (limited to 'ipc')
-rw-r--r-- | ipc/ipc_sysctl.c | 20 | ||||
-rw-r--r-- | ipc/msg.c | 69 | ||||
-rw-r--r-- | ipc/namespace.c | 9 | ||||
-rw-r--r-- | ipc/sem.c | 280 | ||||
-rw-r--r-- | ipc/shm.c | 264 | ||||
-rw-r--r-- | ipc/util.c | 129 | ||||
-rw-r--r-- | ipc/util.h | 24 |
7 files changed, 466 insertions, 329 deletions
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 130dfec..b0e99de 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -62,7 +62,7 @@ static int proc_ipc_dointvec_minmax_orphans(ctl_table *table, int write, return err; } -static int proc_ipc_callback_dointvec(ctl_table *table, int write, +static int proc_ipc_callback_dointvec_minmax(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; @@ -72,7 +72,7 @@ static int proc_ipc_callback_dointvec(ctl_table *table, int write, memcpy(&ipc_table, table, sizeof(ipc_table)); ipc_table.data = get_ipc(table); - rc = proc_dointvec(&ipc_table, write, buffer, lenp, ppos); + rc = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); if (write && !rc && lenp_bef == *lenp) /* @@ -152,15 +152,13 @@ static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write, #define proc_ipc_dointvec NULL #define proc_ipc_dointvec_minmax NULL #define proc_ipc_dointvec_minmax_orphans NULL -#define proc_ipc_callback_dointvec NULL +#define proc_ipc_callback_dointvec_minmax NULL #define proc_ipcauto_dointvec_minmax NULL #endif static int zero; static int one = 1; -#ifdef CONFIG_CHECKPOINT_RESTORE static int int_max = INT_MAX; -#endif static struct ctl_table ipc_kern_table[] = { { @@ -198,21 +196,27 @@ static struct ctl_table ipc_kern_table[] = { .data = &init_ipc_ns.msg_ctlmax, .maxlen = sizeof (init_ipc_ns.msg_ctlmax), .mode = 0644, - .proc_handler = proc_ipc_dointvec, + .proc_handler = proc_ipc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &int_max, }, { .procname = "msgmni", .data = &init_ipc_ns.msg_ctlmni, .maxlen = sizeof (init_ipc_ns.msg_ctlmni), .mode = 0644, - .proc_handler = proc_ipc_callback_dointvec, + .proc_handler = proc_ipc_callback_dointvec_minmax, + .extra1 = &zero, + .extra2 = &int_max, }, { .procname = "msgmnb", .data = &init_ipc_ns.msg_ctlmnb, .maxlen = sizeof (init_ipc_ns.msg_ctlmnb), .mode = 0644, - .proc_handler = proc_ipc_dointvec, + .proc_handler = proc_ipc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &int_max, }, { .procname = "sem", @@ -70,8 +70,6 @@ struct msg_sender { #define msg_ids(ns) ((ns)->ids[IPC_MSG_IDS]) -#define msg_unlock(msq) ipc_unlock(&(msq)->q_perm) - static void freeque(struct ipc_namespace *, struct kern_ipc_perm *); static int newque(struct ipc_namespace *, struct ipc_params *); #ifdef CONFIG_PROC_FS @@ -167,12 +165,21 @@ static inline void msg_rmid(struct ipc_namespace *ns, struct msg_queue *s) ipc_rmid(&msg_ids(ns), &s->q_perm); } +static void msg_rcu_free(struct rcu_head *head) +{ + struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu); + struct msg_queue *msq = ipc_rcu_to_struct(p); + + security_msg_queue_free(msq); + ipc_rcu_free(head); +} + /** * newque - Create a new msg queue * @ns: namespace * @params: ptr to the structure that contains the key and msgflg * - * Called with msg_ids.rw_mutex held (writer) + * Called with msg_ids.rwsem held (writer) */ static int newque(struct ipc_namespace *ns, struct ipc_params *params) { @@ -191,15 +198,14 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params) msq->q_perm.security = NULL; retval = security_msg_queue_alloc(msq); if (retval) { - ipc_rcu_putref(msq); + ipc_rcu_putref(msq, ipc_rcu_free); return retval; } /* ipc_addid() locks msq upon success. */ id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni); if (id < 0) { - security_msg_queue_free(msq); - ipc_rcu_putref(msq); + ipc_rcu_putref(msq, msg_rcu_free); return id; } @@ -259,8 +265,8 @@ static void expunge_all(struct msg_queue *msq, int res) * removes the message queue from message queue ID IDR, and cleans up all the * messages associated with this queue. * - * msg_ids.rw_mutex (writer) and the spinlock for this message queue are held - * before freeque() is called. msg_ids.rw_mutex remains locked on exit. + * msg_ids.rwsem (writer) and the spinlock for this message queue are held + * before freeque() is called. msg_ids.rwsem remains locked on exit. */ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) { @@ -270,19 +276,19 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) expunge_all(msq, -EIDRM); ss_wakeup(&msq->q_senders, 1); msg_rmid(ns, msq); - msg_unlock(msq); + ipc_unlock_object(&msq->q_perm); + rcu_read_unlock(); list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) { atomic_dec(&ns->msg_hdrs); free_msg(msg); } atomic_sub(msq->q_cbytes, &ns->msg_bytes); - security_msg_queue_free(msq); - ipc_rcu_putref(msq); + ipc_rcu_putref(msq, msg_rcu_free); } /* - * Called with msg_ids.rw_mutex and ipcp locked. + * Called with msg_ids.rwsem and ipcp locked. */ static inline int msg_security(struct kern_ipc_perm *ipcp, int msgflg) { @@ -386,9 +392,9 @@ copy_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version) } /* - * This function handles some msgctl commands which require the rw_mutex + * This function handles some msgctl commands which require the rwsem * to be held in write mode. - * NOTE: no locks must be held, the rw_mutex is taken inside this function. + * NOTE: no locks must be held, the rwsem is taken inside this function. */ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, struct msqid_ds __user *buf, int version) @@ -403,7 +409,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, return -EFAULT; } - down_write(&msg_ids(ns).rw_mutex); + down_write(&msg_ids(ns).rwsem); rcu_read_lock(); ipcp = ipcctl_pre_down_nolock(ns, &msg_ids(ns), msqid, cmd, @@ -459,7 +465,7 @@ out_unlock0: out_unlock1: rcu_read_unlock(); out_up: - up_write(&msg_ids(ns).rw_mutex); + up_write(&msg_ids(ns).rwsem); return err; } @@ -494,7 +500,7 @@ static int msgctl_nolock(struct ipc_namespace *ns, int msqid, msginfo.msgmnb = ns->msg_ctlmnb; msginfo.msgssz = MSGSSZ; msginfo.msgseg = MSGSEG; - down_read(&msg_ids(ns).rw_mutex); + down_read(&msg_ids(ns).rwsem); if (cmd == MSG_INFO) { msginfo.msgpool = msg_ids(ns).in_use; msginfo.msgmap = atomic_read(&ns->msg_hdrs); @@ -505,7 +511,7 @@ static int msgctl_nolock(struct ipc_namespace *ns, int msqid, msginfo.msgtql = MSGTQL; } max_id = ipc_get_maxid(&msg_ids(ns)); - up_read(&msg_ids(ns).rw_mutex); + up_read(&msg_ids(ns).rwsem); if (copy_to_user(buf, &msginfo, sizeof(struct msginfo))) return -EFAULT; return (max_id < 0) ? 0 : max_id; @@ -680,16 +686,24 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, goto out_unlock1; } + ipc_lock_object(&msq->q_perm); + for (;;) { struct msg_sender s; err = -EACCES; if (ipcperms(ns, &msq->q_perm, S_IWUGO)) - goto out_unlock1; + goto out_unlock0; + + /* raced with RMID? */ + if (msq->q_perm.deleted) { + err = -EIDRM; + goto out_unlock0; + } err = security_msg_queue_msgsnd(msq, msg, msgflg); if (err) - goto out_unlock1; + goto out_unlock0; if (msgsz + msq->q_cbytes <= msq->q_qbytes && 1 + msq->q_qnum <= msq->q_qbytes) { @@ -699,10 +713,9 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, /* queue full, wait: */ if (msgflg & IPC_NOWAIT) { err = -EAGAIN; - goto out_unlock1; + goto out_unlock0; } - ipc_lock_object(&msq->q_perm); ss_add(msq, &s); if (!ipc_rcu_getref(msq)) { @@ -717,7 +730,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, rcu_read_lock(); ipc_lock_object(&msq->q_perm); - ipc_rcu_putref(msq); + ipc_rcu_putref(msq, ipc_rcu_free); if (msq->q_perm.deleted) { err = -EIDRM; goto out_unlock0; @@ -730,10 +743,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, goto out_unlock0; } - ipc_unlock_object(&msq->q_perm); } - - ipc_lock_object(&msq->q_perm); msq->q_lspid = task_tgid_vnr(current); msq->q_stime = get_seconds(); @@ -897,6 +907,13 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl goto out_unlock1; ipc_lock_object(&msq->q_perm); + + /* raced with RMID? */ + if (msq->q_perm.deleted) { + msg = ERR_PTR(-EIDRM); + goto out_unlock0; + } + msg = find_msg(msq, &msgtyp, mode); if (!IS_ERR(msg)) { /* diff --git a/ipc/namespace.c b/ipc/namespace.c index 7ee61bf..59451c1 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -81,7 +81,7 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, int next_id; int total, in_use; - down_write(&ids->rw_mutex); + down_write(&ids->rwsem); in_use = ids->in_use; @@ -89,11 +89,12 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, perm = idr_find(&ids->ipcs_idr, next_id); if (perm == NULL) continue; - ipc_lock_by_ptr(perm); + rcu_read_lock(); + ipc_lock_object(perm); free(ns, perm); total++; } - up_write(&ids->rw_mutex); + up_write(&ids->rwsem); } static void free_ipc_ns(struct ipc_namespace *ns) @@ -171,7 +172,7 @@ static int ipcns_install(struct nsproxy *nsproxy, void *new) { struct ipc_namespace *ns = new; if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || - !nsown_capable(CAP_SYS_ADMIN)) + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; /* Ditch state from the old ipc namespace */ @@ -243,71 +243,122 @@ static void merge_queues(struct sem_array *sma) } } +static void sem_rcu_free(struct rcu_head *head) +{ + struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu); + struct sem_array *sma = ipc_rcu_to_struct(p); + + security_sem_free(sma); + ipc_rcu_free(head); +} + +/* + * Wait until all currently ongoing simple ops have completed. + * Caller must own sem_perm.lock. + * New simple ops cannot start, because simple ops first check + * that sem_perm.lock is free. + * that a) sem_perm.lock is free and b) complex_count is 0. + */ +static void sem_wait_array(struct sem_array *sma) +{ + int i; + struct sem *sem; + + if (sma->complex_count) { + /* The thread that increased sma->complex_count waited on + * all sem->lock locks. Thus we don't need to wait again. + */ + return; + } + + for (i = 0; i < sma->sem_nsems; i++) { + sem = sma->sem_base + i; + spin_unlock_wait(&sem->lock); + } +} + /* * If the request contains only one semaphore operation, and there are * no complex transactions pending, lock only the semaphore involved. * Otherwise, lock the entire semaphore array, since we either have * multiple semaphores in our own semops, or we need to look at * semaphores from other pending complex operations. - * - * Carefully guard against sma->complex_count changing between zero - * and non-zero while we are spinning for the lock. The value of - * sma->complex_count cannot change while we are holding the lock, - * so sem_unlock should be fine. - * - * The global lock path checks that all the local locks have been released, - * checking each local lock once. This means that the local lock paths - * cannot start their critical sections while the global lock is held. */ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, int nsops) { - int locknum; - again: - if (nsops == 1 && !sma->complex_count) { - struct sem *sem = sma->sem_base + sops->sem_num; + struct sem *sem; - /* Lock just the semaphore we are interested in. */ - spin_lock(&sem->lock); + if (nsops != 1) { + /* Complex operation - acquire a full lock */ + ipc_lock_object(&sma->sem_perm); - /* - * If sma->complex_count was set while we were spinning, - * we may need to look at things we did not lock here. + /* And wait until all simple ops that are processed + * right now have dropped their locks. */ - if (unlikely(sma->complex_count)) { - spin_unlock(&sem->lock); - goto lock_array; - } + sem_wait_array(sma); + return -1; + } + + /* + * Only one semaphore affected - try to optimize locking. + * The rules are: + * - optimized locking is possible if no complex operation + * is either enqueued or processed right now. + * - The test for enqueued complex ops is simple: + * sma->complex_count != 0 + * - Testing for complex ops that are processed right now is + * a bit more difficult. Complex ops acquire the full lock + * and first wait that the running simple ops have completed. + * (see above) + * Thus: If we own a simple lock and the global lock is free + * and complex_count is now 0, then it will stay 0 and + * thus just locking sem->lock is sufficient. + */ + sem = sma->sem_base + sops->sem_num; + if (sma->complex_count == 0) { /* - * Another process is holding the global lock on the - * sem_array; we cannot enter our critical section, - * but have to wait for the global lock to be released. + * It appears that no complex operation is around. + * Acquire the per-semaphore lock. */ - if (unlikely(spin_is_locked(&sma->sem_perm.lock))) { - spin_unlock(&sem->lock); - spin_unlock_wait(&sma->sem_perm.lock); - goto again; + spin_lock(&sem->lock); + + /* Then check that the global lock is free */ + if (!spin_is_locked(&sma->sem_perm.lock)) { + /* spin_is_locked() is not a memory barrier */ + smp_mb(); + + /* Now repeat the test of complex_count: + * It can't change anymore until we drop sem->lock. + * Thus: if is now 0, then it will stay 0. + */ + if (sma->complex_count == 0) { + /* fast path successful! */ + return sops->sem_num; + } } + spin_unlock(&sem->lock); + } - locknum = sops->sem_num; + /* slow path: acquire the full lock */ + ipc_lock_object(&sma->sem_perm); + + if (sma->complex_count == 0) { + /* False alarm: + * There is no complex operation, thus we can switch + * back to the fast path. + */ + spin_lock(&sem->lock); + ipc_unlock_object(&sma->sem_perm); + return sops->sem_num; } else { - int i; - /* - * Lock the semaphore array, and wait for all of the - * individual semaphore locks to go away. The code - * above ensures no new single-lock holders will enter - * their critical section while the array lock is held. + /* Not a false alarm, thus complete the sequence for a + * full lock. */ - lock_array: - ipc_lock_object(&sma->sem_perm); - for (i = 0; i < sma->sem_nsems; i++) { - struct sem *sem = sma->sem_base + i; - spin_unlock_wait(&sem->lock); - } - locknum = -1; + sem_wait_array(sma); + return -1; } - return locknum; } static inline void sem_unlock(struct sem_array *sma, int locknum) @@ -322,7 +373,7 @@ static inline void sem_unlock(struct sem_array *sma, int locknum) } /* - * sem_lock_(check_) routines are called in the paths where the rw_mutex + * sem_lock_(check_) routines are called in the paths where the rwsem * is not held. * * The caller holds the RCU read lock. @@ -374,12 +425,7 @@ static inline struct sem_array *sem_obtain_object_check(struct ipc_namespace *ns static inline void sem_lock_and_putref(struct sem_array *sma) { sem_lock(sma, NULL, -1); - ipc_rcu_putref(sma); -} - -static inline void sem_putref(struct sem_array *sma) -{ - ipc_rcu_putref(sma); + ipc_rcu_putref(sma, ipc_rcu_free); } static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) @@ -426,7 +472,7 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) * @ns: namespace * @params: ptr to the structure that contains key, semflg and nsems * - * Called with sem_ids.rw_mutex held (as a writer) + * Called with sem_ids.rwsem held (as a writer) */ static int newary(struct ipc_namespace *ns, struct ipc_params *params) @@ -458,14 +504,13 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) sma->sem_perm.security = NULL; retval = security_sem_alloc(sma); if (retval) { - ipc_rcu_putref(sma); + ipc_rcu_putref(sma, ipc_rcu_free); return retval; } id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni); if (id < 0) { - security_sem_free(sma); - ipc_rcu_putref(sma); + ipc_rcu_putref(sma, sem_rcu_free); return id; } ns->used_sems += nsems; @@ -492,7 +537,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) /* - * Called with sem_ids.rw_mutex and ipcp locked. + * Called with sem_ids.rwsem and ipcp locked. */ static inline int sem_security(struct kern_ipc_perm *ipcp, int semflg) { @@ -503,7 +548,7 @@ static inline int sem_security(struct kern_ipc_perm *ipcp, int semflg) } /* - * Called with sem_ids.rw_mutex and ipcp locked. + * Called with sem_ids.rwsem and ipcp locked. */ static inline int sem_more_checks(struct kern_ipc_perm *ipcp, struct ipc_params *params) @@ -873,6 +918,24 @@ again: } /** + * set_semotime(sma, sops) - set sem_otime + * @sma: semaphore array + * @sops: operations that modified the array, may be NULL + * + * sem_otime is replicated to avoid cache line trashing. + * This function sets one instance to the current time. + */ +static void set_semotime(struct sem_array *sma, struct sembuf *sops) +{ + if (sops == NULL) { + sma->sem_base[0].sem_otime = get_seconds(); + } else { + sma->sem_base[sops[0].sem_num].sem_otime = + get_seconds(); + } +} + +/** * do_smart_update(sma, sops, nsops, otime, pt) - optimized update_queue * @sma: semaphore array * @sops: operations that were performed @@ -922,17 +985,10 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop } } } - if (otime) { - if (sops == NULL) { - sma->sem_base[0].sem_otime = get_seconds(); - } else { - sma->sem_base[sops[0].sem_num].sem_otime = - get_seconds(); - } - } + if (otime) + set_semotime(sma, sops); } - /* The following counts are associated to each semaphore: * semncnt number of tasks waiting on semval being nonzero * semzcnt number of tasks waiting on semval being zero @@ -994,8 +1050,8 @@ static int count_semzcnt (struct sem_array * sma, ushort semnum) return semzcnt; } -/* Free a semaphore set. freeary() is called with sem_ids.rw_mutex locked - * as a writer and the spinlock for this semaphore set hold. sem_ids.rw_mutex +/* Free a semaphore set. freeary() is called with sem_ids.rwsem locked + * as a writer and the spinlock for this semaphore set hold. sem_ids.rwsem * remains locked on exit. */ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) @@ -1047,8 +1103,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) wake_up_sem_queue_do(&tasks); ns->used_sems -= sma->sem_nsems; - security_sem_free(sma); - ipc_rcu_putref(sma); + ipc_rcu_putref(sma, sem_rcu_free); } static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in, int version) @@ -1116,7 +1171,7 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid, seminfo.semmnu = SEMMNU; seminfo.semmap = SEMMAP; seminfo.semume = SEMUME; - down_read(&sem_ids(ns).rw_mutex); + down_read(&sem_ids(ns).rwsem); if (cmd == SEM_INFO) { seminfo.semusz = sem_ids(ns).in_use; seminfo.semaem = ns->used_sems; @@ -1125,7 +1180,7 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid, seminfo.semaem = SEMAEM; } max_id = ipc_get_maxid(&sem_ids(ns)); - up_read(&sem_ids(ns).rw_mutex); + up_read(&sem_ids(ns).rwsem); if (copy_to_user(p, &seminfo, sizeof(struct seminfo))) return -EFAULT; return (max_id < 0) ? 0: max_id; @@ -1227,6 +1282,12 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, sem_lock(sma, NULL, -1); + if (sma->sem_perm.deleted) { + sem_unlock(sma, -1); + rcu_read_unlock(); + return -EIDRM; + } + curr = &sma->sem_base[semnum]; ipc_assert_locked_object(&sma->sem_perm); @@ -1281,28 +1342,28 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, int i; sem_lock(sma, NULL, -1); + if (sma->sem_perm.deleted) { + err = -EIDRM; + goto out_unlock; + } if(nsems > SEMMSL_FAST) { if (!ipc_rcu_getref(sma)) { - sem_unlock(sma, -1); - rcu_read_unlock(); err = -EIDRM; - goto out_free; + goto out_unlock; } sem_unlock(sma, -1); rcu_read_unlock(); sem_io = ipc_alloc(sizeof(ushort)*nsems); if(sem_io == NULL) { - sem_putref(sma); + ipc_rcu_putref(sma, ipc_rcu_free); return -ENOMEM; } rcu_read_lock(); sem_lock_and_putref(sma); if (sma->sem_perm.deleted) { - sem_unlock(sma, -1); - rcu_read_unlock(); err = -EIDRM; - goto out_free; + goto out_unlock; } } for (i = 0; i < sma->sem_nsems; i++) @@ -1320,28 +1381,28 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, struct sem_undo *un; if (!ipc_rcu_getref(sma)) { - rcu_read_unlock(); - return -EIDRM; + err = -EIDRM; + goto out_rcu_wakeup; } rcu_read_unlock(); if(nsems > SEMMSL_FAST) { sem_io = ipc_alloc(sizeof(ushort)*nsems); if(sem_io == NULL) { - sem_putref(sma); + ipc_rcu_putref(sma, ipc_rcu_free); return -ENOMEM; } } if (copy_from_user (sem_io, p, nsems*sizeof(ushort))) { - sem_putref(sma); + ipc_rcu_putref(sma, ipc_rcu_free); err = -EFAULT; goto out_free; } for (i = 0; i < nsems; i++) { if (sem_io[i] > SEMVMX) { - sem_putref(sma); + ipc_rcu_putref(sma, ipc_rcu_free); err = -ERANGE; goto out_free; } @@ -1349,10 +1410,8 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, rcu_read_lock(); sem_lock_and_putref(sma); if (sma->sem_perm.deleted) { - sem_unlock(sma, -1); - rcu_read_unlock(); err = -EIDRM; - goto out_free; + goto out_unlock; } for (i = 0; i < nsems; i++) @@ -1376,6 +1435,10 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, goto out_rcu_wakeup; sem_lock(sma, NULL, -1); + if (sma->sem_perm.deleted) { + err = -EIDRM; + goto out_unlock; + } curr = &sma->sem_base[semnum]; switch (cmd) { @@ -1431,9 +1494,9 @@ copy_semid_from_user(struct semid64_ds *out, void __user *buf, int version) } /* - * This function handles some semctl commands which require the rw_mutex + * This function handles some semctl commands which require the rwsem * to be held in write mode. - * NOTE: no locks must be held, the rw_mutex is taken inside this function. + * NOTE: no locks must be held, the rwsem is taken inside this function. */ static int semctl_down(struct ipc_namespace *ns, int semid, int cmd, int version, void __user *p) @@ -1448,7 +1511,7 @@ static int semctl_down(struct ipc_namespace *ns, int semid, return -EFAULT; } - down_write(&sem_ids(ns).rw_mutex); + down_write(&sem_ids(ns).rwsem); rcu_read_lock(); ipcp = ipcctl_pre_down_nolock(ns, &sem_ids(ns), semid, cmd, @@ -1487,7 +1550,7 @@ out_unlock0: out_unlock1: rcu_read_unlock(); out_up: - up_write(&sem_ids(ns).rw_mutex); + up_write(&sem_ids(ns).rwsem); return err; } @@ -1629,7 +1692,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) /* step 2: allocate new undo structure */ new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL); if (!new) { - sem_putref(sma); + ipc_rcu_putref(sma, ipc_rcu_free); return ERR_PTR(-ENOMEM); } @@ -1781,6 +1844,10 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, if (error) goto out_rcu_wakeup; + error = -EIDRM; + locknum = sem_lock(sma, sops, nsops); + if (sma->sem_perm.deleted) + goto out_unlock_free; /* * semid identifiers are not unique - find_alloc_undo may have * allocated an undo structure, it was invalidated by an RMID @@ -1788,19 +1855,22 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, * This case can be detected checking un->semid. The existence of * "un" itself is guaranteed by rcu. */ - error = -EIDRM; - locknum = sem_lock(sma, sops, nsops); if (un && un->semid == -1) goto out_unlock_free; error = perform_atomic_semop(sma, sops, nsops, un, task_tgid_vnr(current)); - if (error <= 0) { - if (alter && error == 0) + if (error == 0) { + /* If the operation was successful, then do + * the required updates. + */ + if (alter) do_smart_update(sma, sops, nsops, 1, &tasks); - - goto out_unlock_free; + else + set_semotime(sma, sops); } + if (error <= 0) + goto out_unlock_free; /* We need to sleep on this operation, so we put the current * task into the pending queue and go to sleep. @@ -1997,6 +2067,12 @@ void exit_sem(struct task_struct *tsk) } sem_lock(sma, NULL, -1); + /* exit_sem raced with IPC_RMID, nothing to do */ + if (sma->sem_perm.deleted) { + sem_unlock(sma, -1); + rcu_read_unlock(); + continue; + } un = __lookup_undo(ulp, semid); if (un == NULL) { /* exit_sem raced with IPC_RMID+semget() that created @@ -2059,6 +2135,14 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it) struct sem_array *sma = it; time_t sem_otime; + /* + * The proc interface isn't aware of sem_lock(), it calls + * ipc_lock_object() directly (in sysvipc_find_ipc). + * In order to stay compatible with sem_lock(), we must wait until + * all simple semop() calls have left their critical regions. + */ + sem_wait_array(sma); + sem_otime = get_semotime(sma); return seq_printf(s, @@ -19,6 +19,9 @@ * namespaces support * OpenVZ, SWsoft Inc. * Pavel Emelianov <xemul@openvz.org> + * + * Better ipc lock (kern_ipc_perm.lock) handling + * Davidlohr Bueso <davidlohr.bueso@hp.com>, June 2013. */ #include <linux/slab.h> @@ -80,8 +83,8 @@ void shm_init_ns(struct ipc_namespace *ns) } /* - * Called with shm_ids.rw_mutex (writer) and the shp structure locked. - * Only shm_ids.rw_mutex remains locked on exit. + * Called with shm_ids.rwsem (writer) and the shp structure locked. + * Only shm_ids.rwsem remains locked on exit. */ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) { @@ -124,8 +127,28 @@ void __init shm_init (void) IPC_SHM_IDS, sysvipc_shm_proc_show); } +static inline struct shmid_kernel *shm_obtain_object(struct ipc_namespace *ns, int id) +{ + struct kern_ipc_perm *ipcp = ipc_obtain_object(&shm_ids(ns), id); + + if (IS_ERR(ipcp)) + return ERR_CAST(ipcp); + + return container_of(ipcp, struct shmid_kernel, shm_perm); +} + +static inline struct shmid_kernel *shm_obtain_object_check(struct ipc_namespace *ns, int id) +{ + struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&shm_ids(ns), id); + + if (IS_ERR(ipcp)) + return ERR_CAST(ipcp); + + return container_of(ipcp, struct shmid_kernel, shm_perm); +} + /* - * shm_lock_(check_) routines are called in the paths where the rw_mutex + * shm_lock_(check_) routines are called in the paths where the rwsem * is not necessarily held. */ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) @@ -144,15 +167,13 @@ static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp) ipc_lock_object(&ipcp->shm_perm); } -static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns, - int id) +static void shm_rcu_free(struct rcu_head *head) { - struct kern_ipc_perm *ipcp = ipc_lock_check(&shm_ids(ns), id); - - if (IS_ERR(ipcp)) - return (struct shmid_kernel *)ipcp; + struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu); + struct shmid_kernel *shp = ipc_rcu_to_struct(p); - return container_of(ipcp, struct shmid_kernel, shm_perm); + security_shm_free(shp); + ipc_rcu_free(head); } static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) @@ -182,7 +203,7 @@ static void shm_open(struct vm_area_struct *vma) * @ns: namespace * @shp: struct to free * - * It has to be called with shp and shm_ids.rw_mutex (writer) locked, + * It has to be called with shp and shm_ids.rwsem (writer) locked, * but returns with shp unlocked and freed. */ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) @@ -196,8 +217,7 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) user_shm_unlock(file_inode(shp->shm_file)->i_size, shp->mlock_user); fput (shp->shm_file); - security_shm_free(shp); - ipc_rcu_putref(shp); + ipc_rcu_putref(shp, shm_rcu_free); } /* @@ -230,7 +250,7 @@ static void shm_close(struct vm_area_struct *vma) struct shmid_kernel *shp; struct ipc_namespace *ns = sfd->ns; - down_write(&shm_ids(ns).rw_mutex); + down_write(&shm_ids(ns).rwsem); /* remove from the list of attaches of the shm segment */ shp = shm_lock(ns, sfd->id); BUG_ON(IS_ERR(shp)); @@ -241,10 +261,10 @@ static void shm_close(struct vm_area_struct *vma) shm_destroy(ns, shp); else shm_unlock(shp); - up_write(&shm_ids(ns).rw_mutex); + up_write(&shm_ids(ns).rwsem); } -/* Called with ns->shm_ids(ns).rw_mutex locked */ +/* Called with ns->shm_ids(ns).rwsem locked */ static int shm_try_destroy_current(int id, void *p, void *data) { struct ipc_namespace *ns = data; @@ -275,7 +295,7 @@ static int shm_try_destroy_current(int id, void *p, void *data) return 0; } -/* Called with ns->shm_ids(ns).rw_mutex locked */ +/* Called with ns->shm_ids(ns).rwsem locked */ static int shm_try_destroy_orphaned(int id, void *p, void *data) { struct ipc_namespace *ns = data; @@ -286,7 +306,7 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data) * We want to destroy segments without users and with already * exit'ed originating process. * - * As shp->* are changed under rw_mutex, it's safe to skip shp locking. + * As shp->* are changed under rwsem, it's safe to skip shp locking. */ if (shp->shm_creator != NULL) return 0; @@ -300,10 +320,10 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data) void shm_destroy_orphaned(struct ipc_namespace *ns) { - down_write(&shm_ids(ns).rw_mutex); + down_write(&shm_ids(ns).rwsem); if (shm_ids(ns).in_use) idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns); - up_write(&shm_ids(ns).rw_mutex); + up_write(&shm_ids(ns).rwsem); } @@ -315,10 +335,10 @@ void exit_shm(struct task_struct *task) return; /* Destroy all already created segments, but not mapped yet */ - down_write(&shm_ids(ns).rw_mutex); + down_write(&shm_ids(ns).rwsem); if (shm_ids(ns).in_use) idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns); - up_write(&shm_ids(ns).rw_mutex); + up_write(&shm_ids(ns).rwsem); } static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -452,7 +472,7 @@ static const struct vm_operations_struct shm_vm_ops = { * @ns: namespace * @params: ptr to the structure that contains key, size and shmflg * - * Called with shm_ids.rw_mutex held as a writer. + * Called with shm_ids.rwsem held as a writer. */ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) @@ -485,7 +505,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) shp->shm_perm.security = NULL; error = security_shm_alloc(shp); if (error) { - ipc_rcu_putref(shp); + ipc_rcu_putref(shp, ipc_rcu_free); return error; } @@ -554,13 +574,12 @@ no_id: user_shm_unlock(size, shp->mlock_user); fput(file); no_file: - security_shm_free(shp); - ipc_rcu_putref(shp); + ipc_rcu_putref(shp, shm_rcu_free); return error; } /* - * Called with shm_ids.rw_mutex and ipcp locked. + * Called with shm_ids.rwsem and ipcp locked. */ static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg) { @@ -571,7 +590,7 @@ static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg) } /* - * Called with shm_ids.rw_mutex and ipcp locked. + * Called with shm_ids.rwsem and ipcp locked. */ static inline int shm_more_checks(struct kern_ipc_perm *ipcp, struct ipc_params *params) @@ -684,7 +703,7 @@ static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminf /* * Calculate and add used RSS and swap pages of a shm. - * Called with shm_ids.rw_mutex held as a reader + * Called with shm_ids.rwsem held as a reader */ static void shm_add_rss_swap(struct shmid_kernel *shp, unsigned long *rss_add, unsigned long *swp_add) @@ -711,7 +730,7 @@ static void shm_add_rss_swap(struct shmid_kernel *shp, } /* - * Called with shm_ids.rw_mutex held as a reader + * Called with shm_ids.rwsem held as a reader */ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, unsigned long *swp) @@ -740,9 +759,9 @@ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, } /* - * This function handles some shmctl commands which require the rw_mutex + * This function handles some shmctl commands which require the rwsem * to be held in write mode. - * NOTE: no locks must be held, the rw_mutex is taken inside this function. + * NOTE: no locks must be held, the rwsem is taken inside this function. */ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, struct shmid_ds __user *buf, int version) @@ -757,14 +776,13 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, return -EFAULT; } - down_write(&shm_ids(ns).rw_mutex); + down_write(&shm_ids(ns).rwsem); rcu_read_lock(); - ipcp = ipcctl_pre_down(ns, &shm_ids(ns), shmid, cmd, - &shmid64.shm_perm, 0); + ipcp = ipcctl_pre_down_nolock(ns, &shm_ids(ns), shmid, cmd, + &shmid64.shm_perm, 0); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); - /* the ipc lock is not held upon failure */ goto out_unlock1; } @@ -772,14 +790,16 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, err = security_shm_shmctl(shp, cmd); if (err) - goto out_unlock0; + goto out_unlock1; switch (cmd) { case IPC_RMID: + ipc_lock_object(&shp->shm_perm); /* do_shm_rmid unlocks the ipc object and rcu */ do_shm_rmid(ns, ipcp); goto out_up; case IPC_SET: + ipc_lock_object(&shp->shm_perm); err = ipc_update_perm(&shmid64.shm_perm, ipcp); if (err) goto out_unlock0; @@ -787,6 +807,7 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, break; default: err = -EINVAL; + goto out_unlock1; } out_unlock0: @@ -794,33 +815,28 @@ out_unlock0: out_unlock1: rcu_read_unlock(); out_up: - up_write(&shm_ids(ns).rw_mutex); + up_write(&shm_ids(ns).rwsem); return err; } -SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) +static int shmctl_nolock(struct ipc_namespace *ns, int shmid, + int cmd, int version, void __user *buf) { + int err; struct shmid_kernel *shp; - int err, version; - struct ipc_namespace *ns; - if (cmd < 0 || shmid < 0) { - err = -EINVAL; - goto out; + /* preliminary security checks for *_INFO */ + if (cmd == IPC_INFO || cmd == SHM_INFO) { + err = security_shm_shmctl(NULL, cmd); + if (err) + return err; } - version = ipc_parse_version(&cmd); - ns = current->nsproxy->ipc_ns; - - switch (cmd) { /* replace with proc interface ? */ + switch (cmd) { case IPC_INFO: { struct shminfo64 shminfo; - err = security_shm_shmctl(NULL, cmd); - if (err) - return err; - memset(&shminfo, 0, sizeof(shminfo)); shminfo.shmmni = shminfo.shmseg = ns->shm_ctlmni; shminfo.shmmax = ns->shm_ctlmax; @@ -830,9 +846,9 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) if(copy_shminfo_to_user (buf, &shminfo, version)) return -EFAULT; - down_read(&shm_ids(ns).rw_mutex); + down_read(&shm_ids(ns).rwsem); err = ipc_get_maxid(&shm_ids(ns)); - up_read(&shm_ids(ns).rw_mutex); + up_read(&shm_ids(ns).rwsem); if(err<0) err = 0; @@ -842,19 +858,15 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) { struct shm_info shm_info; - err = security_shm_shmctl(NULL, cmd); - if (err) - return err; - memset(&shm_info, 0, sizeof(shm_info)); - down_read(&shm_ids(ns).rw_mutex); + down_read(&shm_ids(ns).rwsem); shm_info.used_ids = shm_ids(ns).in_use; shm_get_stat (ns, &shm_info.shm_rss, &shm_info.shm_swp); shm_info.shm_tot = ns->shm_tot; shm_info.swap_attempts = 0; shm_info.swap_successes = 0; err = ipc_get_maxid(&shm_ids(ns)); - up_read(&shm_ids(ns).rw_mutex); + up_read(&shm_ids(ns).rwsem); if (copy_to_user(buf, &shm_info, sizeof(shm_info))) { err = -EFAULT; goto out; @@ -869,27 +881,31 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) struct shmid64_ds tbuf; int result; + rcu_read_lock(); if (cmd == SHM_STAT) { - shp = shm_lock(ns, shmid); + shp = shm_obtain_object(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); - goto out; + goto out_unlock; } result = shp->shm_perm.id; } else { - shp = shm_lock_check(ns, shmid); + shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); - goto out; + goto out_unlock; } result = 0; } + err = -EACCES; if (ipcperms(ns, &shp->shm_perm, S_IRUGO)) goto out_unlock; + err = security_shm_shmctl(shp, cmd); if (err) goto out_unlock; + memset(&tbuf, 0, sizeof(tbuf)); kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm); tbuf.shm_segsz = shp->shm_segsz; @@ -899,43 +915,76 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) tbuf.shm_cpid = shp->shm_cprid; tbuf.shm_lpid = shp->shm_lprid; tbuf.shm_nattch = shp->shm_nattch; - shm_unlock(shp); - if(copy_shmid_to_user (buf, &tbuf, version)) + rcu_read_unlock(); + + if (copy_shmid_to_user(buf, &tbuf, version)) err = -EFAULT; else err = result; goto out; } + default: + return -EINVAL; + } + +out_unlock: + rcu_read_unlock(); +out: + return err; +} + +SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) +{ + struct shmid_kernel *shp; + int err, version; + struct ipc_namespace *ns; + + if (cmd < 0 || shmid < 0) + return -EINVAL; + + version = ipc_parse_version(&cmd); + ns = current->nsproxy->ipc_ns; + + switch (cmd) { + case IPC_INFO: + case SHM_INFO: + case SHM_STAT: + case IPC_STAT: + return shmctl_nolock(ns, shmid, cmd, version, buf); + case IPC_RMID: + case IPC_SET: + return shmctl_down(ns, shmid, cmd, buf, version); case SHM_LOCK: case SHM_UNLOCK: { struct file *shm_file; - shp = shm_lock_check(ns, shmid); + rcu_read_lock(); + shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); - goto out; + goto out_unlock1; } audit_ipc_obj(&(shp->shm_perm)); + err = security_shm_shmctl(shp, cmd); + if (err) + goto out_unlock1; + ipc_lock_object(&shp->shm_perm); if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) { kuid_t euid = current_euid(); err = -EPERM; if (!uid_eq(euid, shp->shm_perm.uid) && !uid_eq(euid, shp->shm_perm.cuid)) - goto out_unlock; + goto out_unlock0; if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK)) - goto out_unlock; + goto out_unlock0; } - err = security_shm_shmctl(shp, cmd); - if (err) - goto out_unlock; - shm_file = shp->shm_file; if (is_file_hugepages(shm_file)) - goto out_unlock; + goto out_unlock0; if (cmd == SHM_LOCK) { struct user_struct *user = current_user(); @@ -944,32 +993,31 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) shp->shm_perm.mode |= SHM_LOCKED; shp->mlock_user = user; } - goto out_unlock; + goto out_unlock0; } /* SHM_UNLOCK */ if (!(shp->shm_perm.mode & SHM_LOCKED)) - goto out_unlock; + goto out_unlock0; shmem_lock(shm_file, 0, shp->mlock_user); shp->shm_perm.mode &= ~SHM_LOCKED; shp->mlock_user = NULL; get_file(shm_file); - shm_unlock(shp); + ipc_unlock_object(&shp->shm_perm); + rcu_read_unlock(); shmem_unlock_mapping(shm_file->f_mapping); + fput(shm_file); - goto out; - } - case IPC_RMID: - case IPC_SET: - err = shmctl_down(ns, shmid, cmd, buf, version); return err; + } default: return -EINVAL; } -out_unlock: - shm_unlock(shp); -out: +out_unlock0: + ipc_unlock_object(&shp->shm_perm); +out_unlock1: + rcu_read_unlock(); return err; } @@ -1037,10 +1085,11 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, * additional creator id... */ ns = current->nsproxy->ipc_ns; - shp = shm_lock_check(ns, shmid); + rcu_read_lock(); + shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); - goto out; + goto out_unlock; } err = -EACCES; @@ -1051,24 +1100,31 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, if (err) goto out_unlock; + ipc_lock_object(&shp->shm_perm); path = shp->shm_file->f_path; path_get(&path); shp->shm_nattch++; size = i_size_read(path.dentry->d_inode); - shm_unlock(shp); + ipc_unlock_object(&shp->shm_perm); + rcu_read_unlock(); err = -ENOMEM; sfd = kzalloc(sizeof(*sfd), GFP_KERNEL); - if (!sfd) - goto out_put_dentry; + if (!sfd) { + path_put(&path); + goto out_nattch; + } file = alloc_file(&path, f_mode, is_file_hugepages(shp->shm_file) ? &shm_file_operations_huge : &shm_file_operations); err = PTR_ERR(file); - if (IS_ERR(file)) - goto out_free; + if (IS_ERR(file)) { + kfree(sfd); + path_put(&path); + goto out_nattch; + } file->private_data = sfd; file->f_mapping = shp->shm_file->f_mapping; @@ -1094,7 +1150,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, addr > current->mm->start_stack - size - PAGE_SIZE * 5) goto invalid; } - + addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate); *raddr = addr; err = 0; @@ -1109,7 +1165,7 @@ out_fput: fput(file); out_nattch: - down_write(&shm_ids(ns).rw_mutex); + down_write(&shm_ids(ns).rwsem); shp = shm_lock(ns, shmid); BUG_ON(IS_ERR(shp)); shp->shm_nattch--; @@ -1117,20 +1173,13 @@ out_nattch: shm_destroy(ns, shp); else shm_unlock(shp); - up_write(&shm_ids(ns).rw_mutex); - -out: + up_write(&shm_ids(ns).rwsem); return err; out_unlock: - shm_unlock(shp); - goto out; - -out_free: - kfree(sfd); -out_put_dentry: - path_put(&path); - goto out_nattch; + rcu_read_unlock(); +out: + return err; } SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg) @@ -1235,8 +1284,7 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) #else /* CONFIG_MMU */ /* under NOMMU conditions, the exact address to be destroyed must be * given */ - retval = -EINVAL; - if (vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { + if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); retval = 0; } @@ -15,6 +15,29 @@ * Jun 2006 - namespaces ssupport * OpenVZ, SWsoft Inc. * Pavel Emelianov <xemul@openvz.org> + * + * General sysv ipc locking scheme: + * rcu_read_lock() + * obtain the ipc object (kern_ipc_perm) by looking up the id in an idr + * tree. + * - perform initial checks (capabilities, auditing and permission, + * etc). + * - perform read-only operations, such as STAT, INFO commands. + * acquire the ipc lock (kern_ipc_perm.lock) through + * ipc_lock_object() + * - perform data updates, such as SET, RMID commands and + * mechanism-specific operations (semop/semtimedop, + * msgsnd/msgrcv, shmat/shmdt). + * drop the ipc lock, through ipc_unlock_object(). + * rcu_read_unlock() + * + * The ids->rwsem must be taken when: + * - creating, removing and iterating the existing entries in ipc + * identifier sets. + * - iterating through files under /proc/sysvipc/ + * + * Note that sems have a special fast path that avoids kern_ipc_perm.lock - + * see sem_lock(). */ #include <linux/mm.h> @@ -119,7 +142,7 @@ __initcall(ipc_init); void ipc_init_ids(struct ipc_ids *ids) { - init_rwsem(&ids->rw_mutex); + init_rwsem(&ids->rwsem); ids->in_use = 0; ids->seq = 0; @@ -174,7 +197,7 @@ void __init ipc_init_proc_interface(const char *path, const char *header, * @ids: Identifier set * @key: The key to find * - * Requires ipc_ids.rw_mutex locked. + * Requires ipc_ids.rwsem locked. * Returns the LOCKED pointer to the ipc structure if found or NULL * if not. * If key is found ipc points to the owning ipc structure @@ -197,7 +220,8 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) continue; } - ipc_lock_by_ptr(ipc); + rcu_read_lock(); + ipc_lock_object(ipc); return ipc; } @@ -208,7 +232,7 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) * ipc_get_maxid - get the last assigned id * @ids: IPC identifier set * - * Called with ipc_ids.rw_mutex held. + * Called with ipc_ids.rwsem held. */ int ipc_get_maxid(struct ipc_ids *ids) @@ -246,7 +270,7 @@ int ipc_get_maxid(struct ipc_ids *ids) * is returned. The 'new' entry is returned in a locked state on success. * On failure the entry is not locked and a negative err-code is returned. * - * Called with writer ipc_ids.rw_mutex held. + * Called with writer ipc_ids.rwsem held. */ int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size) { @@ -312,9 +336,9 @@ static int ipcget_new(struct ipc_namespace *ns, struct ipc_ids *ids, { int err; - down_write(&ids->rw_mutex); + down_write(&ids->rwsem); err = ops->getnew(ns, params); - up_write(&ids->rw_mutex); + up_write(&ids->rwsem); return err; } @@ -331,7 +355,7 @@ static int ipcget_new(struct ipc_namespace *ns, struct ipc_ids *ids, * * On success, the IPC id is returned. * - * It is called with ipc_ids.rw_mutex and ipcp->lock held. + * It is called with ipc_ids.rwsem and ipcp->lock held. */ static int ipc_check_perms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, @@ -376,7 +400,7 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, * Take the lock as a writer since we are potentially going to add * a new entry + read locks are not "upgradable" */ - down_write(&ids->rw_mutex); + down_write(&ids->rwsem); ipcp = ipc_findkey(ids, params->key); if (ipcp == NULL) { /* key not used */ @@ -402,7 +426,7 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, } ipc_unlock(ipcp); } - up_write(&ids->rw_mutex); + up_write(&ids->rwsem); return err; } @@ -413,7 +437,7 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, * @ids: IPC identifier set * @ipcp: ipc perm structure containing the identifier to remove * - * ipc_ids.rw_mutex (as a writer) and the spinlock for this ID are held + * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held * before this function is called, and remain locked on the exit. */ @@ -465,11 +489,6 @@ void ipc_free(void* ptr, int size) kfree(ptr); } -struct ipc_rcu { - struct rcu_head rcu; - atomic_t refcount; -} ____cacheline_aligned_in_smp; - /** * ipc_rcu_alloc - allocate ipc and rcu space * @size: size desired @@ -496,27 +515,24 @@ int ipc_rcu_getref(void *ptr) return atomic_inc_not_zero(&p->refcount); } -/** - * ipc_schedule_free - free ipc + rcu space - * @head: RCU callback structure for queued work - */ -static void ipc_schedule_free(struct rcu_head *head) -{ - vfree(container_of(head, struct ipc_rcu, rcu)); -} - -void ipc_rcu_putref(void *ptr) +void ipc_rcu_putref(void *ptr, void (*func)(struct rcu_head *head)) { struct ipc_rcu *p = ((struct ipc_rcu *)ptr) - 1; if (!atomic_dec_and_test(&p->refcount)) return; - if (is_vmalloc_addr(ptr)) { - call_rcu(&p->rcu, ipc_schedule_free); - } else { - kfree_rcu(p, rcu); - } + call_rcu(&p->rcu, func); +} + +void ipc_rcu_free(struct rcu_head *head) +{ + struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu); + + if (is_vmalloc_addr(p)) + vfree(p); + else + kfree(p); } /** @@ -621,7 +637,7 @@ struct kern_ipc_perm *ipc_obtain_object(struct ipc_ids *ids, int id) } /** - * ipc_lock - Lock an ipc structure without rw_mutex held + * ipc_lock - Lock an ipc structure without rwsem held * @ids: IPC identifier set * @id: ipc id to look for * @@ -677,22 +693,6 @@ out: return out; } -struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id) -{ - struct kern_ipc_perm *out; - - out = ipc_lock(ids, id); - if (IS_ERR(out)) - return out; - - if (ipc_checkid(out, id)) { - ipc_unlock(out); - return ERR_PTR(-EIDRM); - } - - return out; -} - /** * ipcget - Common sys_*get() code * @ns : namsepace @@ -733,7 +733,7 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out) } /** - * ipcctl_pre_down - retrieve an ipc and check permissions for some IPC_XXX cmd + * ipcctl_pre_down_nolock - retrieve an ipc and check permissions for some IPC_XXX cmd * @ns: the ipc namespace * @ids: the table of ids where to look for the ipc * @id: the id of the ipc to retrieve @@ -746,29 +746,13 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out) * It must be called without any lock held and * - retrieves the ipc with the given id in the given table. * - performs some audit and permission check, depending on the given cmd - * - returns the ipc with the ipc lock held in case of success - * or an err-code without any lock held otherwise. + * - returns a pointer to the ipc object or otherwise, the corresponding error. * - * Call holding the both the rw_mutex and the rcu read lock. + * Call holding the both the rwsem and the rcu read lock. */ -struct kern_ipc_perm *ipcctl_pre_down(struct ipc_namespace *ns, - struct ipc_ids *ids, int id, int cmd, - struct ipc64_perm *perm, int extra_perm) -{ - struct kern_ipc_perm *ipcp; - - ipcp = ipcctl_pre_down_nolock(ns, ids, id, cmd, perm, extra_perm); - if (IS_ERR(ipcp)) - goto out; - - spin_lock(&ipcp->lock); -out: - return ipcp; -} - struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns, - struct ipc_ids *ids, int id, int cmd, - struct ipc64_perm *perm, int extra_perm) + struct ipc_ids *ids, int id, int cmd, + struct ipc64_perm *perm, int extra_perm) { kuid_t euid; int err = -EPERM; @@ -846,7 +830,8 @@ static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos, ipc = idr_find(&ids->ipcs_idr, pos); if (ipc != NULL) { *new_pos = pos + 1; - ipc_lock_by_ptr(ipc); + rcu_read_lock(); + ipc_lock_object(ipc); return ipc; } } @@ -884,7 +869,7 @@ static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos) * Take the lock - this will be released by the corresponding * call to stop(). */ - down_read(&ids->rw_mutex); + down_read(&ids->rwsem); /* pos < 0 is invalid */ if (*pos < 0) @@ -911,7 +896,7 @@ static void sysvipc_proc_stop(struct seq_file *s, void *it) ids = &iter->ns->ids[iface->ids]; /* Release the lock we took in start() */ - up_read(&ids->rw_mutex); + up_read(&ids->rwsem); } static int sysvipc_proc_show(struct seq_file *s, void *it) @@ -47,6 +47,13 @@ static inline void msg_exit_ns(struct ipc_namespace *ns) { } static inline void shm_exit_ns(struct ipc_namespace *ns) { } #endif +struct ipc_rcu { + struct rcu_head rcu; + atomic_t refcount; +} ____cacheline_aligned_in_smp; + +#define ipc_rcu_to_struct(p) ((void *)(p+1)) + /* * Structure that holds the parameters needed by the ipc operations * (see after) @@ -94,10 +101,10 @@ void __init ipc_init_proc_interface(const char *path, const char *header, #define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER) #define ipcid_to_seqx(id) ((id) / SEQ_MULTIPLIER) -/* must be called with ids->rw_mutex acquired for writing */ +/* must be called with ids->rwsem acquired for writing */ int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int); -/* must be called with ids->rw_mutex acquired for reading */ +/* must be called with ids->rwsem acquired for reading */ int ipc_get_maxid(struct ipc_ids *); /* must be called with both locks acquired. */ @@ -120,7 +127,8 @@ void ipc_free(void* ptr, int size); */ void* ipc_rcu_alloc(int size); int ipc_rcu_getref(void *ptr); -void ipc_rcu_putref(void *ptr); +void ipc_rcu_putref(void *ptr, void (*func)(struct rcu_head *head)); +void ipc_rcu_free(struct rcu_head *head); struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int); struct kern_ipc_perm *ipc_obtain_object(struct ipc_ids *ids, int id); @@ -131,9 +139,6 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out); struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns, struct ipc_ids *ids, int id, int cmd, struct ipc64_perm *perm, int extra_perm); -struct kern_ipc_perm *ipcctl_pre_down(struct ipc_namespace *ns, - struct ipc_ids *ids, int id, int cmd, - struct ipc64_perm *perm, int extra_perm); #ifndef CONFIG_ARCH_WANT_IPC_PARSE_VERSION /* On IA-64, we always use the "64-bit version" of the IPC structures. */ @@ -174,19 +179,12 @@ static inline void ipc_assert_locked_object(struct kern_ipc_perm *perm) assert_spin_locked(&perm->lock); } -static inline void ipc_lock_by_ptr(struct kern_ipc_perm *perm) -{ - rcu_read_lock(); - ipc_lock_object(perm); -} - static inline void ipc_unlock(struct kern_ipc_perm *perm) { ipc_unlock_object(perm); rcu_read_unlock(); } -struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id); struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id); int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids, struct ipc_ops *ops, struct ipc_params *params); |