From f9f0a7d0dcbd19e9705e8b96a4b408f035e25c93 Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Mon, 8 Jul 2013 15:59:35 -0700 Subject: drivers/dma/iop-adma.c: fix new warnings The recent "drivers/dma: remove unused support for MEMSET operations" change has fallout from lack of build testing by the author. This fixes: drivers/dma/iop-adma.c:1020:13: warning: unused variable 'dma_addr' [-Wunused-variable] drivers/dma/iop-adma.c:1519:2: warning: format '%s' expects a matching 'char *' argument [-Wformat=] Signed-off-by: Olof Johansson Cc: Bartlomiej Zolnierkiewicz Cc: Kyungmin Park Cc: Sebastian Hesselbarth Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c index c9cc08c..cc727ec 100644 --- a/drivers/dma/iop-adma.c +++ b/drivers/dma/iop-adma.c @@ -1017,7 +1017,7 @@ iop_adma_xor_val_self_test(struct iop_adma_device *device) struct page *xor_srcs[IOP_ADMA_NUM_SRC_TEST]; struct page *zero_sum_srcs[IOP_ADMA_NUM_SRC_TEST + 1]; dma_addr_t dma_srcs[IOP_ADMA_NUM_SRC_TEST + 1]; - dma_addr_t dma_addr, dest_dma; + dma_addr_t dest_dma; struct dma_async_tx_descriptor *tx; struct dma_chan *dma_chan; dma_cookie_t cookie; @@ -1516,7 +1516,7 @@ static int iop_adma_probe(struct platform_device *pdev) goto err_free_iop_chan; } - dev_info(&pdev->dev, "Intel(R) IOP: ( %s%s%s%s%s%s%s)\n", + dev_info(&pdev->dev, "Intel(R) IOP: ( %s%s%s%s%s%s)\n", dma_has_cap(DMA_PQ, dma_dev->cap_mask) ? "pq " : "", dma_has_cap(DMA_PQ_VAL, dma_dev->cap_mask) ? "pq_val " : "", dma_has_cap(DMA_XOR, dma_dev->cap_mask) ? 
"xor " : "", -- cgit v0.10.2 From 79f6530cb59e2a0af6953742a33cc29e98ca631c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 8 Jul 2013 15:59:36 -0700 Subject: audit: fix mq_open and mq_unlink to add the MQ root as a hidden parent audit_names record The old audit PATH records for mq_open looked like this: type=PATH msg=audit(1366282323.982:869): item=1 name=(null) inode=6777 dev=00:0c mode=041777 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:tmpfs_t:s15:c0.c1023 type=PATH msg=audit(1366282323.982:869): item=0 name="test_mq" inode=26732 dev=00:0c mode=0100700 ouid=0 ogid=0 rdev=00:00 obj=staff_u:object_r:user_tmpfs_t:s15:c0.c1023 ...with the audit related changes that went into 3.7, they now look like this: type=PATH msg=audit(1366282236.776:3606): item=2 name=(null) inode=66655 dev=00:0c mode=0100700 ouid=0 ogid=0 rdev=00:00 obj=staff_u:object_r:user_tmpfs_t:s15:c0.c1023 type=PATH msg=audit(1366282236.776:3606): item=1 name=(null) inode=6926 dev=00:0c mode=041777 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:tmpfs_t:s15:c0.c1023 type=PATH msg=audit(1366282236.776:3606): item=0 name="test_mq" Both of these look wrong to me. As Steve Grubb pointed out: "What we need is 1 PATH record that identifies the MQ. The other PATH records probably should not be there." Fix it to record the mq root as a parent, and flag it such that it should be hidden from view when the names are logged, since the root of the mq filesystem isn't terribly interesting. With this change, we get a single PATH record that looks more like this: type=PATH msg=audit(1368021604.836:484): item=0 name="test_mq" inode=16914 dev=00:0c mode=0100644 ouid=0 ogid=0 rdev=00:00 obj=unconfined_u:object_r:user_tmpfs_t:s0 In order to do this, a new audit_inode_parent_hidden() function is added. If we do it this way, then we avoid having the existing callers of audit_inode needing to do any sort of flag conversion if auditing is inactive. 
Signed-off-by: Jeff Layton Reported-by: Jiri Jaburek Cc: Steve Grubb Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/audit.h b/include/linux/audit.h index b20b038..729a4d1 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -103,8 +103,11 @@ extern void __audit_syscall_exit(int ret_success, long ret_value); extern struct filename *__audit_reusename(const __user char *uptr); extern void __audit_getname(struct filename *name); extern void audit_putname(struct filename *name); + +#define AUDIT_INODE_PARENT 1 /* dentry represents the parent */ +#define AUDIT_INODE_HIDDEN 2 /* audit record should be hidden */ extern void __audit_inode(struct filename *name, const struct dentry *dentry, - unsigned int parent); + unsigned int flags); extern void __audit_inode_child(const struct inode *parent, const struct dentry *dentry, const unsigned char type); @@ -148,10 +151,22 @@ static inline void audit_getname(struct filename *name) if (unlikely(!audit_dummy_context())) __audit_getname(name); } -static inline void audit_inode(struct filename *name, const struct dentry *dentry, +static inline void audit_inode(struct filename *name, + const struct dentry *dentry, unsigned int parent) { + if (unlikely(!audit_dummy_context())) { + unsigned int flags = 0; + if (parent) + flags |= AUDIT_INODE_PARENT; + __audit_inode(name, dentry, flags); + } +} +static inline void audit_inode_parent_hidden(struct filename *name, + const struct dentry *dentry) +{ if (unlikely(!audit_dummy_context())) - __audit_inode(name, dentry, parent); + __audit_inode(name, dentry, + AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN); } static inline void audit_inode_child(const struct inode *parent, const struct dentry *dentry, @@ -311,7 +326,7 @@ static inline void audit_putname(struct filename *name) { } static inline void __audit_inode(struct filename *name, const struct dentry *dentry, - unsigned int parent) + unsigned int flags) { } static 
inline void __audit_inode_child(const struct inode *parent, const struct dentry *dentry, @@ -321,6 +336,9 @@ static inline void audit_inode(struct filename *name, const struct dentry *dentry, unsigned int parent) { } +static inline void audit_inode_parent_hidden(struct filename *name, + const struct dentry *dentry) +{ } static inline void audit_inode_child(const struct inode *parent, const struct dentry *dentry, const unsigned char type) diff --git a/ipc/mqueue.c b/ipc/mqueue.c index e4e47f6..ae1996d 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -823,6 +823,7 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode, error = ro; goto out; } + audit_inode_parent_hidden(name, root); filp = do_create(ipc_ns, root->d_inode, &path, oflag, mode, u_attr ? &attr : NULL); @@ -868,6 +869,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) if (IS_ERR(name)) return PTR_ERR(name); + audit_inode_parent_hidden(name, mnt->mnt_root); err = mnt_want_write(mnt); if (err) goto out_name; diff --git a/kernel/audit.h b/kernel/audit.h index 1c95131..123c9b7 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -85,6 +85,7 @@ struct audit_names { struct filename *name; int name_len; /* number of chars to log */ + bool hidden; /* don't log this record */ bool name_put; /* call __putname()? 
*/ unsigned long ino; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3c8a601..9845cb3 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1399,8 +1399,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts } i = 0; - list_for_each_entry(n, &context->names_list, list) + list_for_each_entry(n, &context->names_list, list) { + if (n->hidden) + continue; audit_log_name(context, n, NULL, i++, &call_panic); + } /* Send end of event record to help user space know we are finished */ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); @@ -1769,14 +1772,15 @@ void audit_putname(struct filename *name) * __audit_inode - store the inode and device from a lookup * @name: name being audited * @dentry: dentry being audited - * @parent: does this dentry represent the parent? + * @flags: attributes for this particular entry */ void __audit_inode(struct filename *name, const struct dentry *dentry, - unsigned int parent) + unsigned int flags) { struct audit_context *context = current->audit_context; const struct inode *inode = dentry->d_inode; struct audit_names *n; + bool parent = flags & AUDIT_INODE_PARENT; if (!context->in_syscall) return; @@ -1831,6 +1835,8 @@ out: if (parent) { n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; n->type = AUDIT_TYPE_PARENT; + if (flags & AUDIT_INODE_HIDDEN) + n->hidden = true; } else { n->name_len = AUDIT_NAME_FULL; n->type = AUDIT_TYPE_NORMAL; -- cgit v0.10.2 From 6beb8a23b50d38a003e80c5f16b50c56e8ae3387 Mon Sep 17 00:00:00 2001 From: "Raphael S. Carvalho" Date: Mon, 8 Jul 2013 15:59:37 -0700 Subject: kernel/auditfilter.c: fixing build warning kernel/auditfilter.c:426: warning: this decimal constant is unsigned only in ISO C90 Signed-off-by: Raphael S. 
Carvalho Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 6bd4a90..0ee9eff 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -423,7 +423,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->lsm_rule = NULL; /* Support legacy tests for a valid loginuid */ - if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295)) { + if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295U)) { f->type = AUDIT_LOGINUID_SET; f->val = 0; } -- cgit v0.10.2 From 2f992ee85aaa7dfd2bda43efe4493af1e108d054 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Mon, 8 Jul 2013 15:59:38 -0700 Subject: kernel/auditfilter.c: fix leak in audit_add_rule() error path If both 'tree' and 'watch' are valid we must call audit_put_tree(), just like the preceding code within audit_add_rule(). Signed-off-by: Chen Gang Cc: Al Viro Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 0ee9eff..3d15c66 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -865,6 +865,12 @@ static inline int audit_add_rule(struct audit_entry *entry) err = audit_add_watch(&entry->rule, &list); if (err) { mutex_unlock(&audit_filter_mutex); + /* + * normally audit_add_tree_rule() will free it + * on failure + */ + if (tree) + audit_put_tree(tree); goto error; } } -- cgit v0.10.2 From b9ce54c9f59894e787e3067d2f758c297fcd6fd0 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Mon, 8 Jul 2013 15:59:39 -0700 Subject: audit: Fix decimal constant description Use proper decimal type for comparison with u32. 
Compilation warning was introduced by 780a7654 ("audit: Make testing for a valid loginuid explicit.") kernel/auditfilter.c: In function 'audit_data_to_entry': kernel/auditfilter.c:426:3: warning: this decimal constant is unsigned only in ISO C90 [enabled by default] if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295)) { Signed-off-by: Michal Simek Cc: Al Viro Cc: Eric Paris Acked-by: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 3d15c66..f7aee8b 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -423,7 +423,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->lsm_rule = NULL; /* Support legacy tests for a valid loginuid */ - if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295U)) { + if ((f->type == AUDIT_LOGINUID) && (f->val == ~0U)) { f->type = AUDIT_LOGINUID_SET; f->val = 0; } -- cgit v0.10.2 From de1e0c40aceb9d5bff09c3a3b97b2f1b178af53f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 8 Jul 2013 15:59:40 -0700 Subject: fanotify: info leak in copy_event_to_user() The ->reserved field isn't cleared so we leak one byte of stack information to userspace. 
Signed-off-by: Dan Carpenter Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 1ea52f7..e16076d 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -122,6 +122,7 @@ static int fill_event_metadata(struct fsnotify_group *group, metadata->event_len = FAN_EVENT_METADATA_LEN; metadata->metadata_len = FAN_EVENT_METADATA_LEN; metadata->vers = FANOTIFY_METADATA_VERSION; + metadata->reserved = 0; metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; metadata->pid = pid_vnr(event->tgid); if (unlikely(event->mask & FAN_Q_OVERFLOW)) -- cgit v0.10.2 From 7b18527c4a95397b443c8c22f75634d5d11c9d47 Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Mon, 8 Jul 2013 15:59:42 -0700 Subject: fanotify: fix races when adding/removing marks For both adding an event to an existing mark and destroying a mark we first have to find it via fsnotify_find_[inode|vfsmount]_mark(). But getting the mark and adding an event (or destroying it) is not done atomically. This opens a race where a thread is about to destroy a mark while another thread still finds the same mark and adds an event to its mask although it will be destroyed. Another race exists concerning the excess of a groups number of marks limit: When a mark is added the number of group marks is checked against the max number of marks per group and increased afterwards. Since check and increment is also not done atomically, this may result in 2 or more processes passing the check at the same time and increasing the number of group marks above the allowed limit. With this patch both races are avoided by doing the concerning operations with the groups mark mutex locked. 
Signed-off-by: Lino Sanfilippo Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index e16076d..4e1d8ec 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -524,14 +524,18 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, __u32 removed; int destroy_mark; + mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); - if (!fsn_mark) + if (!fsn_mark) { + mutex_unlock(&group->mark_mutex); return -ENOENT; + } removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, &destroy_mark); if (destroy_mark) - fsnotify_destroy_mark(fsn_mark, group); + fsnotify_destroy_mark_locked(fsn_mark, group); + mutex_unlock(&group->mark_mutex); fsnotify_put_mark(fsn_mark); if (removed & real_mount(mnt)->mnt_fsnotify_mask) @@ -548,14 +552,19 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group, __u32 removed; int destroy_mark; + mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_inode_mark(group, inode); - if (!fsn_mark) + if (!fsn_mark) { + mutex_unlock(&group->mark_mutex); return -ENOENT; + } removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, &destroy_mark); if (destroy_mark) - fsnotify_destroy_mark(fsn_mark, group); + fsnotify_destroy_mark_locked(fsn_mark, group); + mutex_unlock(&group->mark_mutex); + /* matches the fsnotify_find_inode_mark() */ fsnotify_put_mark(fsn_mark); if (removed & inode->i_fsnotify_mask) @@ -599,21 +608,29 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, __u32 added; int ret = 0; + mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); if (!fsn_mark) { - if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) + if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) { + mutex_unlock(&group->mark_mutex); return -ENOSPC; + } fsn_mark = 
kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); - if (!fsn_mark) + if (!fsn_mark) { + mutex_unlock(&group->mark_mutex); return -ENOMEM; + } fsnotify_init_mark(fsn_mark, fanotify_free_mark); - ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0); - if (ret) + ret = fsnotify_add_mark_locked(fsn_mark, group, NULL, mnt, 0); + if (ret) { + mutex_unlock(&group->mark_mutex); goto err; + } } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); + mutex_unlock(&group->mark_mutex); if (added & ~real_mount(mnt)->mnt_fsnotify_mask) fsnotify_recalc_vfsmount_mask(mnt); @@ -642,21 +659,29 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, (atomic_read(&inode->i_writecount) > 0)) return 0; + mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_inode_mark(group, inode); if (!fsn_mark) { - if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) + if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) { + mutex_unlock(&group->mark_mutex); return -ENOSPC; + } fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); - if (!fsn_mark) + if (!fsn_mark) { + mutex_unlock(&group->mark_mutex); return -ENOMEM; + } fsnotify_init_mark(fsn_mark, fanotify_free_mark); - ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0); - if (ret) + ret = fsnotify_add_mark_locked(fsn_mark, group, inode, NULL, 0); + if (ret) { + mutex_unlock(&group->mark_mutex); goto err; + } } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); + mutex_unlock(&group->mark_mutex); if (added & ~inode->i_fsnotify_mask) fsnotify_recalc_inode_mask(inode); -- cgit v0.10.2 From 5e9c070ca085439fbec9e9629dd6171ae325d4d8 Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Mon, 8 Jul 2013 15:59:43 -0700 Subject: fanotify: put duplicate code for adding vfsmount/inode marks into an own function The code under the groups mark_mutex in fanotify_add_inode_mark() and fanotify_add_vfsmount_mark() is almost identical. So put it into a separate function.
Signed-off-by: Lino Sanfilippo Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 4e1d8ec..e44cb64 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -600,33 +600,45 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, return mask & ~oldmask; } +static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, + struct inode *inode, + struct vfsmount *mnt) +{ + struct fsnotify_mark *mark; + int ret; + + if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) + return ERR_PTR(-ENOSPC); + + mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); + if (!mark) + return ERR_PTR(-ENOMEM); + + fsnotify_init_mark(mark, fanotify_free_mark); + ret = fsnotify_add_mark_locked(mark, group, inode, mnt, 0); + if (ret) { + fsnotify_put_mark(mark); + return ERR_PTR(ret); + } + + return mark; +} + + static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, struct vfsmount *mnt, __u32 mask, unsigned int flags) { struct fsnotify_mark *fsn_mark; __u32 added; - int ret = 0; mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); if (!fsn_mark) { - if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) { - mutex_unlock(&group->mark_mutex); - return -ENOSPC; - } - - fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); - if (!fsn_mark) { - mutex_unlock(&group->mark_mutex); - return -ENOMEM; - } - - fsnotify_init_mark(fsn_mark, fanotify_free_mark); - ret = fsnotify_add_mark_locked(fsn_mark, group, NULL, mnt, 0); - if (ret) { + fsn_mark = fanotify_add_new_mark(group, NULL, mnt); + if (IS_ERR(fsn_mark)) { mutex_unlock(&group->mark_mutex); - goto err; + return PTR_ERR(fsn_mark); } } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); @@ -634,9 +646,9 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group 
*group, if (added & ~real_mount(mnt)->mnt_fsnotify_mask) fsnotify_recalc_vfsmount_mask(mnt); -err: + fsnotify_put_mark(fsn_mark); - return ret; + return 0; } static int fanotify_add_inode_mark(struct fsnotify_group *group, @@ -645,7 +657,6 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, { struct fsnotify_mark *fsn_mark; __u32 added; - int ret = 0; pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); @@ -662,22 +673,10 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_inode_mark(group, inode); if (!fsn_mark) { - if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) { + fsn_mark = fanotify_add_new_mark(group, inode, NULL); + if (IS_ERR(fsn_mark)) { mutex_unlock(&group->mark_mutex); - return -ENOSPC; - } - - fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); - if (!fsn_mark) { - mutex_unlock(&group->mark_mutex); - return -ENOMEM; - } - - fsnotify_init_mark(fsn_mark, fanotify_free_mark); - ret = fsnotify_add_mark_locked(fsn_mark, group, inode, NULL, 0); - if (ret) { - mutex_unlock(&group->mark_mutex); - goto err; + return PTR_ERR(fsn_mark); } } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); @@ -685,9 +684,9 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, if (added & ~inode->i_fsnotify_mask) fsnotify_recalc_inode_mask(inode); -err: + fsnotify_put_mark(fsn_mark); - return ret; + return 0; } /* fanotify syscalls */ -- cgit v0.10.2 From 52f85729805b7a0ec5a7a70e2c814193929de2f0 Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Mon, 8 Jul 2013 15:59:44 -0700 Subject: dnotify: replace dnotify_mark_mutex with mark mutex of dnotify_group There is no need to use a special mutex to protect against the fcntl/close race (see dnotify.c for a description of this race). Instead the dnotify_groups mark mutex can be used. 
Signed-off-by: Lino Sanfilippo Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 2bfe6dc..1fedd5f 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -31,7 +31,6 @@ int dir_notify_enable __read_mostly = 1; static struct kmem_cache *dnotify_struct_cache __read_mostly; static struct kmem_cache *dnotify_mark_cache __read_mostly; static struct fsnotify_group *dnotify_group __read_mostly; -static DEFINE_MUTEX(dnotify_mark_mutex); /* * dnotify will attach one of these to each inode (i_fsnotify_marks) which @@ -183,7 +182,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id) return; dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); - mutex_lock(&dnotify_mark_mutex); + mutex_lock(&dnotify_group->mark_mutex); spin_lock(&fsn_mark->lock); prev = &dn_mark->dn; @@ -199,11 +198,12 @@ void dnotify_flush(struct file *filp, fl_owner_t id) spin_unlock(&fsn_mark->lock); - /* nothing else could have found us thanks to the dnotify_mark_mutex */ + /* nothing else could have found us thanks to the dnotify_groups + mark_mutex */ if (dn_mark->dn == NULL) - fsnotify_destroy_mark(fsn_mark, dnotify_group); + fsnotify_destroy_mark_locked(fsn_mark, dnotify_group); - mutex_unlock(&dnotify_mark_mutex); + mutex_unlock(&dnotify_group->mark_mutex); fsnotify_put_mark(fsn_mark); } @@ -326,7 +326,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) new_dn_mark->dn = NULL; /* this is needed to prevent the fcntl/close race described below */ - mutex_lock(&dnotify_mark_mutex); + mutex_lock(&dnotify_group->mark_mutex); /* add the new_fsn_mark or find an old one. 
*/ fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode); @@ -334,7 +334,8 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); spin_lock(&fsn_mark->lock); } else { - fsnotify_add_mark(new_fsn_mark, dnotify_group, inode, NULL, 0); + fsnotify_add_mark_locked(new_fsn_mark, dnotify_group, inode, + NULL, 0); spin_lock(&new_fsn_mark->lock); fsn_mark = new_fsn_mark; dn_mark = new_dn_mark; @@ -348,9 +349,9 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) /* if (f != filp) means that we lost a race and another task/thread * actually closed the fd we are still playing with before we grabbed - * the dnotify_mark_mutex and fsn_mark->lock. Since closing the fd is the - * only time we clean up the marks we need to get our mark off - * the list. */ + * the dnotify_groups mark_mutex and fsn_mark->lock. Since closing the + * fd is the only time we clean up the marks we need to get our mark + * off the list. */ if (f != filp) { /* if we added ourselves, shoot ourselves, it's possible that * the flush actually did shoot this fsn_mark. That's fine too @@ -385,9 +386,9 @@ out: spin_unlock(&fsn_mark->lock); if (destroy) - fsnotify_destroy_mark(fsn_mark, dnotify_group); + fsnotify_destroy_mark_locked(fsn_mark, dnotify_group); - mutex_unlock(&dnotify_mark_mutex); + mutex_unlock(&dnotify_group->mark_mutex); fsnotify_put_mark(fsn_mark); out_err: if (new_fsn_mark) -- cgit v0.10.2 From e1e5a9f84e4dbd3567bb8b0d5e79db6e1e5ebc35 Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Mon, 8 Jul 2013 15:59:45 -0700 Subject: inotify: fix race when adding a new watch In inotify_new_watch() the number of watches for a group is compared against the max number of allowed watches and increased afterwards. The check and incrementation is not done atomically, so it is possible for multiple concurrent threads to pass the check and increment the number of marks above the allowed max. 
This patch uses the inotify group's mark_mutex to ensure that both the check and the increment are done atomically. Furthermore we don't have to worry about the race that allows a concurrent thread to add a watch just after inotify_update_existing_watch() returned with -ENOENT anymore, since this is also synchronized by the group's mark mutex now. Signed-off-by: Lino Sanfilippo Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 959815c..60f954a 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -636,7 +636,8 @@ static int inotify_new_watch(struct fsnotify_group *group, goto out_err; /* we are on the idr, now get on the inode */ - ret = fsnotify_add_mark(&tmp_i_mark->fsn_mark, group, inode, NULL, 0); + ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode, + NULL, 0); if (ret) { /* we failed to get on the inode, get off the idr */ inotify_remove_from_idr(group, tmp_i_mark); @@ -660,19 +661,13 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod { int ret = 0; -retry: + mutex_lock(&group->mark_mutex); /* try to update and existing watch with the new arg */ ret = inotify_update_existing_watch(group, inode, arg); /* no mark present, try to add a new one */ if (ret == -ENOENT) ret = inotify_new_watch(group, inode, arg); - /* - * inotify_new_watch could race with another thread which did an - * inotify_new_watch between the update_existing and the add watch - * here, go back and try to update an existing mark again.
- */ - if (ret == -EEXIST) - goto retry; + mutex_unlock(&group->mark_mutex); return ret; } -- cgit v0.10.2 From 9756b9187eebb093b9f6a154ecceb67648e53391 Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Mon, 8 Jul 2013 15:59:46 -0700 Subject: fsnotify: update comments concerning locking scheme There have been changes in the locking scheme of fsnotify but the comments in the source code have not been updated yet. This patch corrects this. Signed-off-by: Lino Sanfilippo Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/notify/mark.c b/fs/notify/mark.c index fc6b49b..923fe4a 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -20,28 +20,29 @@ * fsnotify inode mark locking/lifetime/and refcnting * * REFCNT: - * The mark->refcnt tells how many "things" in the kernel currently are - * referencing this object. The object typically will live inside the kernel - * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task - * which can find this object holding the appropriete locks, can take a reference - * and the object itself is guaranteed to survive until the reference is dropped. + * The group->recnt and mark->refcnt tell how many "things" in the kernel + * currently are referencing the objects. Both kind of objects typically will + * live inside the kernel with a refcnt of 2, one for its creation and one for + * the reference a group and a mark hold to each other. + * If you are holding the appropriate locks, you can take a reference and the + * object itself is guaranteed to survive until the reference is dropped. * * LOCKING: - * There are 3 spinlocks involved with fsnotify inode marks and they MUST - * be taken in order as follows: + * There are 3 locks involved with fsnotify inode marks and they MUST be taken + * in order as follows: * + * group->mark_mutex * mark->lock - * group->mark_lock * inode->i_lock * - * mark->lock protects 2 things, mark->group and mark->inode. 
You must hold - * that lock to dereference either of these things (they could be NULL even with - * the lock) - * - * group->mark_lock protects the marks_list anchored inside a given group - * and each mark is hooked via the g_list. It also sorta protects the - * free_g_list, which when used is anchored by a private list on the stack of the - * task which held the group->mark_lock. + * group->mark_mutex protects the marks_list anchored inside a given group and + * each mark is hooked via the g_list. It also protects the groups private + * data (i.e group limits). + + * mark->lock protects the marks attributes like its masks and flags. + * Furthermore it protects the access to a reference of the group that the mark + * is assigned to as well as the access to a reference of the inode/vfsmount + * that is being watched by the mark. * * inode->i_lock protects the i_fsnotify_marks list anchored inside a * given inode and each mark is hooked via the i_list. (and sorta the @@ -64,18 +65,11 @@ * inode. We take i_lock and walk the i_fsnotify_marks safely. For each * mark on the list we take a reference (so the mark can't disappear under us). * We remove that mark form the inode's list of marks and we add this mark to a - * private list anchored on the stack using i_free_list; At this point we no - * longer fear anything finding the mark using the inode's list of marks. - * - * We can safely and locklessly run the private list on the stack of everything - * we just unattached from the original inode. For each mark on the private list - * we grab the mark-> and can thus dereference mark->group and mark->inode. If - * we see the group and inode are not NULL we take those locks. Now holding all - * 3 locks we can completely remove the mark from other tasks finding it in the - * future. Remember, 10 things might already be referencing this mark, but they - * better be holding a ref. We drop our reference we took before we unhooked it - * from the inode. 
When the ref hits 0 we can free the mark. - * + * private list anchored on the stack using i_free_list; we walk i_free_list + * and before we destroy the mark we make sure that we dont race with a + * concurrent destroy_group by getting a ref to the marks group and taking the + * groups mutex. + * Very similarly for freeing by group, except we use free_g_list. * * This has the very interesting property of being able to run concurrently with -- cgit v0.10.2 From 34e3a58c66aafd90cc16c061569fbefc3ff451e9 Mon Sep 17 00:00:00 2001 From: Libo Chen Date: Mon, 8 Jul 2013 15:59:47 -0700 Subject: drivers/iommu/msm_iommu_dev.c: fix leak and clean up error paths Fix two obvious problems: 1. We have registered msm_iommu_driver first, and need unregister it when registered msm_iommu_ctx_driver fail 2. We don't need to kfree drvdata before kzalloc was successful. [akpm@linux-foundation.org: remove now-unneeded initialization of ctx_drvdata, remove unneeded braces] Signed-off-by: Libo Chen Acked-by: David Brown Cc: David Woodhouse Cc: James Hogan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/iommu/msm_iommu_dev.c b/drivers/iommu/msm_iommu_dev.c index 9144a6b..6ba3514 100644 --- a/drivers/iommu/msm_iommu_dev.c +++ b/drivers/iommu/msm_iommu_dev.c @@ -291,25 +291,20 @@ static int msm_iommu_ctx_probe(struct platform_device *pdev) { struct msm_iommu_ctx_dev *c = pdev->dev.platform_data; struct msm_iommu_drvdata *drvdata; - struct msm_iommu_ctx_drvdata *ctx_drvdata = NULL; + struct msm_iommu_ctx_drvdata *ctx_drvdata; int i, ret; - if (!c || !pdev->dev.parent) { - ret = -EINVAL; - goto fail; - } - drvdata = dev_get_drvdata(pdev->dev.parent); + if (!c || !pdev->dev.parent) + return -EINVAL; - if (!drvdata) { - ret = -ENODEV; - goto fail; - } + drvdata = dev_get_drvdata(pdev->dev.parent); + if (!drvdata) + return -ENODEV; ctx_drvdata = kzalloc(sizeof(*ctx_drvdata), GFP_KERNEL); - if (!ctx_drvdata) { - ret = -ENOMEM; - goto fail; - } + if (!ctx_drvdata) + 
return -ENOMEM; + ctx_drvdata->num = c->num; ctx_drvdata->pdev = pdev; @@ -403,6 +398,7 @@ static int __init msm_iommu_driver_init(void) ret = platform_driver_register(&msm_iommu_ctx_driver); if (ret != 0) { + platform_driver_unregister(&msm_iommu_driver); pr_err("Failed to register IOMMU context driver\n"); goto error; } -- cgit v0.10.2 From 9a2458a633d4b3c9e0eae506da40cf44dc075314 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 8 Jul 2013 15:59:48 -0700 Subject: mm: mremap: validate input before taking lock This patch is very similar to commit 84d96d897671 ("mm: madvise: complete input validation before taking lock"): perform some basic validation of the input to mremap() before taking the &current->mm->mmap_sem lock. This also makes the MREMAP_FIXED => MREMAP_MAYMOVE dependency slightly more explicit. Signed-off-by: Rasmus Villemoes Cc: KOSAKI Motohiro Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mremap.c b/mm/mremap.c index 3708655..457d34e 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -456,13 +456,14 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long charged = 0; bool locked = false; - down_write(&current->mm->mmap_sem); - if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) - goto out; + return ret; + + if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE)) + return ret; if (addr & ~PAGE_MASK) - goto out; + return ret; old_len = PAGE_ALIGN(old_len); new_len = PAGE_ALIGN(new_len); @@ -473,12 +474,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, * a zero new-len is nonsensical. 
*/ if (!new_len) - goto out; + return ret; + + down_write(&current->mm->mmap_sem); if (flags & MREMAP_FIXED) { - if (flags & MREMAP_MAYMOVE) - ret = mremap_to(addr, old_len, new_addr, new_len, - &locked); + ret = mremap_to(addr, old_len, new_addr, new_len, + &locked); goto out; } -- cgit v0.10.2 From 54f72fe022d9b2c4de40043a118881121190a117 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 8 Jul 2013 15:59:49 -0700 Subject: memcg: clean up memcg->nodeinfo Remove struct mem_cgroup_lru_info and fold its single member, the variably sized nodeinfo[0], directly into struct mem_cgroup. This should make it more obvious why it has to be the last member there. Also move the comment that's above that special last member below it, so it is more visible to somebody that considers appending to the struct mem_cgroup. Signed-off-by: Johannes Weiner Cc: David Rientjes Acked-by: Michal Hocko Cc: Glauber Costa Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2e851f4..2b7cd24 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -187,10 +187,6 @@ struct mem_cgroup_per_node { struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; }; -struct mem_cgroup_lru_info { - struct mem_cgroup_per_node *nodeinfo[0]; -}; - /* * Cgroups above their limits are maintained in a RB-Tree, independent of * their hierarchy representation @@ -366,14 +362,8 @@ struct mem_cgroup { atomic_t numainfo_updating; #endif - /* - * Per cgroup active and inactive list, similar to the - * per zone LRU lists. - * - * WARNING: This has to be the last element of the struct. Don't - * add new fields after this point. 
- */ - struct mem_cgroup_lru_info info; + struct mem_cgroup_per_node *nodeinfo[0]; + /* WARNING: nodeinfo must be the last member here */ }; static size_t memcg_size(void) @@ -683,7 +673,7 @@ static struct mem_cgroup_per_zone * mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) { VM_BUG_ON((unsigned)nid >= nr_node_ids); - return &memcg->info.nodeinfo[nid]->zoneinfo[zid]; + return &memcg->nodeinfo[nid]->zoneinfo[zid]; } struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) @@ -6087,13 +6077,13 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) mz->on_tree = false; mz->memcg = memcg; } - memcg->info.nodeinfo[node] = pn; + memcg->nodeinfo[node] = pn; return 0; } static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) { - kfree(memcg->info.nodeinfo[node]); + kfree(memcg->nodeinfo[node]); } static struct mem_cgroup *mem_cgroup_alloc(void) -- cgit v0.10.2 From 609838cfed972d49a65aac7923a9ff5cbe482e30 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 8 Jul 2013 15:59:50 -0700 Subject: mm: invoke oom-killer from remaining unconverted page fault handlers A few remaining architectures directly kill the page faulting task in an out of memory situation. This is usually not a good idea since that task might not even use a significant amount of memory and so may not be the optimal victim to resolve the situation. Since 2.6.29's 1c0fe6e ("mm: invoke oom-killer from page fault") there is a hook that architecture page fault handlers are supposed to call to invoke the OOM killer and let it pick the right task to kill. Convert the remaining architectures over to this hook. To have the previous behavior of simply taking out the faulting task the vm.oom_kill_allocating_task sysctl can be set to 1. 
Signed-off-by: Johannes Weiner Reviewed-by: Michal Hocko Cc: KAMEZAWA Hiroyuki Acked-by: David Rientjes Acked-by: Vineet Gupta [arch/arc bits] Cc: James Hogan Cc: David Howells Cc: Jonas Bonn Cc: Chen Liqin Cc: Lennox Wu Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index 318164c..0fd1f0d 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -207,8 +207,10 @@ out_of_memory: } up_read(&mm->mmap_sem); - if (user_mode(regs)) - do_group_exit(SIGKILL); /* This will never return */ + if (user_mode(regs)) { + pagefault_out_of_memory(); + return; + } goto no_context; diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c index 2c75bf7..8fddf46 100644 --- a/arch/metag/mm/fault.c +++ b/arch/metag/mm/fault.c @@ -224,8 +224,10 @@ do_sigbus: */ out_of_memory: up_read(&mm->mmap_sem); - if (user_mode(regs)) - do_group_exit(SIGKILL); + if (user_mode(regs)) { + pagefault_out_of_memory(); + return 1; + } no_context: /* Are we prepared to handle this kernel fault? 
*/ diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c index d48a84f..8a2e6de 100644 --- a/arch/mn10300/mm/fault.c +++ b/arch/mn10300/mm/fault.c @@ -345,9 +345,10 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); - printk(KERN_ALERT "VM: killing process %s\n", tsk->comm); - if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR) - do_exit(SIGKILL); + if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR) { + pagefault_out_of_memory(); + return; + } goto no_context; do_sigbus: diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index e2bfafc..4a41f84 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -267,10 +267,10 @@ out_of_memory: __asm__ __volatile__("l.nop 1"); up_read(&mm->mmap_sem); - printk("VM: killing process %s\n", tsk->comm); - if (user_mode(regs)) - do_exit(SIGKILL); - goto no_context; + if (!user_mode(regs)) + goto no_context; + pagefault_out_of_memory(); + return; do_sigbus: up_read(&mm->mmap_sem); diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c index 47b600e..6b18fb0 100644 --- a/arch/score/mm/fault.c +++ b/arch/score/mm/fault.c @@ -172,10 +172,10 @@ out_of_memory: down_read(&mm->mmap_sem); goto survive; } - printk("VM: killing process %s\n", tsk->comm); - if (user_mode(regs)) - do_group_exit(SIGKILL); - goto no_context; + if (!user_mode(regs)) + goto no_context; + pagefault_out_of_memory(); + return; do_sigbus: up_read(&mm->mmap_sem); diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index 3d2b81c..f7f99f9 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c @@ -573,10 +573,10 @@ out_of_memory: down_read(&mm->mmap_sem); goto survive; } - pr_alert("VM: killing process %s\n", tsk->comm); - if (!is_kernel_mode) - do_group_exit(SIGKILL); - goto no_context; + if (is_kernel_mode) + goto no_context; + pagefault_out_of_memory(); + return 0; do_sigbus: up_read(&mm->mmap_sem); -- cgit v0.10.2 From 7960aedde8cfa72e4caf488806ea7ea7d2fa8dba Mon Sep 17 00:00:00 2001 From: 
Zhang Yanfei Date: Mon, 8 Jul 2013 15:59:52 -0700 Subject: mm: remove duplicated call of get_pfn_range_for_nid When calculating pages in a node, for each zone in that node, we will have zone_spanned_pages_in_node --> get_pfn_range_for_nid zone_absent_pages_in_node --> get_pfn_range_for_nid That is to say, we call the get_pfn_range_for_nid to get start_pfn and end_pfn of the node for MAX_NR_ZONES * 2 times. And this is totally unnecessary if we call the get_pfn_range_for_nid before zone_*_pages_in_node add two extra arguments node_start_pfn and node_end_pfn for zone_*_pages_in_node, then we can remove the get_pfn_range_in_node in zone_*_pages_in_node. [akpm@linux-foundation.org: make definitions more readable] Signed-off-by: Zhang Yanfei Cc: Michal Hocko Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 327516b..7d5e40f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4421,13 +4421,13 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid, */ static unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *ignored) { - unsigned long node_start_pfn, node_end_pfn; unsigned long zone_start_pfn, zone_end_pfn; - /* Get the start and end of the node and zone */ - get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); + /* Get the start and end of the zone */ zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; adjust_zone_range_for_zone_movable(nid, zone_type, @@ -4482,14 +4482,14 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn, /* Return the number of page frames in holes in a zone on a node */ static unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *ignored) { unsigned long 
zone_low = arch_zone_lowest_possible_pfn[zone_type]; unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; - unsigned long node_start_pfn, node_end_pfn; unsigned long zone_start_pfn, zone_end_pfn; - get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); @@ -4502,6 +4502,8 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *zones_size) { return zones_size[zone_type]; @@ -4509,6 +4511,8 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, static inline unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *zholes_size) { if (!zholes_size) @@ -4520,21 +4524,27 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid, #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, - unsigned long *zones_size, unsigned long *zholes_size) + unsigned long node_start_pfn, + unsigned long node_end_pfn, + unsigned long *zones_size, + unsigned long *zholes_size) { unsigned long realtotalpages, totalpages = 0; enum zone_type i; for (i = 0; i < MAX_NR_ZONES; i++) totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, - zones_size); + node_start_pfn, + node_end_pfn, + zones_size); pgdat->node_spanned_pages = totalpages; realtotalpages = totalpages; for (i = 0; i < MAX_NR_ZONES; i++) realtotalpages -= zone_absent_pages_in_node(pgdat->node_id, i, - zholes_size); + node_start_pfn, node_end_pfn, + zholes_size); pgdat->node_present_pages = realtotalpages; printk(KERN_DEBUG "On node %d totalpages: %lu\n", 
pgdat->node_id, realtotalpages); @@ -4643,6 +4653,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, * NOTE: pgdat should get zeroed by caller. */ static void __paginginit free_area_init_core(struct pglist_data *pgdat, + unsigned long node_start_pfn, unsigned long node_end_pfn, unsigned long *zones_size, unsigned long *zholes_size) { enum zone_type j; @@ -4664,8 +4675,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; - size = zone_spanned_pages_in_node(nid, j, zones_size); + size = zone_spanned_pages_in_node(nid, j, node_start_pfn, + node_end_pfn, zones_size); realsize = freesize = size - zone_absent_pages_in_node(nid, j, + node_start_pfn, + node_end_pfn, zholes_size); /* @@ -4779,6 +4793,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, unsigned long node_start_pfn, unsigned long *zholes_size) { pg_data_t *pgdat = NODE_DATA(nid); + unsigned long start_pfn = 0; + unsigned long end_pfn = 0; /* pg_data_t should be reset to zero when it's allocated */ WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); @@ -4786,7 +4802,11 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; init_zone_allows_reclaim(nid); - calculate_node_totalpages(pgdat, zones_size, zholes_size); +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); +#endif + calculate_node_totalpages(pgdat, start_pfn, end_pfn, + zones_size, zholes_size); alloc_node_mem_map(pgdat); #ifdef CONFIG_FLAT_NODE_MEM_MAP @@ -4795,7 +4815,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, (unsigned long)pgdat->node_mem_map); #endif - free_area_init_core(pgdat, zones_size, zholes_size); + free_area_init_core(pgdat, start_pfn, end_pfn, + zones_size, zholes_size); } #ifdef 
CONFIG_HAVE_MEMBLOCK_NODE_MAP -- cgit v0.10.2 From ab15d9b4cbc2b6497023f554a152c2573ca53671 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 8 Jul 2013 15:59:53 -0700 Subject: mm/vmalloc.c: unbreak __vunmap() There is an extra semi-colon so the function always returns. Signed-off-by: Dan Carpenter Acked-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 91a1047..96b77a9 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1453,7 +1453,7 @@ static void __vunmap(const void *addr, int deallocate_pages) return; if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", - addr)); + addr)) return; area = remove_vm_area(addr); -- cgit v0.10.2 From 3fcd76e8028e0be37b02a2002b4f56755daeda06 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 15:59:54 -0700 Subject: mm/vmalloc.c: remove dead code in vb_alloc Space in a vmap block that was once allocated is considered dirty and not made available for allocation again before the whole block is recycled. The result is that free space within a vmap block is always contiguous. So if a vmap block has enough free space for allocation, the allocation is impossible to fail. Thus, the fragmented block purging was never invoked from vb_alloc(). So remove this dead code. 
[ Same patches also sent by: Chanho Min Johannes Weiner but git doesn't do "multiple authors" ] Signed-off-by: Zhang Yanfei Cc: Johannes Weiner Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 96b77a9..a35f4f5 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -910,7 +910,6 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) struct vmap_block *vb; unsigned long addr = 0; unsigned int order; - int purge = 0; BUG_ON(size & ~PAGE_MASK); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); @@ -934,17 +933,7 @@ again: if (vb->free < 1UL << order) goto next; - i = bitmap_find_free_region(vb->alloc_map, - VMAP_BBMAP_BITS, order); - - if (i < 0) { - if (vb->free + vb->dirty == VMAP_BBMAP_BITS) { - /* fragmented and no outstanding allocations */ - BUG_ON(vb->dirty != VMAP_BBMAP_BITS); - purge = 1; - } - goto next; - } + i = VMAP_BBMAP_BITS - vb->free; addr = vb->va->va_start + (i << PAGE_SHIFT); BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(vb->va->va_start)); @@ -960,9 +949,6 @@ next: spin_unlock(&vb->lock); } - if (purge) - purge_fragmented_blocks_thiscpu(); - put_cpu_var(vmap_block_queue); rcu_read_unlock(); -- cgit v0.10.2 From 9da3f59fbdb57c9447ddb42681f6ab98faef353a Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 15:59:55 -0700 Subject: mm/vmalloc.c: remove unused purge_fragmented_blocks_thiscpu This function is nowhere used now, so remove it. 
Signed-off-by: Zhang Yanfei Cc: Johannes Weiner Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a35f4f5..99d045a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -891,11 +891,6 @@ static void purge_fragmented_blocks(int cpu) } } -static void purge_fragmented_blocks_thiscpu(void) -{ - purge_fragmented_blocks(smp_processor_id()); -} - static void purge_fragmented_blocks_allcpus(void) { int cpu; -- cgit v0.10.2 From b8e748b6c32999f221ea4786557b8e7e6c4e4e7a Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 15:59:56 -0700 Subject: mm/vmalloc.c: remove alloc_map from vmap_block As we have removed the dead code in the vb_alloc, it seems there is no place to use the alloc_map. So there is no reason to maintain the alloc_map in vmap_block. Signed-off-by: Zhang Yanfei Cc: Johannes Weiner Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 99d045a..7ac2a1f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -754,7 +754,6 @@ struct vmap_block { struct vmap_area *va; struct vmap_block_queue *vbq; unsigned long free, dirty; - DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); struct list_head free_list; struct rcu_head rcu_head; @@ -820,7 +819,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) vb->va = va; vb->free = VMAP_BBMAP_BITS; vb->dirty = 0; - bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS); bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); INIT_LIST_HEAD(&vb->free_list); @@ -873,7 +871,6 @@ static void purge_fragmented_blocks(int cpu) if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { vb->free = 0; /* prevent further allocs after releasing lock */ vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ - bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS); bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); spin_lock(&vbq->lock); list_del_rcu(&vb->free_list); -- 
cgit v0.10.2 From 46c001a2753f47ffa621131baa3409e636515347 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 15:59:57 -0700 Subject: mm/vmalloc.c: emit the failure message before return Use goto to jump to the fail label to give a failure message before returning NULL. This makes the failure handling in this function consistent. Signed-off-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7ac2a1f..d81b9f7 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1642,7 +1642,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); if (!addr) - return NULL; + goto fail; /* * In this function, newly allocated vm_struct has VM_UNLIST flag. -- cgit v0.10.2 From 20fc02b477c526c6a85f84e3770373778ff2f97e Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 15:59:58 -0700 Subject: mm/vmalloc.c: rename VM_UNLIST to VM_UNINITIALIZED VM_UNLIST was used to indicate that the vm_struct is not listed in vmlist. But after commit 4341fa454796 ("mm, vmalloc: remove list management of vmlist after initializing vmalloc"), the meaning of this flag changed. It now means the vm_struct is not fully initialized. So renaming it to VM_UNINITIALIZED seems more reasonable. Also change clear_vm_unlist to clear_vm_uninitialized_flag. 
Signed-off-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index dd0a2c8..4b8a891 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -10,12 +10,12 @@ struct vm_area_struct; /* vma defining user mapping in mm_types.h */ /* bits in flags of vmalloc's vm_struct below */ -#define VM_IOREMAP 0x00000001 /* ioremap() and friends */ -#define VM_ALLOC 0x00000002 /* vmalloc() */ -#define VM_MAP 0x00000004 /* vmap()ed pages */ -#define VM_USERMAP 0x00000008 /* suitable for remap_vmalloc_range */ -#define VM_VPAGES 0x00000010 /* buffer for pages was vmalloc'ed */ -#define VM_UNLIST 0x00000020 /* vm_struct is not listed in vmlist */ +#define VM_IOREMAP 0x00000001 /* ioremap() and friends */ +#define VM_ALLOC 0x00000002 /* vmalloc() */ +#define VM_MAP 0x00000004 /* vmap()ed pages */ +#define VM_USERMAP 0x00000008 /* suitable for remap_vmalloc_range */ +#define VM_VPAGES 0x00000010 /* buffer for pages was vmalloc'ed */ +#define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ /* bits [20..32] reserved for arch specific ioremap internals */ /* diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d81b9f7..af40068 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1289,15 +1289,15 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, spin_unlock(&vmap_area_lock); } -static void clear_vm_unlist(struct vm_struct *vm) +static void clear_vm_uninitialized_flag(struct vm_struct *vm) { /* - * Before removing VM_UNLIST, + * Before removing VM_UNINITIALIZED, * we should make sure that vm has proper values. * Pair with smp_rmb() in show_numa_info(). 
*/ smp_wmb(); - vm->flags &= ~VM_UNLIST; + vm->flags &= ~VM_UNINITIALIZED; } static struct vm_struct *__get_vm_area_node(unsigned long size, @@ -1635,7 +1635,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, if (!size || (size >> PAGE_SHIFT) > totalram_pages) goto fail; - area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST, + area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED, start, end, node, gfp_mask, caller); if (!area) goto fail; @@ -1645,11 +1645,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, goto fail; /* - * In this function, newly allocated vm_struct has VM_UNLIST flag. - * It means that vm_struct is not fully initialized. + * In this function, newly allocated vm_struct has VM_UNINITIALIZED + * flag. It means that vm_struct is not fully initialized. * Now, it is fully initialized, so remove this flag here. */ - clear_vm_unlist(area); + clear_vm_uninitialized_flag(area); /* * A ref_count = 3 is needed because the vm_struct and vmap_area @@ -2569,9 +2569,9 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) if (!counters) return; - /* Pair with smp_wmb() in clear_vm_unlist() */ + /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ smp_rmb(); - if (v->flags & VM_UNLIST) + if (v->flags & VM_UNINITIALIZED) return; memset(counters, 0, nr_node_ids * sizeof(unsigned int)); -- cgit v0.10.2 From d157a55815ffff48caec311dfb543ce8a79e283e Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 15:59:59 -0700 Subject: mm/vmalloc.c: check VM_UNINITIALIZED flag in s_show instead of show_numa_info We should check the VM_UNITIALIZED flag in s_show(). If this flag is set, that said, the vm_struct is not fully initialized. So it is unnecessary to try to show the information contained in vm_struct. We checked this flag in show_numa_info(), but I think it's better to check it earlier. 
Signed-off-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmalloc.c b/mm/vmalloc.c index af40068..318c500 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2569,11 +2569,6 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) if (!counters) return; - /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ - smp_rmb(); - if (v->flags & VM_UNINITIALIZED) - return; - memset(counters, 0, nr_node_ids * sizeof(unsigned int)); for (nr = 0; nr < v->nr_pages; nr++) @@ -2602,6 +2597,11 @@ static int s_show(struct seq_file *m, void *p) v = va->vm; + /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ + smp_rmb(); + if (v->flags & VM_UNINITIALIZED) + return 0; + seq_printf(m, "0x%pK-0x%pK %7ld", v->addr, v->addr + v->size, v->size); -- cgit v0.10.2 From 6d42c232bd1e77288b2660153299b7d12a5c8e15 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 8 Jul 2013 16:00:00 -0700 Subject: memcg: also test for skip accounting at the page allocation level The memory we used to hold the memcg arrays is currently accounted to the current memcg. But that creates a problem, because that memory can only be freed after the last user is gone. Our only way to know which is the last user, is to hook up to freeing time, but the fact that we still have some in flight kmallocs will prevent freeing to happen. I believe therefore to be just easier to account this memory as global overhead. This patch (of 2): Disabling accounting is only relevant for some specific memcg internal allocations. Therefore we would initially not have such check at memcg_kmem_newpage_charge, since direct calls to the page allocator that are marked with GFP_KMEMCG only happen outside memcg core. We are mostly concerned with cache allocations and by having this test at memcg_kmem_get_cache we are already able to relay the allocation to the root cache and bypass the memcg caches altogether. 
There is one exception, though: the SLUB allocator does not create large order caches, but rather service large kmallocs directly from the page allocator. Therefore, the following sequence, when backed by the SLUB allocator: memcg_stop_kmem_account(); kmalloc() memcg_resume_kmem_account(); would effectively ignore the fact that we should skip accounting, since it will drive us directly to this function without passing through the cache selector memcg_kmem_get_cache. Such large allocations are extremely rare but can happen, for instance, for the cache arrays. This was never a problem in practice, because we weren't skipping accounting for the cache arrays. All the allocations we were skipping were fairly small. However, the fact that we were not skipping those allocations are a problem and can prevent the memcgs from going away. As we fix that, we need to make sure that the fix will also work with the SLUB allocator. Signed-off-by: Glauber Costa Reported-by: Michal Hocko Cc: Johannes Weiner Cc: Kamezawa Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2b7cd24..06a595f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3637,6 +3637,34 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) int ret; *_memcg = NULL; + + /* + * Disabling accounting is only relevant for some specific memcg + * internal allocations. Therefore we would initially not have such + * check here, since direct calls to the page allocator that are marked + * with GFP_KMEMCG only happen outside memcg core. We are mostly + * concerned with cache allocations, and by having this test at + * memcg_kmem_get_cache, we are already able to relay the allocation to + * the root cache and bypass the memcg cache altogether. + * + * There is one exception, though: the SLUB allocator does not create + * large order caches, but rather service large kmallocs directly from + * the page allocator. 
Therefore, the following sequence when backed by + * the SLUB allocator: + * + * memcg_stop_kmem_account(); + * kmalloc() + * memcg_resume_kmem_account(); + * + * would effectively ignore the fact that we should skip accounting, + * since it will drive us directly to this function without passing + * through the cache selector memcg_kmem_get_cache. Such large + * allocations are extremely rare but can happen, for instance, for the + * cache arrays. We bring this test here. + */ + if (!current->mm || current->memcg_kmem_skip_account) + return true; + memcg = try_get_mem_cgroup_from_mm(current->mm); /* -- cgit v0.10.2 From 425c598d583883c33c75780225ba8e0794b43bd9 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 8 Jul 2013 16:00:01 -0700 Subject: memcg: do not account memory used for cache creation The memory we used to hold the memcg arrays is currently accounted to the current memcg. But that creates a problem, because that memory can only be freed after the last user is gone. Our only way to know which is the last user, is to hook up to freeing time, but the fact that we still have some in flight kmallocs will prevent freeing to happen. I believe therefore to be just easier to account this memory as global overhead. 
Signed-off-by: Glauber Costa Cc: Johannes Weiner Cc: Michal Hocko Cc: Kamezawa Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 06a595f..64f7265 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5232,7 +5232,9 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) static_key_slow_inc(&memcg_kmem_enabled_key); mutex_lock(&set_limit_mutex); + memcg_stop_kmem_account(); ret = memcg_update_cache_sizes(memcg); + memcg_resume_kmem_account(); mutex_unlock(&set_limit_mutex); out: return ret; -- cgit v0.10.2 From 537926caedb335b198eb53930ebeeb6426a541f9 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 16:00:02 -0700 Subject: include/linux/gfp.h: fix the comment for GFP_ZONE_TABLE 0xc just means MOVABLE + DMA32, which results in zone DMA32. Signed-off-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 0f615eb..9b4dd49 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -209,7 +209,7 @@ static inline int allocflags_to_migratetype(gfp_t gfp_flags) * 0x9 => DMA or NORMAL (MOVABLE+DMA) * 0xa => MOVABLE (Movable is valid only if HIGHMEM is set too) * 0xb => BAD (MOVABLE+HIGHMEM+DMA) - * 0xc => DMA32 (MOVABLE+HIGHMEM+DMA32) + * 0xc => DMA32 (MOVABLE+DMA32) * 0xd => BAD (MOVABLE+DMA32+DMA) * 0xe => BAD (MOVABLE+DMA32+HIGHMEM) * 0xf => BAD (MOVABLE+DMA32+HIGHMEM+DMA) -- cgit v0.10.2 From 0cf31ec10e92253e2908cd830145a71043740d77 Mon Sep 17 00:00:00 2001 From: Seth Jennings Date: Mon, 8 Jul 2013 16:00:05 -0700 Subject: MAINTAINERS: add zswap and zbud maintainer Add maintainer information for zswap and zbud into the MAINTAINERS file. 
Signed-off-by: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index 97762ad..70cf679 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9263,6 +9263,13 @@ F: Documentation/networking/z8530drv.txt F: drivers/net/hamradio/*scc.c F: drivers/net/hamradio/z8530.h +ZBUD COMPRESSED PAGE ALLOCATOR +M: Seth Jennings +L: linux-mm@kvack.org +S: Maintained +F: mm/zbud.c +F: include/linux/zbud.h + ZD1211RW WIRELESS DRIVER M: Daniel Drake M: Ulrich Kunitz @@ -9285,6 +9292,12 @@ M: "Maciej W. Rozycki" S: Maintained F: drivers/tty/serial/zs.* +ZSWAP COMPRESSED SWAP CACHING +M: Seth Jennings +L: linux-mm@kvack.org +S: Maintained +F: mm/zswap.c + THE REST M: Linus Torvalds L: linux-kernel@vger.kernel.org -- cgit v0.10.2 From bc732f1d55cf41627ee4c64078812b2fa592b394 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 16:00:06 -0700 Subject: mm/page_alloc.c: remove zone_type argument of build_zonelists_node The callers of build_zonelists_node always pass MAX_NR_ZONES -1 as the zone_type argument, so we can directly use the value in build_zonelists_node and remove zone_type argument. Signed-off-by: Zhang Yanfei Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7d5e40f..27f9d4b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3153,12 +3153,10 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) * Add all populated zones of a node to the zonelist. 
*/ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, - int nr_zones, enum zone_type zone_type) + int nr_zones) { struct zone *zone; - - BUG_ON(zone_type >= MAX_NR_ZONES); - zone_type++; + enum zone_type zone_type = MAX_NR_ZONES; do { zone_type--; @@ -3168,8 +3166,8 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, &zonelist->_zonerefs[nr_zones++]); check_highest_zone(zone_type); } - } while (zone_type); + return nr_zones; } @@ -3363,8 +3361,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) zonelist = &pgdat->node_zonelists[0]; for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) ; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, - MAX_NR_ZONES - 1); + j = build_zonelists_node(NODE_DATA(node), zonelist, j); zonelist->_zonerefs[j].zone = NULL; zonelist->_zonerefs[j].zone_idx = 0; } @@ -3378,7 +3375,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) struct zonelist *zonelist; zonelist = &pgdat->node_zonelists[1]; - j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); + j = build_zonelists_node(pgdat, zonelist, 0); zonelist->_zonerefs[j].zone = NULL; zonelist->_zonerefs[j].zone_idx = 0; } @@ -3586,7 +3583,7 @@ static void build_zonelists(pg_data_t *pgdat) local_node = pgdat->node_id; zonelist = &pgdat->node_zonelists[0]; - j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); + j = build_zonelists_node(pgdat, zonelist, 0); /* * Now we build the zonelist so that it contains the zones @@ -3599,14 +3596,12 @@ static void build_zonelists(pg_data_t *pgdat) for (node = local_node + 1; node < MAX_NUMNODES; node++) { if (!node_online(node)) continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, - MAX_NR_ZONES - 1); + j = build_zonelists_node(NODE_DATA(node), zonelist, j); } for (node = 0; node < local_node; node++) { if (!node_online(node)) continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, - MAX_NR_ZONES - 1); + j = 
build_zonelists_node(NODE_DATA(node), zonelist, j); } zonelist->_zonerefs[j].zone = NULL; -- cgit v0.10.2 From b21fbccd4b8aba805cbc231998ec7bf83616a79e Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 16:00:07 -0700 Subject: mm: remove unused functions is_{normal_idx, normal, dma32, dma} These functions are nowhere used, so remove them. Signed-off-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ae19af5..af4a3b7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -869,11 +869,6 @@ static inline int is_highmem_idx(enum zone_type idx) #endif } -static inline int is_normal_idx(enum zone_type idx) -{ - return (idx == ZONE_NORMAL); -} - /** * is_highmem - helper function to quickly check if a struct zone is a * highmem zone or not. This is an attempt to keep references @@ -892,29 +887,6 @@ static inline int is_highmem(struct zone *zone) #endif } -static inline int is_normal(struct zone *zone) -{ - return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL; -} - -static inline int is_dma32(struct zone *zone) -{ -#ifdef CONFIG_ZONE_DMA32 - return zone == zone->zone_pgdat->node_zones + ZONE_DMA32; -#else - return 0; -#endif -} - -static inline int is_dma(struct zone *zone) -{ -#ifdef CONFIG_ZONE_DMA - return zone == zone->zone_pgdat->node_zones + ZONE_DMA; -#else - return 0; -#endif -} - /* These two functions are used to setup the per zone pages min values */ struct ctl_table; int min_free_kbytes_sysctl_handler(struct ctl_table *, int, -- cgit v0.10.2 From 345606d42971fc4ed164fbabac118708d51b8e0a Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 16:00:08 -0700 Subject: mm/page_alloc.c: remove unlikely() from the current_order test In __rmqueue_fallback(), current_order loops down from MAX_ORDER - 1 to the order passed. MAX_ORDER is typically 11 and pageblock_order is typically 9 on x86. 
Integer division truncates, so pageblock_order / 2 is 4. For the first eight iterations, it's guaranteed that current_order >= pageblock_order / 2 if it even gets that far! So just remove the unlikely(), it's completely bogus. Signed-off-by: Zhang Yanfei Suggested-by: David Rientjes Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 27f9d4b..b5855e5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1046,7 +1046,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) * MIGRATE_CMA areas. */ if (!is_migrate_cma(migratetype) && - (unlikely(current_order >= pageblock_order / 2) || + (current_order >= pageblock_order / 2 || start_migratetype == MIGRATE_RECLAIMABLE || page_group_by_mobility_disabled)) { int pages; -- cgit v0.10.2 From 59d3132f8abdc18301898febf205d00db5f0458c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:00:08 -0700 Subject: vfree: don't schedule free_work() if llist_add() returns false vfree() only needs schedule_work(&p->wq) if p->list was empty, otherwise vfree_deferred->wq is already pending or it is running and didn't do llist_del_all() yet. 
Signed-off-by: Oleg Nesterov Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 318c500..a649186 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1477,7 +1477,6 @@ static void __vunmap(const void *addr, int deallocate_pages) * conventions for vfree() arch-depenedent would be a really bad idea) * * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node) - * */ void vfree(const void *addr) { @@ -1489,8 +1488,8 @@ void vfree(const void *addr) return; if (unlikely(in_interrupt())) { struct vfree_deferred *p = &__get_cpu_var(vfree_deferred); - llist_add((struct llist_node *)addr, &p->list); - schedule_work(&p->wq); + if (llist_add((struct llist_node *)addr, &p->list)) + schedule_work(&p->wq); } else __vunmap(addr, 1); } -- cgit v0.10.2 From 929aaf56958ab2300919653b923413af695470a5 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 16:00:09 -0700 Subject: mm: remove unused __put_page() This function is nowhere used, and it has a confusing name with put_page in mm/swap.c. So better to remove it. 
Signed-off-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/internal.h b/mm/internal.h index 8562de0..4390ac6 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -32,11 +32,6 @@ static inline void set_page_refcounted(struct page *page) set_page_count(page, 1); } -static inline void __put_page(struct page *page) -{ - atomic_dec(&page->_count); -} - static inline void __get_page_tail_foll(struct page *page, bool get_page_head) { -- cgit v0.10.2 From f3deb6872b946a851a3799b315f3c85ce4c027fc Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 16:00:10 -0700 Subject: mm/sparse.c: put clear_hwpoisoned_pages within CONFIG_MEMORY_HOTREMOVE With CONFIG_MEMORY_HOTREMOVE unset, there is a compile warning: mm/sparse.c:755: warning: `clear_hwpoisoned_pages' defined but not used And Bisecting it ended up pointing to 4edd7ceff ("mm, hotplug: avoid compiling memory hotremove functions when disabled"). This is because the commit above put sparse_remove_one_section() within the protection of CONFIG_MEMORY_HOTREMOVE but the only user of clear_hwpoisoned_pages() is sparse_remove_one_section(), and it is not within the protection of CONFIG_MEMORY_HOTREMOVE. So put clear_hwpoisoned_pages within CONFIG_MEMORY_HOTREMOVE should fix the warning. 
Signed-off-by: Zhang Yanfei Cc: David Rientjes Acked-by: Toshi Kani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/sparse.c b/mm/sparse.c index b38400f..308d503 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -753,6 +753,7 @@ out: return ret; } +#ifdef CONFIG_MEMORY_HOTREMOVE #ifdef CONFIG_MEMORY_FAILURE static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) { @@ -774,7 +775,6 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) } #endif -#ifdef CONFIG_MEMORY_HOTREMOVE static void free_section_usemap(struct page *memmap, unsigned long *usemap) { struct page *usemap_page; -- cgit v0.10.2 From 12057841008534236e52df3d3e63e089f27c5406 Mon Sep 17 00:00:00 2001 From: Haicheng Li Date: Mon, 8 Jul 2013 16:00:11 -0700 Subject: fs/fs-writeback.c: : make wb_do_writeback() as static It's not used globally and could be static. Signed-off-by: Haicheng Li Cc: Jan Kara Cc: Wu Fengguang Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index a85ac4e..aca8835 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -963,7 +963,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) /* * Retrieve work items and do the writeback they describe */ -long wb_do_writeback(struct bdi_writeback *wb, int force_wait) +static long wb_do_writeback(struct bdi_writeback *wb, int force_wait) { struct backing_dev_info *bdi = wb->bdi; struct wb_writeback_work *work; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index abfe117..e0efffa 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -95,7 +95,6 @@ int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, void sync_inodes_sb(struct super_block *); long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, enum wb_reason reason); -long wb_do_writeback(struct bdi_writeback *wb, int force_wait); void 
wakeup_flusher_threads(long nr_pages, enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); -- cgit v0.10.2 From 6ce1bc86ae8b8f74095f2694732ccbab2f3849e5 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 8 Jul 2013 16:00:12 -0700 Subject: mm/writeback: remove wb_reason_name wb_reason_name is not used any more - remove it. Signed-off-by: Wanpeng Li Reviewed-by: Tejun Heo Reviewed-by: Fengguang Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/writeback.h b/include/linux/writeback.h index e0efffa..e1703de 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -51,7 +51,6 @@ enum wb_reason { WB_REASON_MAX, }; -extern const char *wb_reason_name[]; /* * A control structure which tells the writeback code what to do. These are -- cgit v0.10.2 From 25d130ba22362757a90135fd8a0f75cc7fc71e79 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 8 Jul 2013 16:00:14 -0700 Subject: mm/writeback: don't check force_wait to handle bdi->work_list After commit 839a8e8660b6 ("writeback: replace custom worker pool implementation with unbound workqueue"), bdi_writeback_workfn runs off bdi_writeback->dwork, on each execution, it processes bdi->work_list and reschedules if there are more things to do instead of flush any work that race with us existing. It is unnecessary to check force_wait in wb_do_writeback since it is always 0 after the mentioned commit. This patch removes the force_wait in wb_do_writeback.
Signed-off-by: Wanpeng Li Reviewed-by: Tejun Heo Reviewed-by: Fengguang Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index aca8835..68851ff 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -963,7 +963,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) /* * Retrieve work items and do the writeback they describe */ -static long wb_do_writeback(struct bdi_writeback *wb, int force_wait) +static long wb_do_writeback(struct bdi_writeback *wb) { struct backing_dev_info *bdi = wb->bdi; struct wb_writeback_work *work; @@ -971,12 +971,6 @@ static long wb_do_writeback(struct bdi_writeback *wb, int force_wait) set_bit(BDI_writeback_running, &wb->bdi->state); while ((work = get_next_work_item(bdi)) != NULL) { - /* - * Override sync mode, in case we must wait for completion - * because this thread is exiting now. - */ - if (force_wait) - work->sync_mode = WB_SYNC_ALL; trace_writeback_exec(bdi, work); @@ -1025,7 +1019,7 @@ void bdi_writeback_workfn(struct work_struct *work) * rescuer as work_list needs to be drained. */ do { - pages_written = wb_do_writeback(wb, 0); + pages_written = wb_do_writeback(wb); trace_writeback_pages_written(pages_written); } while (!list_empty(&bdi->work_list)); } else { -- cgit v0.10.2 From fc6df808aaf00eed564e2e7fc0f246691363cd12 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 8 Jul 2013 16:00:15 -0700 Subject: mm/writeback: commit reason of WB_REASON_FORKER_THREAD mismatch name After commit 839a8e8660b6 ("writeback: replace custom worker pool implementation with unbound workqueue"), there is no bdi forker thread any more. However, WB_REASON_FORKER_THREAD is still used due to it is TPs userland visible and we won't be exposing exactly the same information with just a different name. 
Signed-off-by: Wanpeng Li Reviewed-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/writeback.h b/include/linux/writeback.h index e1703de..4e198ca 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -47,6 +47,12 @@ enum wb_reason { WB_REASON_LAPTOP_TIMER, WB_REASON_FREE_MORE_MEM, WB_REASON_FS_FREE_SPACE, + /* + * There is no bdi forker thread any more and works are done + * by emergency worker, however, this is TPs userland visible + * and we'll be exposing exactly the same information, + * so it has a mismatch name. + */ WB_REASON_FORKER_THREAD, WB_REASON_MAX, -- cgit v0.10.2 From f8f191f1addf0b31f188fd88e71e97200871c99c Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 8 Jul 2013 16:00:16 -0700 Subject: mm/page_alloc: fix doc for numa_zonelist_order The default zonelist order selecter will select "node" order if any nodes DMA zone comprises greater than 70% of its local memory instead of 60%, according to default_zonelist_order::low_kmem_size > total * 70/100. Signed-off-by: Wanpeng Li Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index dcc75a9..36ecc26 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -510,7 +510,7 @@ Specify "[Dd]efault" to request automatic configuration. Autoconfiguration will select "node" order in following case. (1) if the DMA zone does not exist or (2) if the DMA zone comprises greater than 50% of the available memory or -(3) if any node's DMA zone comprises greater than 60% of its local memory and +(3) if any node's DMA zone comprises greater than 70% of its local memory and the amount of local memory is big enough. Otherwise, "zone" order will be selected. 
Default order is recommended unless -- cgit v0.10.2 From f49cbdde495f62e1c2d906b16e833cec27de5e59 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 8 Jul 2013 16:00:16 -0700 Subject: mm/thp: fix doc for transparent huge zero page Transparent huge zero page is used during the page fault instead of in khugepaged. # ls /sys/kernel/mm/transparent_hugepage/ defrag enabled khugepaged use_zero_page # ls /sys/kernel/mm/transparent_hugepage/khugepaged/ alloc_sleep_millisecs defrag full_scans max_ptes_none pages_collapsed pages_to_scan scan_sleep_millisecs This patch corrects the documentation just like the codes done. Signed-off-by: Wanpeng Li Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt index 8785fb8..4a63953 100644 --- a/Documentation/vm/transhuge.txt +++ b/Documentation/vm/transhuge.txt @@ -120,8 +120,8 @@ By default kernel tries to use huge zero page on read page fault. It's possible to disable huge zero page by writing 0 or enable it back by writing 1: -echo 0 >/sys/kernel/mm/transparent_hugepage/khugepaged/use_zero_page -echo 1 >/sys/kernel/mm/transparent_hugepage/khugepaged/use_zero_page +echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page +echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page khugepaged will be automatically started when transparent_hugepage/enabled is set to "always" or "madvise, and it'll -- cgit v0.10.2 From 73b44ff43c4b3cf517826da03c51948593f88753 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 8 Jul 2013 16:00:17 -0700 Subject: mm/pgtable: don't accumulate addr during pgd prepopulate pmd The old codes accumulate addr to get right pmd, however, currently pmds are preallocated and transferred as a parameter, it is unnecessary to accumulate addr variable any more, this patch removes it.
Signed-off-by: Wanpeng Li Reviewed-by: Michal Hocko Reviewed-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 17fda6a..dfa537a 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -240,7 +240,6 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) { pud_t *pud; - unsigned long addr; int i; if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ @@ -248,8 +247,7 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) pud = pud_offset(pgd, 0); - for (addr = i = 0; i < PREALLOCATED_PMDS; - i++, pud++, addr += PUD_SIZE) { + for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { pmd_t *pmd = pmds[i]; if (i >= KERNEL_PGD_BOUNDARY) -- cgit v0.10.2 From 64363aad5ff1b878230e91223038c26a2205bff3 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 8 Jul 2013 16:00:18 -0700 Subject: mm: remove unused VM_ macros and expand other in-place These VM_ macros aren't used very often and three of them aren't used at all. Expand the ones that are used in-place, and remove all the now unused #define VM_ macros. VM_READHINTMASK, VM_NormalReadHint and VM_ClearReadHint were added just before 2.4 and appears have never been used. 
Signed-off-by: Joe Perches Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mm.h b/include/linux/mm.h index b87681a..f022460 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -151,12 +151,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_STACK_FLAGS (VM_GROWSDOWN | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) #endif -#define VM_READHINTMASK (VM_SEQ_READ | VM_RAND_READ) -#define VM_ClearReadHint(v) (v)->vm_flags &= ~VM_READHINTMASK -#define VM_NormalReadHint(v) (!((v)->vm_flags & VM_READHINTMASK)) -#define VM_SequentialReadHint(v) ((v)->vm_flags & VM_SEQ_READ) -#define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ) - /* * Special vmas that are non-mergable, non-mlock()able. * Note: mm/huge_memory.c VM_NO_THP depends on this definition. diff --git a/mm/filemap.c b/mm/filemap.c index 7905fe7..4b51ac1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1539,12 +1539,12 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, struct address_space *mapping = file->f_mapping; /* If we don't want any read-ahead, don't bother */ - if (VM_RandomReadHint(vma)) + if (vma->vm_flags & VM_RAND_READ) return; if (!ra->ra_pages) return; - if (VM_SequentialReadHint(vma)) { + if (vma->vm_flags & VM_SEQ_READ) { page_cache_sync_readahead(mapping, ra, file, offset, ra->ra_pages); return; @@ -1584,7 +1584,7 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, struct address_space *mapping = file->f_mapping; /* If we don't want any read-ahead, don't bother */ - if (VM_RandomReadHint(vma)) + if (vma->vm_flags & VM_RAND_READ) return; if (ra->mmap_miss > 0) ra->mmap_miss--; diff --git a/mm/memory.c b/mm/memory.c index b68812d..1ce2e2a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1150,7 +1150,7 @@ again: if (pte_dirty(ptent)) set_page_dirty(page); if (pte_young(ptent) && - likely(!VM_SequentialReadHint(vma))) + likely(!(vma->vm_flags & VM_SEQ_READ))) mark_page_accessed(page); 
rss[MM_FILEPAGES]--; } diff --git a/mm/rmap.c b/mm/rmap.c index e22ceeb..cd356df 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -720,7 +720,7 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, * mapping is already gone, the unmap path will have * set PG_referenced or activated the page. */ - if (likely(!VM_SequentialReadHint(vma))) + if (likely(!(vma->vm_flags & VM_SEQ_READ))) referenced++; } pte_unmap_unlock(pte, ptl); -- cgit v0.10.2 From bcb615a81b1765864c71c50afb56631e7a1e5283 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Mon, 8 Jul 2013 16:00:19 -0700 Subject: mm/vmalloc.c: fix an overflow bug in alloc_vmap_area() When searching a vmap area in the vmalloc space, we use (addr + size - 1) to check if the value is less than addr, which is an overflow. But we assign (addr + size) to vmap_area->va_end. So if we come across the below case: (addr + size - 1) : not overflow (addr + size) : overflow we will assign an overflow value (e.g 0) to vmap_area->va_end, And this will trigger BUG in __insert_vmap_area, causing system panic. So using (addr + size) to check the overflow should be the correct behaviour, not (addr + size - 1). 
Signed-off-by: Zhang Yanfei Reported-by: Ghennadi Procopciuc Tested-by: Daniel Baluta Cc: David Rientjes Cc: Minchan Kim Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a649186..13a5495 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -388,12 +388,12 @@ nocache: addr = ALIGN(first->va_end, align); if (addr < vstart) goto nocache; - if (addr + size - 1 < addr) + if (addr + size < addr) goto overflow; } else { addr = ALIGN(vstart, align); - if (addr + size - 1 < addr) + if (addr + size < addr) goto overflow; n = vmap_area_root.rb_node; @@ -420,7 +420,7 @@ nocache: if (addr + cached_hole_size < first->va_start) cached_hole_size = first->va_start - addr; addr = ALIGN(first->va_end, align); - if (addr + size - 1 < addr) + if (addr + size < addr) goto overflow; if (list_is_last(&first->list, &vmap_area_list)) -- cgit v0.10.2 From ef277c73ca3b1aade278036ae11640090681d558 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Mon, 8 Jul 2013 16:00:21 -0700 Subject: page migration: fix wrong comment in address_space_operations.migratepage() There is no parameter "sync" in address_space_operations->migratepage(). It should be migrate_mode. And the comment is for MIGRATE_ASYNC. Signed-off-by: Tang Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/fs.h b/include/linux/fs.h index 99be011..cb771ec 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -372,8 +372,8 @@ struct address_space_operations { int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **, unsigned long *); /* - * migrate the contents of a page to the specified target. If sync - * is false, it must not block. + * migrate the contents of a page to the specified target. If + * migrate_mode is MIGRATE_ASYNC, it must not block. 
*/ int (*migratepage) (struct address_space *, struct page *, struct page *, enum migrate_mode); -- cgit v0.10.2 From d8bbdd773d64b30b6b36f027ad2e182ed2045f3c Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Mon, 8 Jul 2013 16:00:22 -0700 Subject: mm/memblock.c: fix wrong comment in __next_free_mem_range() Remove one redundant "nid" in the comment. Signed-off-by: Tang Chen Signed-off-by: Linus Torvalds diff --git a/mm/memblock.c b/mm/memblock.c index c5fad93..a847bfe6 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -566,7 +566,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) /** * __next_free_mem_range - next function for for_each_free_mem_range() * @idx: pointer to u64 loop variable - * @nid: nid: node selector, %MAX_NUMNODES for all nodes + * @nid: node selector, %MAX_NUMNODES for all nodes * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL * @out_nid: ptr to int for nid of the range, can be %NULL -- cgit v0.10.2 From 7e9f5eb03d3762ec89dda1888c774ae7b4040af7 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Mon, 8 Jul 2013 16:00:23 -0700 Subject: mm/memory_hotplug.c: fix a comment typo in register_page_bootmem_info_node() Signed-off-by: Tang Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index f5ba127..cd2990f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -208,13 +208,13 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) pfn = pgdat->node_start_pfn; end_pfn = pgdat_end_pfn(pgdat); - /* register_section info */ + /* register section info */ for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { /* * Some platforms can assign the same pfn to multiple nodes - on * node0 as well as nodeN. To avoid registering a pfn against * multiple nodes we check that this pfn does not already - * reside in some other node. + * reside in some other nodes. 
*/ if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node)) register_page_bootmem_info_section(pfn); -- cgit v0.10.2 From 5a1c9cbc1550f93335d7c03eb6c271e642deff04 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Jul 2013 16:00:24 -0700 Subject: mm: vmscan: do not continue scanning if reclaim was aborted for compaction Direct reclaim is not aborting to allow compaction to go ahead properly. do_try_to_free_pages is told to abort reclaim which it happily ignores and instead increases priority until it reaches 0 and starts shrinking file/anon equally. This patch corrects the situation by aborting reclaim when requested instead of raising priority. Signed-off-by: Mel Gorman Cc: Rik van Riel Cc: Johannes Weiner Cc: Michal Hocko Cc: Dave Chinner Cc: Kamezawa Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index 99b3ac7..2385663 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2361,8 +2361,10 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, aborted_reclaim = shrink_zones(zonelist, sc); /* - * Don't shrink slabs when reclaiming memory from - * over limit cgroups + * Don't shrink slabs when reclaiming memory from over limit + * cgroups but do shrink slab at least once when aborting + * reclaim for compaction to avoid unevenly scanning file/anon + * LRU pages over slab pages.
*/ if (global_reclaim(sc)) { unsigned long lru_pages = 0; @@ -2404,7 +2406,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, WB_REASON_TRY_TO_FREE_PAGES); sc->may_writepage = 1; } - } while (--sc->priority >= 0); + } while (--sc->priority >= 0 && !aborted_reclaim); out: delayacct_freepages_end(); -- cgit v0.10.2 From 918fc718c5922520c499ad60f61b8df86b998ae9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 8 Jul 2013 16:00:25 -0700 Subject: mm: vmscan: do not scale writeback pages when deciding whether to set ZONE_WRITEBACK After the patch "mm: vmscan: Flatten kswapd priority loop" was merged the scanning priority of kswapd changed. The priority now rises until it is scanning enough pages to meet the high watermark. shrink_inactive_list sets ZONE_WRITEBACK if a number of pages were encountered under writeback but this value is scaled based on the priority. As kswapd frequently scans with a higher priority now it is relatively easy to set ZONE_WRITEBACK. This patch removes the scaling and treates writeback pages similar to how it treats unqueued dirty pages and congested pages. The user-visible effect should be that kswapd will writeback fewer pages from reclaim context. Signed-off-by: Mel Gorman Cc: Rik van Riel Cc: Johannes Weiner Cc: Michal Hocko Cc: Dave Chinner Cc: Kamezawa Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index 2385663..2cff0d4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1443,25 +1443,11 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * as there is no guarantee the dirtying process is throttled in the * same way balance_dirty_pages() manages. * - * This scales the number of dirty pages that must be under writeback - * before a zone gets flagged ZONE_WRITEBACK. 
It is a simple backoff - * function that has the most effect in the range DEF_PRIORITY to - * DEF_PRIORITY-2 which is the priority reclaim is considered to be - * in trouble and reclaim is considered to be in trouble. - * - * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle - * DEF_PRIORITY-1 50% must be PageWriteback - * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble - * ... - * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any - * isolated page is PageWriteback - * * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number * of pages under pages flagged for immediate reclaim and stall if any * are encountered in the nr_immediate check below. */ - if (nr_writeback && nr_writeback >= - (nr_taken >> (DEF_PRIORITY - sc->priority))) + if (nr_writeback && nr_writeback == nr_taken) zone_set_flag(zone, ZONE_WRITEBACK); /* -- cgit v0.10.2 From 493af578040e690f93f0fc8d9e7667ffff8155bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn=20Engel?= Date: Mon, 8 Jul 2013 16:00:26 -0700 Subject: mmap: allow MAP_HUGETLB for hugetlbfs files v2 It is counterintuitive at best that mmap'ing a hugetlbfs file with MAP_HUGETLB fails, while mmap'ing it without will a) succeed and b) return huge pages. 
v2: use is_file_hugepages(), as suggested by Jianguo Signed-off-by: Joern Engel Cc: Jianguo Wu Signed-off-by: Linus Torvalds diff --git a/mm/mmap.c b/mm/mmap.c index 8468ffd..0718c17 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1358,13 +1358,14 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, if (!(flags & MAP_ANONYMOUS)) { audit_mmap_fd(fd, flags); - if (unlikely(flags & MAP_HUGETLB)) - return -EINVAL; file = fget(fd); if (!file) goto out; if (is_file_hugepages(file)) len = ALIGN(len, huge_page_size(hstate_file(file))); + retval = -EINVAL; + if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file))) + goto out_fput; } else if (flags & MAP_HUGETLB) { struct user_struct *user = NULL; struct hstate *hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & @@ -1391,6 +1392,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); +out_fput: if (file) fput(file); out: -- cgit v0.10.2 From fa460c2d37870e0a6f94c70e8b76d05ca11b6db0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 8 Jul 2013 16:00:27 -0700 Subject: Revert "memcg: avoid dangling reference count in creation failure" This reverts commit e4715f01be697a. mem_cgroup_put is hierarchy aware so mem_cgroup_put(memcg) already drops an additional reference from all parents so the additional mem_cgroup_put(parent) potentially causes use-after-free.
Signed-off-by: Michal Hocko Signed-off-by: Li Zefan Acked-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Tejun Heo Cc: Glauber Costa Cc: Johannes Weiner Cc: [3.9+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 64f7265..6b73d86 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6352,8 +6352,6 @@ mem_cgroup_css_online(struct cgroup *cont) * call __mem_cgroup_free, so return directly */ mem_cgroup_put(memcg); - if (parent->use_hierarchy) - mem_cgroup_put(parent); } return error; } -- cgit v0.10.2 From f37a96914d1aea10fed8d9af10251f0b9caea31b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 8 Jul 2013 16:00:29 -0700 Subject: memcg, kmem: fix reference count handling on the error path mem_cgroup_css_online calls mem_cgroup_put if memcg_init_kmem fails. This is not correct because only memcg_propagate_kmem takes an additional reference while mem_cgroup_sockets_init is allowed to fail as well (although no current implementation fails) but it doesn't take any reference. This all suggests that it should be memcg_propagate_kmem that should clean up after itself so this patch moves mem_cgroup_put over there. Unfortunately this is not that easy (as pointed out by Li Zefan) because memcg_kmem_mark_dead marks the group dead (KMEM_ACCOUNTED_DEAD) if it is marked active (KMEM_ACCOUNTED_ACTIVE) which is the case even if memcg_propagate_kmem fails so the additional reference is dropped in that case in kmem_cgroup_destroy which means that the reference would be dropped two times. The easiest way then would be to simply remove mem_cgroup_put from mem_cgroup_css_online and rely on kmem_cgroup_destroy doing the right thing.
Signed-off-by: Michal Hocko Signed-off-by: Li Zefan Acked-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Tejun Heo Cc: Glauber Costa Cc: Johannes Weiner Cc: [3.8] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6b73d86..bdeb82c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6345,14 +6345,6 @@ mem_cgroup_css_online(struct cgroup *cont) error = memcg_init_kmem(memcg, &mem_cgroup_subsys); mutex_unlock(&memcg_create_mutex); - if (error) { - /* - * We call put now because our (and parent's) refcnts - * are already in place. mem_cgroup_put() will internally - * call __mem_cgroup_free, so return directly - */ - mem_cgroup_put(memcg); - } return error; } -- cgit v0.10.2 From 5347e5ae13710420eebbbd0b22c045685704da80 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 8 Jul 2013 16:00:30 -0700 Subject: memcg: use css_get() in sock_update_memcg() Use css_get/css_put instead of mem_cgroup_get/put. Note, if at the same time someone is moving @current to a different cgroup and removing the old cgroup, css_tryget() may return false, and sock->sk_cgrp won't be initialized, which is fine. 
Signed-off-by: Li Zefan Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Glauber Costa Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bdeb82c..4c31a21a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -551,15 +551,15 @@ void sock_update_memcg(struct sock *sk) */ if (sk->sk_cgrp) { BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); - mem_cgroup_get(sk->sk_cgrp->memcg); + css_get(&sk->sk_cgrp->memcg->css); return; } rcu_read_lock(); memcg = mem_cgroup_from_task(current); cg_proto = sk->sk_prot->proto_cgroup(memcg); - if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) { - mem_cgroup_get(memcg); + if (!mem_cgroup_is_root(memcg) && + memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) { sk->sk_cgrp = cg_proto; } rcu_read_unlock(); @@ -573,7 +573,7 @@ void sock_release_memcg(struct sock *sk) struct mem_cgroup *memcg; WARN_ON(!sk->sk_cgrp->memcg); memcg = sk->sk_cgrp->memcg; - mem_cgroup_put(memcg); + css_put(&sk->sk_cgrp->memcg->css); } } -- cgit v0.10.2 From 20f05310ba62d5816fb339d08effe78683137197 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 8 Jul 2013 16:00:31 -0700 Subject: memcg: don't use mem_cgroup_get() when creating a kmemcg cache Use css_get()/css_put() instead of mem_cgroup_get()/mem_cgroup_put(). There are two things being done in the current code: First, we acquired a css_ref to make sure that the underlying cgroup would not go away. That is a short lived reference, and it is put as soon as the cache is created. At this point, we acquire a long-lived per-cache memcg reference count to guarantee that the memcg will still be alive. so it is: enqueue: css_get create : memcg_get, css_put destroy: memcg_put So we only need to get rid of the memcg_get, change the memcg_put to css_put, and get rid of the now extra css_put. 
(This changelog is mostly written by Glauber) Signed-off-by: Li Zefan Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Tejun Heo Cc: Glauber Costa Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4c31a21a..80175de 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3242,7 +3242,7 @@ void memcg_release_cache(struct kmem_cache *s) list_del(&s->memcg_params->list); mutex_unlock(&memcg->slab_caches_mutex); - mem_cgroup_put(memcg); + css_put(&memcg->css); out: kfree(s->memcg_params); } @@ -3402,16 +3402,18 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, mutex_lock(&memcg_cache_mutex); new_cachep = cachep->memcg_params->memcg_caches[idx]; - if (new_cachep) + if (new_cachep) { + css_put(&memcg->css); goto out; + } new_cachep = kmem_cache_dup(memcg, cachep); if (new_cachep == NULL) { new_cachep = cachep; + css_put(&memcg->css); goto out; } - mem_cgroup_get(memcg); atomic_set(&new_cachep->memcg_params->nr_pages , 0); cachep->memcg_params->memcg_caches[idx] = new_cachep; @@ -3499,8 +3501,6 @@ static void memcg_create_cache_work_func(struct work_struct *w) cw = container_of(w, struct create_work, work); memcg_create_kmem_cache(cw->memcg, cw->cachep); - /* Drop the reference gotten when we enqueued. */ - css_put(&cw->memcg->css); kfree(cw); } -- cgit v0.10.2 From 10d5ebf40ff09db03b97cb177f24b9c7c8b4bb52 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 8 Jul 2013 16:00:33 -0700 Subject: memcg: use css_get/put when charging/uncharging kmem Use css_get/put instead of mem_cgroup_get/put. We can't do a simple replacement, because here mem_cgroup_put() is called during mem_cgroup_css_free(), while mem_cgroup_css_free() won't be called until css refcnt goes down to 0. Instead we increment css refcnt in mem_cgroup_css_offline(), and then check if there's still kmem charges. 
If not, css refcnt will be decremented immediately, otherwise the refcnt will be released after the last kmem allocation is uncharged. [akpm@linux-foundation.org: tweak comment] Signed-off-by: Li Zefan Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Tejun Heo Cc: Michal Hocko Cc: Glauber Costa Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 80175de..bdc9582 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -406,6 +406,11 @@ static void memcg_kmem_clear_activated(struct mem_cgroup *memcg) static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) { + /* + * Our caller must use css_get() first, because memcg_uncharge_kmem() + * will call css_put() if it sees the memcg is dead. + */ + smp_wmb(); if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); } @@ -3050,8 +3055,16 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) if (res_counter_uncharge(&memcg->kmem, size)) return; + /* + * Releases a reference taken in kmem_cgroup_css_offline in case + * this last uncharge is racing with the offlining code or it is + * outliving the memcg existence. + * + * The memory barrier imposed by test&clear is paired with the + * explicit one in memcg_kmem_mark_dead(). + */ if (memcg_kmem_test_and_clear_dead(memcg)) - mem_cgroup_put(memcg); + css_put(&memcg->css); } void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep) @@ -5183,14 +5196,6 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) * starts accounting before all call sites are patched */ memcg_kmem_set_active(memcg); - - /* - * kmem charges can outlive the cgroup. In the case of slab - * pages, for instance, a page contain objects from various - * processes, so it is unfeasible to migrate them away. We - * need to reference count the memcg because of that.
- */ - mem_cgroup_get(memcg); } else ret = res_counter_set_limit(&memcg->kmem, val); out: @@ -5223,12 +5228,10 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) goto out; /* - * destroy(), called if we fail, will issue static_key_slow_inc() and - * mem_cgroup_put() if kmem is enabled. We have to either call them - * unconditionally, or clear the KMEM_ACTIVE flag. I personally find - * this more consistent, since it always leads to the same destroy path + * __mem_cgroup_free() will issue static_key_slow_dec() because this + * memcg is active already. If the later initialization fails then the + * cgroup core triggers the cleanup so we do not have to do it here. */ - mem_cgroup_get(memcg); static_key_slow_inc(&memcg_kmem_enabled_key); mutex_lock(&set_limit_mutex); @@ -5913,23 +5916,43 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) return mem_cgroup_sockets_init(memcg, ss); } -static void kmem_cgroup_destroy(struct mem_cgroup *memcg) +static void memcg_destroy_kmem(struct mem_cgroup *memcg) { mem_cgroup_sockets_destroy(memcg); +} + +static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) +{ + if (!memcg_kmem_is_active(memcg)) + return; + + /* + * kmem charges can outlive the cgroup. In the case of slab + * pages, for instance, a page contain objects from various + * processes. As we prevent from taking a reference for every + * such allocation we have to be careful when doing uncharge + * (see memcg_uncharge_kmem) and here during offlining. + * + * The idea is that that only the _last_ uncharge which sees + * the dead memcg will drop the last reference. An additional + * reference is taken here before the group is marked dead + * which is then paired with css_put during uncharge resp. here. 
+ * + * Although this might sound strange as this path is called from + * css_offline() when the referencemight have dropped down to 0 + * and shouldn't be incremented anymore (css_tryget would fail) + * we do not have other options because of the kmem allocations + * lifetime. + */ + css_get(&memcg->css); memcg_kmem_mark_dead(memcg); if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) return; - /* - * Charges already down to 0, undo mem_cgroup_get() done in the charge - * path here, being careful not to race with memcg_uncharge_kmem: it is - * possible that the charges went down to 0 between mark_dead and the - * res_counter read, so in that case, we don't need the put - */ if (memcg_kmem_test_and_clear_dead(memcg)) - mem_cgroup_put(memcg); + css_put(&memcg->css); } #else static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) @@ -5937,7 +5960,11 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) return 0; } -static void kmem_cgroup_destroy(struct mem_cgroup *memcg) +static void memcg_destroy_kmem(struct mem_cgroup *memcg) +{ +} + +static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) { } #endif @@ -6370,6 +6397,8 @@ static void mem_cgroup_css_offline(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + kmem_cgroup_css_offline(memcg); + mem_cgroup_invalidate_reclaim_iterators(memcg); mem_cgroup_reparent_charges(memcg); mem_cgroup_destroy_all_caches(memcg); @@ -6379,9 +6408,8 @@ static void mem_cgroup_css_free(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - kmem_cgroup_destroy(memcg); - - mem_cgroup_put(memcg); + memcg_destroy_kmem(memcg); + __mem_cgroup_free(memcg); } #ifdef CONFIG_MMU -- cgit v0.10.2 From 4050377b509b326c14b275fedb2f69b46f37a7a9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 8 Jul 2013 16:00:34 -0700 Subject: memcg: use css_get/put for swap memcg Use css_get/put instead of mem_cgroup_get/put. 
A simple replacement will do. The historical reason that memcg has its own refcnt instead of always using css_get/put, is that cgroup couldn't be removed if there're still css refs, so css refs can't be used as long-lived reference. The situation has changed so that rmdir a cgroup will succeed regardless css refs, but won't be freed until css refs goes down to 0. Signed-off-by: Li Zefan Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Tejun Heo Cc: Glauber Costa Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bdc9582..76c0c99 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4231,12 +4231,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, unlock_page_cgroup(pc); /* * even after unlock, we have memcg->res.usage here and this memcg - * will never be freed. + * will never be freed, so it's safe to call css_get(). */ memcg_check_events(memcg, page); if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { mem_cgroup_swap_statistics(memcg, true); - mem_cgroup_get(memcg); + css_get(&memcg->css); } /* * Migration does not charge the res_counter for the @@ -4348,7 +4348,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) /* * record memcg information, if swapout && memcg != NULL, - * mem_cgroup_get() was called in uncharge(). + * css_get() was called in uncharge(). */ if (do_swap_account && swapout && memcg) swap_cgroup_record(ent, css_id(&memcg->css)); @@ -4379,7 +4379,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) if (!mem_cgroup_is_root(memcg)) res_counter_uncharge(&memcg->memsw, PAGE_SIZE); mem_cgroup_swap_statistics(memcg, false); - mem_cgroup_put(memcg); + css_put(&memcg->css); } rcu_read_unlock(); } @@ -4413,11 +4413,14 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, * This function is only called from task migration context now. 
* It postpones res_counter and refcount handling till the end * of task migration(mem_cgroup_clear_mc()) for performance - * improvement. But we cannot postpone mem_cgroup_get(to) - * because if the process that has been moved to @to does - * swap-in, the refcount of @to might be decreased to 0. + * improvement. But we cannot postpone css_get(to) because if + * the process that has been moved to @to does swap-in, the + * refcount of @to might be decreased to 0. + * + * We are in attach() phase, so the cgroup is guaranteed to be + * alive, so we can just call css_get(). */ - mem_cgroup_get(to); + css_get(&to->css); return 0; } return -EINVAL; @@ -6718,6 +6721,7 @@ static void __mem_cgroup_clear_mc(void) { struct mem_cgroup *from = mc.from; struct mem_cgroup *to = mc.to; + int i; /* we must uncharge all the leftover precharges from mc.to */ if (mc.precharge) { @@ -6738,7 +6742,9 @@ static void __mem_cgroup_clear_mc(void) if (!mem_cgroup_is_root(mc.from)) res_counter_uncharge(&mc.from->memsw, PAGE_SIZE * mc.moved_swap); - __mem_cgroup_put(mc.from, mc.moved_swap); + + for (i = 0; i < mc.moved_swap; i++) + css_put(&mc.from->css); if (!mem_cgroup_is_root(mc.to)) { /* @@ -6748,7 +6754,7 @@ static void __mem_cgroup_clear_mc(void) res_counter_uncharge(&mc.to->res, PAGE_SIZE * mc.moved_swap); } - /* we've already done mem_cgroup_get(mc.to) */ + /* we've already done css_get(mc.to) */ mc.moved_swap = 0; } memcg_oom_recover(from); -- cgit v0.10.2 From 8d76a9797882fc517d87e2b5db2a4f04edaeccec Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 8 Jul 2013 16:00:36 -0700 Subject: memcg: don't need to get a reference to the parent The cgroup core guarantees it's always safe to access the parent. 
Signed-off-by: Li Zefan Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Tejun Heo Cc: Glauber Costa Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 76c0c99..c508258 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -503,7 +503,6 @@ enum res_type { */ static DEFINE_MUTEX(memcg_create_mutex); -static void mem_cgroup_get(struct mem_cgroup *memcg); static void mem_cgroup_put(struct mem_cgroup *memcg); static inline @@ -6239,19 +6238,10 @@ static void free_rcu(struct rcu_head *rcu_head) schedule_work(&memcg->work_freeing); } -static void mem_cgroup_get(struct mem_cgroup *memcg) -{ - atomic_inc(&memcg->refcnt); -} - static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) { - if (atomic_sub_and_test(count, &memcg->refcnt)) { - struct mem_cgroup *parent = parent_mem_cgroup(memcg); + if (atomic_sub_and_test(count, &memcg->refcnt)) call_rcu(&memcg->rcu_freeing, free_rcu); - if (parent) - mem_cgroup_put(parent); - } } static void mem_cgroup_put(struct mem_cgroup *memcg) @@ -6354,12 +6344,9 @@ mem_cgroup_css_online(struct cgroup *cont) res_counter_init(&memcg->kmem, &parent->kmem); /* - * We increment refcnt of the parent to ensure that we can - * safely access it on res_counter_charge/uncharge. - * This refcnt will be decremented when freeing this - * mem_cgroup(see mem_cgroup_put). + * No need to take a reference to the parent because cgroup + * core guarantees its existence. */ - mem_cgroup_get(parent); } else { res_counter_init(&memcg->res, NULL); res_counter_init(&memcg->memsw, NULL); -- cgit v0.10.2 From e0743e6bc5b7587dd0bfa902d67d3f81ef3f6618 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 8 Jul 2013 16:00:37 -0700 Subject: memcg: kill memcg refcnt Now memcg has the same life cycle as its corresponding cgroup. Kill the useless refcnt. 
Signed-off-by: Li Zefan Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Tejun Heo Cc: Glauber Costa Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c508258..fa521a2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -299,8 +299,6 @@ struct mem_cgroup { bool oom_lock; atomic_t under_oom; - atomic_t refcnt; - int swappiness; /* OOM-Killer disable */ int oom_kill_disable; @@ -503,8 +501,6 @@ enum res_type { */ static DEFINE_MUTEX(memcg_create_mutex); -static void mem_cgroup_put(struct mem_cgroup *memcg); - static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) { @@ -6238,17 +6234,6 @@ static void free_rcu(struct rcu_head *rcu_head) schedule_work(&memcg->work_freeing); } -static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) -{ - if (atomic_sub_and_test(count, &memcg->refcnt)) - call_rcu(&memcg->rcu_freeing, free_rcu); -} - -static void mem_cgroup_put(struct mem_cgroup *memcg) -{ - __mem_cgroup_put(memcg, 1); -} - /* * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 
*/ @@ -6308,7 +6293,6 @@ mem_cgroup_css_alloc(struct cgroup *cont) memcg->last_scanned_node = MAX_NUMNODES; INIT_LIST_HEAD(&memcg->oom_notify); - atomic_set(&memcg->refcnt, 1); memcg->move_charge_at_immigrate = 0; mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); @@ -6399,7 +6383,7 @@ static void mem_cgroup_css_free(struct cgroup *cont) struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); memcg_destroy_kmem(memcg); - __mem_cgroup_free(memcg); + call_rcu(&memcg->rcu_freeing, free_rcu); } #ifdef CONFIG_MMU -- cgit v0.10.2 From 465939a1fa283cf2a5194362c5accf4429c99c42 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 8 Jul 2013 16:00:38 -0700 Subject: memcg: don't need to free memcg via RCU or workqueue Now memcg has the same life cycle with its corresponding cgroup, and a cgroup is freed via RCU and then mem_cgroup_css_free() will be called in a work function, so we can simply call __mem_cgroup_free() in mem_cgroup_css_free(). This actually reverts commit 59927fb984d ("memcg: free mem_cgroup by RCU to fix oops"). Signed-off-by: Li Zefan Cc: Hugh Dickins Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Tejun Heo Cc: Glauber Costa Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fa521a2..d12ca6f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -263,28 +263,10 @@ struct mem_cgroup { /* vmpressure notifications */ struct vmpressure vmpressure; - union { - /* - * the counter to account for mem+swap usage. - */ - struct res_counter memsw; - - /* - * rcu_freeing is used only when freeing struct mem_cgroup, - * so put it into a union to avoid wasting more memory. - * It must be disjoint from the css field. It could be - * in a union with the res field, but res plays a much - * larger part in mem_cgroup life than memsw, and might - * be of interest, even at time of free, when debugging. - * So share rcu_head with the less interesting memsw. 
- */ - struct rcu_head rcu_freeing; - /* - * We also need some space for a worker in deferred freeing. - * By the time we call it, rcu_freeing is no longer in use. - */ - struct work_struct work_freeing; - }; + /* + * the counter to account for mem+swap usage. + */ + struct res_counter memsw; /* * the counter to account for kernel memory usage. @@ -6211,29 +6193,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) vfree(memcg); } - -/* - * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, - * but in process context. The work_freeing structure is overlaid - * on the rcu_freeing structure, which itself is overlaid on memsw. - */ -static void free_work(struct work_struct *work) -{ - struct mem_cgroup *memcg; - - memcg = container_of(work, struct mem_cgroup, work_freeing); - __mem_cgroup_free(memcg); -} - -static void free_rcu(struct rcu_head *rcu_head) -{ - struct mem_cgroup *memcg; - - memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); - INIT_WORK(&memcg->work_freeing, free_work); - schedule_work(&memcg->work_freeing); -} - /* * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. */ @@ -6383,7 +6342,7 @@ static void mem_cgroup_css_free(struct cgroup *cont) struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); memcg_destroy_kmem(memcg); - call_rcu(&memcg->rcu_freeing, free_rcu); + __mem_cgroup_free(memcg); } #ifdef CONFIG_MMU -- cgit v0.10.2 From 5f12733e9d976132e6cbbae9d08f71406fdacdfb Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 8 Jul 2013 16:00:40 -0700 Subject: mm: honor min_free_kbytes set by user min_free_kbytes is updated during memory hotplug (by init_per_zone_wmark_min) currently which is right thing to do in most cases but this could be unexpected if admin increased the value to prevent from allocation failures and the new min_free_kbytes would be decreased as a result of memory hotadd. 
This patch saves the user defined value and allows updating min_free_kbytes only if it is higher than the saved one. A warning is printed when the new value is ignored. Signed-off-by: Michal Hocko Cc: Mel Gorman Acked-by: Zhang Yanfei Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b5855e5..b100255 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -204,6 +204,7 @@ static char * const zone_names[MAX_NR_ZONES] = { }; int min_free_kbytes = 1024; +int user_min_free_kbytes; static unsigned long __meminitdata nr_kernel_pages; static unsigned long __meminitdata nr_all_pages; @@ -5589,14 +5590,21 @@ static void __meminit setup_per_zone_inactive_ratio(void) int __meminit init_per_zone_wmark_min(void) { unsigned long lowmem_kbytes; + int new_min_free_kbytes; lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); - - min_free_kbytes = int_sqrt(lowmem_kbytes * 16); - if (min_free_kbytes < 128) - min_free_kbytes = 128; - if (min_free_kbytes > 65536) - min_free_kbytes = 65536; + new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); + + if (new_min_free_kbytes > user_min_free_kbytes) { + min_free_kbytes = new_min_free_kbytes; + if (min_free_kbytes < 128) + min_free_kbytes = 128; + if (min_free_kbytes > 65536) + min_free_kbytes = 65536; + } else { + pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", + new_min_free_kbytes, user_min_free_kbytes); + } setup_per_zone_wmarks(); refresh_zone_stat_thresholds(); setup_per_zone_lowmem_reserve(); @@ -5614,8 +5622,10 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, buffer, length, ppos); - if (write) + if (write) { + user_min_free_kbytes = min_free_kbytes; setup_per_zone_wmarks(); + } return 0; } -- cgit v0.10.2 From 0a1be15097a5f5ee8cbaf7cf0a55146363db0e4d Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: 
Mon, 8 Jul 2013 16:00:41 -0700 Subject: mm/memory_hotplug.c: fix return value of online_pages() online_pages() is called from memory_block_action() when a user requests to online a memory block via sysfs. This function needs to return a proper error value in case of error. Signed-off-by: Toshi Kani Cc: Yasuaki Ishimatsu Cc: Tang Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index cd2990f..ca1dd3a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -914,19 +914,19 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && !can_online_high_movable(zone)) { unlock_memory_hotplug(); - return -1; + return -EINVAL; } if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { unlock_memory_hotplug(); - return -1; + return -EINVAL; } } if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { unlock_memory_hotplug(); - return -1; + return -EINVAL; } } -- cgit v0.10.2 From dcb6b45254e2281b6f99ea7f2d51343954aa3ba8 Mon Sep 17 00:00:00 2001 From: Alex Thorlton Date: Mon, 8 Jul 2013 16:00:42 -0700 Subject: panic: add cpu/pid to warn_slowpath_common in WARNING printk()s Add the cpu/pid that called WARN() so that the stack traces can be matched up with the WARNING messages. 
[akpm@linux-foundation.org: remove stray quote] Signed-off-by: Alex Thorlton Reviewed-by: Robin Holt Cc: Stephen Boyd Cc: Vikram Mulukutla Cc: Rusty Russell Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/panic.c b/kernel/panic.c index 167ec09..9771231 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -399,8 +399,9 @@ struct slowpath_args { static void warn_slowpath_common(const char *file, int line, void *caller, unsigned taint, struct slowpath_args *args) { - printk(KERN_WARNING "------------[ cut here ]------------\n"); - printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); + pr_warn("------------[ cut here ]------------\n"); + pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n", + raw_smp_processor_id(), current->pid, file, line, caller); if (args) vprintk(args->fmt, args->args); -- cgit v0.10.2 From c707a81de71a27a499fde60fbb963f60602c1a94 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 8 Jul 2013 16:00:43 -0700 Subject: checkpatch: make the CamelCase cache work for non-git trees too Might as well check include timestamps and cache the include file CamelCase uses for the non-git case too. The camelcase cache file is now named: for git: .checkpatch-camelcase.git. for non-git: .checkpatch-camelcase.date. All .checkpatch-camelcase* files are deleted if not current. 
Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 6afcd12..2ee9eb7 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -6,6 +6,7 @@ # Licensed under the terms of the GNU GPL License version 2 use strict; +use POSIX; my $P = $0; $P =~ s@.*/@@g; @@ -399,37 +400,52 @@ sub seed_camelcase_includes { return if ($camelcase_seeded); my $files; - my $camelcase_git_file = ""; + my $camelcase_cache = ""; + my @include_files = (); + + $camelcase_seeded = 1; if (-d ".git") { my $git_last_include_commit = `git log --no-merges --pretty=format:"%h%n" -1 -- include`; chomp $git_last_include_commit; - $camelcase_git_file = ".checkpatch-camelcase.$git_last_include_commit"; - if (-f $camelcase_git_file) { - open(my $camelcase_file, '<', "$camelcase_git_file") - or warn "$P: Can't read '$camelcase_git_file' $!\n"; - while (<$camelcase_file>) { - chomp; - $camelcase{$_} = 1; - } - close($camelcase_file); - - return; - } - $files = `git ls-files include`; + $camelcase_cache = ".checkpatch-camelcase.git.$git_last_include_commit"; } else { + my $last_mod_date = 0; $files = `find $root/include -name "*.h"`; + @include_files = split('\n', $files); + foreach my $file (@include_files) { + my $date = POSIX::strftime("%Y%m%d%H%M", + localtime((stat $file)[9])); + $last_mod_date = $date if ($last_mod_date < $date); + } + $camelcase_cache = ".checkpatch-camelcase.date.$last_mod_date"; + } + + if ($camelcase_cache ne "" && -f $camelcase_cache) { + open(my $camelcase_file, '<', "$camelcase_cache") + or warn "$P: Can't read '$camelcase_cache' $!\n"; + while (<$camelcase_file>) { + chomp; + $camelcase{$_} = 1; + } + close($camelcase_file); + + return; + } + + if (-d ".git") { + $files = `git ls-files "include/*.h"`; + @include_files = split('\n', $files); } - my @include_files = split('\n', $files); + foreach my $file (@include_files) { seed_camelcase_file($file); } - $camelcase_seeded = 
1; - if ($camelcase_git_file ne "") { + if ($camelcase_cache ne "") { unlink glob ".checkpatch-camelcase.*"; - open(my $camelcase_file, '>', "$camelcase_git_file") - or warn "$P: Can't write '$camelcase_git_file' $!\n"; + open(my $camelcase_file, '>', "$camelcase_cache") + or warn "$P: Can't write '$camelcase_cache' $!\n"; foreach (sort { lc($a) cmp lc($b) } keys(%camelcase)) { print $camelcase_file ("$_\n"); } -- cgit v0.10.2 From 2417898b34ad3fbf2f31771c97ba87792bf97f0c Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 8 Jul 2013 16:00:44 -0700 Subject: ncpfs: fix error return code in ncp_parse_options() Fix to return -EINVAL from the option parse error handling case instead of 0, as done elsewhere in this function. Signed-off-by: Wei Yongjun Cc: Petr Vandrovec Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 0765ad1..4659da6 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -403,18 +403,24 @@ static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options) switch (optval) { case 'u': data->uid = make_kuid(current_user_ns(), optint); - if (!uid_valid(data->uid)) + if (!uid_valid(data->uid)) { + ret = -EINVAL; goto err; + } break; case 'g': data->gid = make_kgid(current_user_ns(), optint); - if (!gid_valid(data->gid)) + if (!gid_valid(data->gid)) { + ret = -EINVAL; goto err; + } break; case 'o': data->mounted_uid = make_kuid(current_user_ns(), optint); - if (!uid_valid(data->mounted_uid)) + if (!uid_valid(data->mounted_uid)) { + ret = -EINVAL; goto err; + } break; case 'm': data->file_mode = optint; -- cgit v0.10.2 From 4e80b1880c5a31d051d1e4a7377dec0a20701c23 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Mon, 8 Jul 2013 16:00:45 -0700 Subject: drivers/rtc/rtc-stmp3xxx.c: check the return value from stmp_reset_block() stmp_reset_block() may fail, so let's check its return value and propagate it in the case of error. 
Signed-off-by: Fabio Estevam Acked-by: Shawn Guo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/rtc/rtc-stmp3xxx.c b/drivers/rtc/rtc-stmp3xxx.c index 90a3e86..767fee2 100644 --- a/drivers/rtc/rtc-stmp3xxx.c +++ b/drivers/rtc/rtc-stmp3xxx.c @@ -261,7 +261,12 @@ static int stmp3xxx_rtc_probe(struct platform_device *pdev) platform_set_drvdata(pdev, rtc_data); - stmp_reset_block(rtc_data->io); + err = stmp_reset_block(rtc_data->io); + if (err) { + dev_err(&pdev->dev, "stmp_reset_block failed: %d\n", err); + return err; + } + writel(STMP3XXX_RTC_PERSISTENT0_ALARM_EN | STMP3XXX_RTC_PERSISTENT0_ALARM_WAKE_EN | STMP3XXX_RTC_PERSISTENT0_ALARM_WAKE, -- cgit v0.10.2 From 6e5b93ee55d401f1619092fb675b57c28c9ed7ec Mon Sep 17 00:00:00 2001 From: Mike Lockwood Date: Mon, 8 Jul 2013 16:00:46 -0700 Subject: fatfs: add FAT_IOCTL_GET_VOLUME_ID This patch, originally from Android kernel, adds vfat ioctl command FAT_IOCTL_GET_VOLUME_ID, with this command we can get the vfat volume ID using following code: ioctl(fd, FAT_IOCTL_GET_VOLUME_ID, &volume_ID) This patch is a modified version of the patch by Mike Lockwood, with changes from Dmitry Pervushin, who noticed the original patch makes some volume IDs ambiguous with error returns: for example, if volume id is 0xFFFFFDAD, that matches -ENOIOCTLCMD, we get "FFFFFFFF" from the user space. So add a parameter to ioctl to get the correct volume ID. Android uses vfat volume ID to identify different sd card, when a new sd card is inserted to device, android can scan the media on it and pop up new contents.
Signed-off-by: Bintian Wang Cc: dmitry pervushin Cc: Mike Lockwood Cc: Colin Cross Acked-by: OGAWA Hirofumi Cc: John Stultz Cc: Sean McNeil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 21664fc..4241e6f 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -86,6 +86,7 @@ struct msdos_sb_info { const void *dir_ops; /* Opaque; default directory operations */ int dir_per_block; /* dir entries per block */ int dir_per_block_bits; /* log2(dir_per_block) */ + unsigned int vol_id; /*volume ID*/ int fatent_shift; struct fatent_operations *fatent_ops; diff --git a/fs/fat/file.c b/fs/fat/file.c index b0b632e..9b104f5 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -114,6 +114,12 @@ out: return err; } +static int fat_ioctl_get_volume_id(struct inode *inode, u32 __user *user_attr) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + return put_user(sbi->vol_id, user_attr); +} + long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -124,6 +130,8 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return fat_ioctl_get_attributes(inode, user_attr); case FAT_IOCTL_SET_ATTRIBUTES: return fat_ioctl_set_attributes(filp, user_attr); + case FAT_IOCTL_GET_VOLUME_ID: + return fat_ioctl_get_volume_id(inode, user_attr); default: return -ENOTTY; /* Inappropriate ioctl for device */ } diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 5d4513c..11b51bb 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1415,6 +1415,18 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, brelse(fsinfo_bh); } + /* interpret volume ID as a little endian 32 bit integer */ + if (sbi->fat_bits == 32) + sbi->vol_id = (((u32)b->fat32.vol_id[0]) | + ((u32)b->fat32.vol_id[1] << 8) | + ((u32)b->fat32.vol_id[2] << 16) | + ((u32)b->fat32.vol_id[3] << 24)); + else /* fat 16 or 12 */ + sbi->vol_id = (((u32)b->fat16.vol_id[0]) | + 
((u32)b->fat16.vol_id[1] << 8) | + ((u32)b->fat16.vol_id[2] << 16) | + ((u32)b->fat16.vol_id[3] << 24)); + sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry); sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1; diff --git a/include/uapi/linux/msdos_fs.h b/include/uapi/linux/msdos_fs.h index f055e58..e284ff9 100644 --- a/include/uapi/linux/msdos_fs.h +++ b/include/uapi/linux/msdos_fs.h @@ -104,6 +104,8 @@ struct __fat_dirent { /* has used 0x72 ('r') in collision, so skip a few */ #define FAT_IOCTL_GET_ATTRIBUTES _IOR('r', 0x10, __u32) #define FAT_IOCTL_SET_ATTRIBUTES _IOW('r', 0x11, __u32) +/*Android kernel has used 0x12, so we use 0x13*/ +#define FAT_IOCTL_GET_VOLUME_ID _IOR('r', 0x13, __u32) struct fat_boot_sector { __u8 ignored[3]; /* Boot strap short or near jump */ @@ -128,6 +130,10 @@ struct fat_boot_sector { __u8 drive_number; /* Physical drive number */ __u8 state; /* undocumented, but used for mount state. */ + __u8 signature; /* extended boot signature */ + __u8 vol_id[4]; /* volume ID */ + __u8 vol_label[11]; /* volume label */ + __u8 fs_type[8]; /* file system type */ /* other fiealds are not added here */ } fat16; @@ -147,6 +153,10 @@ struct fat_boot_sector { __u8 drive_number; /* Physical drive number */ __u8 state; /* undocumented, but used for mount state. */ + __u8 signature; /* extended boot signature */ + __u8 vol_id[4]; /* volume ID */ + __u8 vol_label[11]; /* volume label */ + __u8 fs_type[8]; /* file system type */ /* other fiealds are not added here */ } fat32; }; -- cgit v0.10.2 From 02be46fba4b154b4a201a729b2d2b4ff6affd031 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:00:47 -0700 Subject: ptrace/x86: revert "hw_breakpoints: Fix racy access to ptrace breakpoints" This reverts commit 87dc669ba257 ("hw_breakpoints: Fix racy access to ptrace breakpoints"). 
The patch was fine but we can no longer race with SIGKILL after commit 9899d11f6544 ("ptrace: ensure arch_ptrace/ptrace_request can never race with SIGKILL"), the __TASK_TRACED tracee can't be woken up and ->ptrace_bps[] can't go away. The patch only removes ptrace_get_breakpoints/ptrace_put_breakpoints and does a couple of "while at it" cleanups, it doesn't remove other changes from the reverted commit. Signed-off-by: Oleg Nesterov Acked-by: Ingo Molnar Acked-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 29a8120..7a98b21 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -641,9 +641,6 @@ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) unsigned len, type; struct perf_event *bp; - if (ptrace_get_breakpoints(tsk) < 0) - return -ESRCH; - data &= ~DR_CONTROL_RESERVED; old_dr7 = ptrace_get_dr7(thread->ptrace_bps); restore: @@ -692,9 +689,7 @@ restore: goto restore; } - ptrace_put_breakpoints(tsk); - - return ((orig_ret < 0) ? orig_ret : rc); + return orig_ret < 0 ? 
orig_ret : rc; } /* @@ -706,18 +701,10 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) unsigned long val = 0; if (n < HBP_NUM) { - struct perf_event *bp; + struct perf_event *bp = thread->ptrace_bps[n]; - if (ptrace_get_breakpoints(tsk) < 0) - return -ESRCH; - - bp = thread->ptrace_bps[n]; - if (!bp) - val = 0; - else + if (bp) val = bp->hw.info.address; - - ptrace_put_breakpoints(tsk); } else if (n == 6) { val = thread->debugreg6; } else if (n == 7) { @@ -734,9 +721,6 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, struct perf_event_attr attr; int err = 0; - if (ptrace_get_breakpoints(tsk) < 0) - return -ESRCH; - if (!t->ptrace_bps[nr]) { ptrace_breakpoint_init(&attr); /* @@ -762,7 +746,7 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, */ if (IS_ERR(bp)) { err = PTR_ERR(bp); - goto put; + goto out; } t->ptrace_bps[nr] = bp; @@ -773,9 +757,7 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, attr.bp_addr = addr; err = modify_user_hw_breakpoint(bp, &attr); } - -put: - ptrace_put_breakpoints(tsk); +out: return err; } -- cgit v0.10.2 From 6961ed96f14463d7c6e38d8c2093f5d53bd70574 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:00:49 -0700 Subject: ptrace/powerpc: revert "hw_breakpoints: Fix racy access to ptrace breakpoints" This reverts commit 07fa7a0a8a58 ("hw_breakpoints: Fix racy access to ptrace breakpoints") and removes ptrace_get/put_breakpoints() added by other commits. The patch was fine but we can no longer race with SIGKILL after commit 9899d11f6544 ("ptrace: ensure arch_ptrace/ptrace_request can never race with SIGKILL"), the __TASK_TRACED tracee can't be woken up and ->ptrace_bps[] can't go away. 
Signed-off-by: Oleg Nesterov Acked-by: Michael Neuling Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 64f7bd5..9a0d24c 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -975,16 +975,12 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, hw_brk.type = (data & HW_BRK_TYPE_DABR) | HW_BRK_TYPE_PRIV_ALL; hw_brk.len = 8; #ifdef CONFIG_HAVE_HW_BREAKPOINT - if (ptrace_get_breakpoints(task) < 0) - return -ESRCH; - bp = thread->ptrace_bps[0]; if ((!data) || !(hw_brk.type & HW_BRK_TYPE_RDWR)) { if (bp) { unregister_hw_breakpoint(bp); thread->ptrace_bps[0] = NULL; } - ptrace_put_breakpoints(task); return 0; } if (bp) { @@ -997,11 +993,9 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, ret = modify_user_hw_breakpoint(bp, &attr); if (ret) { - ptrace_put_breakpoints(task); return ret; } thread->ptrace_bps[0] = bp; - ptrace_put_breakpoints(task); thread->hw_brk = hw_brk; return 0; } @@ -1016,12 +1010,9 @@ int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, ptrace_triggered, NULL, task); if (IS_ERR(bp)) { thread->ptrace_bps[0] = NULL; - ptrace_put_breakpoints(task); return PTR_ERR(bp); } - ptrace_put_breakpoints(task); - #endif /* CONFIG_HAVE_HW_BREAKPOINT */ task->thread.hw_brk = hw_brk; #else /* CONFIG_PPC_ADV_DEBUG_REGS */ @@ -1440,26 +1431,19 @@ static long ppc_set_hwdebug(struct task_struct *child, if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE) brk.type |= HW_BRK_TYPE_WRITE; #ifdef CONFIG_HAVE_HW_BREAKPOINT - if (ptrace_get_breakpoints(child) < 0) - return -ESRCH; - /* * Check if the request is for 'range' breakpoints. We can * support it if range < 8 bytes. 
*/ - if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE) { + if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE) len = bp_info->addr2 - bp_info->addr; - } else if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_EXACT) + else if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_EXACT) len = 1; - else { - ptrace_put_breakpoints(child); + else return -EINVAL; - } bp = thread->ptrace_bps[0]; - if (bp) { - ptrace_put_breakpoints(child); + if (bp) return -ENOSPC; - } /* Create a new breakpoint request if one doesn't exist already */ hw_breakpoint_init(&attr); @@ -1471,11 +1455,9 @@ static long ppc_set_hwdebug(struct task_struct *child, ptrace_triggered, NULL, child); if (IS_ERR(bp)) { thread->ptrace_bps[0] = NULL; - ptrace_put_breakpoints(child); return PTR_ERR(bp); } - ptrace_put_breakpoints(child); return 1; #endif /* CONFIG_HAVE_HW_BREAKPOINT */ @@ -1519,16 +1501,12 @@ static long ppc_del_hwdebug(struct task_struct *child, long data) return -EINVAL; #ifdef CONFIG_HAVE_HW_BREAKPOINT - if (ptrace_get_breakpoints(child) < 0) - return -ESRCH; - bp = thread->ptrace_bps[0]; if (bp) { unregister_hw_breakpoint(bp); thread->ptrace_bps[0] = NULL; } else ret = -ENOENT; - ptrace_put_breakpoints(child); return ret; #else /* CONFIG_HAVE_HW_BREAKPOINT */ if (child->thread.hw_brk.address == 0) -- cgit v0.10.2 From 6af9df7f5ba35806a5919d3a36d95fd40e210b89 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:00:51 -0700 Subject: ptrace/arm: revert "hw_breakpoints: Fix racy access to ptrace breakpoints" This reverts commit bf0b8f4b55e5 ("hw_breakpoints: Fix racy access to ptrace breakpoints"). The patch was fine but we can no longer race with SIGKILL after commit 9899d11f6544 ("ptrace: ensure arch_ptrace/ptrace_request can never race with SIGKILL"), the __TASK_TRACED tracee can't be woken up and ->ptrace_bps[] can't go away. 
Signed-off-by: Oleg Nesterov Acked-by: Will Deacon Cc: Benjamin Herrenschmidt Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 2bc1514..0dd3b79 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -886,20 +886,12 @@ long arch_ptrace(struct task_struct *child, long request, #ifdef CONFIG_HAVE_HW_BREAKPOINT case PTRACE_GETHBPREGS: - if (ptrace_get_breakpoints(child) < 0) - return -ESRCH; - ret = ptrace_gethbpregs(child, addr, (unsigned long __user *)data); - ptrace_put_breakpoints(child); break; case PTRACE_SETHBPREGS: - if (ptrace_get_breakpoints(child) < 0) - return -ESRCH; - ret = ptrace_sethbpregs(child, addr, (unsigned long __user *)data); - ptrace_put_breakpoints(child); break; #endif -- cgit v0.10.2 From e8c073c4ff51207f5c1c37fb054360bbc0f38251 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:00:52 -0700 Subject: ptrace/sh: revert "hw_breakpoints: Fix racy access to ptrace breakpoints" This reverts commit e0ac8457d020 ("hw_breakpoints: Fix racy access to ptrace breakpoints"). The patch was fine but we can no longer race with SIGKILL after commit 9899d11f6544 ("ptrace: ensure arch_ptrace/ptrace_request can never race with SIGKILL"), the __TASK_TRACED tracee can't be woken up and ->ptrace_bps[] can't go away. 
Signed-off-by: Oleg Nesterov Cc: Paul Mundt Cc: Benjamin Herrenschmidt Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c index 81f999a..668c816 100644 --- a/arch/sh/kernel/ptrace_32.c +++ b/arch/sh/kernel/ptrace_32.c @@ -117,11 +117,7 @@ void user_enable_single_step(struct task_struct *child) set_tsk_thread_flag(child, TIF_SINGLESTEP); - if (ptrace_get_breakpoints(child) < 0) - return; - set_single_step(child, pc); - ptrace_put_breakpoints(child); } void user_disable_single_step(struct task_struct *child) -- cgit v0.10.2 From 7c8df28633bf0b7eb253f866029be0ac59ddb062 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:00:54 -0700 Subject: ptrace: revert "Prepare to fix racy accesses on task breakpoints" This reverts commit bf26c018490c ("Prepare to fix racy accesses on task breakpoints"). The patch was fine but we can no longer race with SIGKILL after commit 9899d11f6544 ("ptrace: ensure arch_ptrace/ptrace_request can never race with SIGKILL"), the __TASK_TRACED tracee can't be woken up and ->ptrace_bps[] can't go away. Now that ptrace_get_breakpoints/ptrace_put_breakpoints have no callers, we can kill them and remove task->ptrace_bp_refcnt. 
Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Acked-by: Michael Neuling Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 89573a3..07d0df6 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -142,9 +142,6 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace) { INIT_LIST_HEAD(&child->ptrace_entry); INIT_LIST_HEAD(&child->ptraced); -#ifdef CONFIG_HAVE_HW_BREAKPOINT - atomic_set(&child->ptrace_bp_refcnt, 1); -#endif child->jobctl = 0; child->ptrace = 0; child->parent = child->real_parent; @@ -351,11 +348,4 @@ extern int task_current_syscall(struct task_struct *target, long *callno, unsigned long args[6], unsigned int maxargs, unsigned long *sp, unsigned long *pc); -#ifdef CONFIG_HAVE_HW_BREAKPOINT -extern int ptrace_get_breakpoints(struct task_struct *tsk); -extern void ptrace_put_breakpoints(struct task_struct *tsk); -#else -static inline void ptrace_put_breakpoints(struct task_struct *tsk) { } -#endif /* CONFIG_HAVE_HW_BREAKPOINT */ - #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index cdd5407..75324d8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1401,9 +1401,6 @@ struct task_struct { } memcg_batch; unsigned int memcg_kmem_skip_account; #endif -#ifdef CONFIG_HAVE_HW_BREAKPOINT - atomic_t ptrace_bp_refcnt; -#endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; #endif diff --git a/kernel/exit.c b/kernel/exit.c index fafe75d..a949819 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -808,7 +808,7 @@ void do_exit(long code) /* * FIXME: do that only when needed, using sched_exit tracepoint */ - ptrace_put_breakpoints(tsk); + flush_ptrace_hw_breakpoint(tsk); exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 
ba5e6ce..a146ee3 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1221,19 +1221,3 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, return ret; } #endif /* CONFIG_COMPAT */ - -#ifdef CONFIG_HAVE_HW_BREAKPOINT -int ptrace_get_breakpoints(struct task_struct *tsk) -{ - if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt)) - return 0; - - return -1; -} - -void ptrace_put_breakpoints(struct task_struct *tsk) -{ - if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt)) - flush_ptrace_hw_breakpoint(tsk); -} -#endif /* CONFIG_HAVE_HW_BREAKPOINT */ -- cgit v0.10.2 From e6a7d6077106e5c72f0519ec113d986df67ee001 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:00:56 -0700 Subject: ptrace/x86: simplify the "disable" logic in ptrace_write_dr7() ptrace_write_dr7() looks unnecessarily overcomplicated. We can factor out ptrace_modify_breakpoint() and do not do "continue" twice, just we need to pass the proper "disabled" argument to ptrace_modify_breakpoint(). Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 7a98b21..0649f16 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -637,9 +637,7 @@ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) struct thread_struct *thread = &(tsk->thread); unsigned long old_dr7; int i, orig_ret = 0, rc = 0; - int enabled, second_pass = 0; - unsigned len, type; - struct perf_event *bp; + int second_pass = 0; data &= ~DR_CONTROL_RESERVED; old_dr7 = ptrace_get_dr7(thread->ptrace_bps); @@ -649,30 +647,22 @@ restore: * appropriate changes to each. 
*/ for (i = 0; i < HBP_NUM; i++) { - enabled = decode_dr7(data, i, &len, &type); - bp = thread->ptrace_bps[i]; - - if (!enabled) { - if (bp) { - /* - * Don't unregister the breakpoints right-away, - * unless all register_user_hw_breakpoint() - * requests have succeeded. This prevents - * any window of opportunity for debug - * register grabbing by other users. - */ - if (!second_pass) - continue; - - rc = ptrace_modify_breakpoint(bp, len, type, - tsk, 1); - if (rc) - break; - } - continue; + unsigned len, type; + bool disabled = !decode_dr7(data, i, &len, &type); + struct perf_event *bp = thread->ptrace_bps[i]; + + if (disabled) { + /* + * Don't unregister the breakpoints right-away, unless + * all register_user_hw_breakpoint() requests have + * succeeded. This prevents any window of opportunity + * for debug register grabbing by other users. + */ + if (!bp || !second_pass) + continue; } - rc = ptrace_modify_breakpoint(bp, len, type, tsk, 0); + rc = ptrace_modify_breakpoint(bp, len, type, tsk, disabled); if (rc) break; } -- cgit v0.10.2 From 29a55513414187b50d3cebb99884955a78d97283 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:00:58 -0700 Subject: ptrace/x86: dont delay "disable" till second pass in ptrace_write_dr7() ptrace_write_dr7() skips ptrace_modify_breakpoint(disabled => true) unless second_pass, this buys nothing but complicates the code and means that we always do the main loop twice even if "disabled" was never true. The comment says: Don't unregister the breakpoints right-away, unless all register_user_hw_breakpoint() requests have succeeded. Firstly, we do not do register_user_hw_breakpoint(), it was removed by commit 24f1e32c60c4 ("hw-breakpoints: Rewrite the hw-breakpoints layer on top of perf events"). 
We are going to restore register_user_hw_breakpoint() (see the next patch) but this doesn't matter: after commit 44234adcdce3 ("hw-breakpoints: Modify breakpoints without unregistering them") perf_event_disable() can not hurt, hw_breakpoint_del() does not free the slot. Remove the "second_pass" check from the main loop and simplify the code. Since we have to check "bp != NULL" anyway, the patch also removes the same check in ptrace_modify_breakpoint() and moves the comment into ptrace_write_dr7(). With this patch the second pass is only needed to restore the saved old_dr7. This should never fail, so the patch adds WARN_ON() to catch the potential problems as Frederic suggested. Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 0649f16..98b0a2c 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -609,14 +609,6 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, int gen_len, gen_type; struct perf_event_attr attr; - /* - * We should have at least an inactive breakpoint at this - * slot. 
It means the user is writing dr7 without having - * written the address register first - */ - if (!bp) - return -EINVAL; - err = arch_bp_generic_fields(len, type, &gen_len, &gen_type); if (err) return err; @@ -634,52 +626,47 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, */ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) { - struct thread_struct *thread = &(tsk->thread); + struct thread_struct *thread = &tsk->thread; unsigned long old_dr7; - int i, orig_ret = 0, rc = 0; - int second_pass = 0; + bool second_pass = false; + int i, rc, ret = 0; data &= ~DR_CONTROL_RESERVED; old_dr7 = ptrace_get_dr7(thread->ptrace_bps); + restore: - /* - * Loop through all the hardware breakpoints, making the - * appropriate changes to each. - */ + rc = 0; for (i = 0; i < HBP_NUM; i++) { unsigned len, type; bool disabled = !decode_dr7(data, i, &len, &type); struct perf_event *bp = thread->ptrace_bps[i]; - if (disabled) { + if (!bp) { + if (disabled) + continue; /* - * Don't unregister the breakpoints right-away, unless - * all register_user_hw_breakpoint() requests have - * succeeded. This prevents any window of opportunity - * for debug register grabbing by other users. + * We should have at least an inactive breakpoint at + * this slot. It means the user is writing dr7 without + * having written the address register first. */ - if (!bp || !second_pass) - continue; + rc = -EINVAL; + break; } rc = ptrace_modify_breakpoint(bp, len, type, tsk, disabled); if (rc) break; } - /* - * Make a second pass to free the remaining unused breakpoints - * or to restore the original breakpoints if an error occurred. - */ - if (!second_pass) { - second_pass = 1; - if (rc < 0) { - orig_ret = rc; - data = old_dr7; - } + + /* Restore if the first pass failed, second_pass shouldn't fail. */ + if (rc && !WARN_ON(second_pass)) { + ret = rc; + data = old_dr7; + second_pass = true; goto restore; } - return orig_ret < 0 ? 
orig_ret : rc; + return ret; } /* -- cgit v0.10.2 From 9afe33ada275f2413dfeae27cc58fbb27474ac72 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:00:59 -0700 Subject: ptrace/x86: introduce ptrace_register_breakpoint() No functional changes, preparation. Extract the "register breakpoint" code from ptrace_get_debugreg() into the new/generic helper, ptrace_register_breakpoint(). It will have more users. The patch also adds another simple helper, ptrace_fill_bp_fields(), to factor out the arch_bp_generic_fields() logic in register/modify. Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 98b0a2c..0526368 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -601,22 +601,48 @@ static unsigned long ptrace_get_dr7(struct perf_event *bp[]) return dr7; } -static int -ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, - struct task_struct *tsk, int disabled) +static int ptrace_fill_bp_fields(struct perf_event_attr *attr, + int len, int type, bool disabled) +{ + int err, bp_len, bp_type; + + err = arch_bp_generic_fields(len, type, &bp_len, &bp_type); + if (!err) { + attr->bp_len = bp_len; + attr->bp_type = bp_type; + attr->disabled = disabled; + } + + return err; +} + +static struct perf_event * +ptrace_register_breakpoint(struct task_struct *tsk, int len, int type, + unsigned long addr, bool disabled) { - int err; - int gen_len, gen_type; struct perf_event_attr attr; + int err; + + ptrace_breakpoint_init(&attr); + attr.bp_addr = addr; - err = arch_bp_generic_fields(len, type, &gen_len, &gen_type); + err = ptrace_fill_bp_fields(&attr, len, type, disabled); if (err) - return err; + return ERR_PTR(err); + + return 
register_user_hw_breakpoint(&attr, ptrace_triggered, + NULL, tsk); +} - attr = bp->attr; - attr.bp_len = gen_len; - attr.bp_type = gen_type; - attr.disabled = disabled; +static int ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, + int disabled) +{ + struct perf_event_attr attr = bp->attr; + int err; + + err = ptrace_fill_bp_fields(&attr, len, type, disabled); + if (err) + return err; return modify_user_hw_breakpoint(bp, &attr); } @@ -653,7 +679,7 @@ restore: break; } - rc = ptrace_modify_breakpoint(bp, len, type, tsk, disabled); + rc = ptrace_modify_breakpoint(bp, len, type, disabled); if (rc) break; } @@ -693,26 +719,14 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, unsigned long addr) { - struct perf_event *bp; struct thread_struct *t = &tsk->thread; - struct perf_event_attr attr; + struct perf_event *bp = t->ptrace_bps[nr]; int err = 0; - if (!t->ptrace_bps[nr]) { - ptrace_breakpoint_init(&attr); - /* - * Put stub len and type to register (reserve) an inactive but - * correct bp - */ - attr.bp_addr = addr; - attr.bp_len = HW_BREAKPOINT_LEN_1; - attr.bp_type = HW_BREAKPOINT_W; - attr.disabled = 1; - - bp = register_user_hw_breakpoint(&attr, ptrace_triggered, - NULL, tsk); - + if (!bp) { /* + * Put stub len and type to create an inactive but correct bp. + * * CHECKME: the previous code returned -EIO if the addr wasn't * a valid task virtual addr. The new one will return -EINVAL in * this case. @@ -721,20 +735,20 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, * writing for the user. And anyway this is the previous * behaviour. 
*/ - if (IS_ERR(bp)) { + bp = ptrace_register_breakpoint(tsk, + X86_BREAKPOINT_LEN_1, X86_BREAKPOINT_WRITE, + addr, true); + if (IS_ERR(bp)) err = PTR_ERR(bp); - goto out; - } - - t->ptrace_bps[nr] = bp; + else + t->ptrace_bps[nr] = bp; } else { - bp = t->ptrace_bps[nr]; + struct perf_event_attr attr = bp->attr; - attr = bp->attr; attr.bp_addr = addr; err = modify_user_hw_breakpoint(bp, &attr); } -out: + return err; } -- cgit v0.10.2 From b87a95ad609619482df0690320d5ace33ace8e7a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:01:01 -0700 Subject: ptrace/x86: ptrace_write_dr7() should create bp if !disabled Commit 24f1e32c60c4 ("hw-breakpoints: Rewrite the hw-breakpoints layer on top of perf events") introduced the minor regression. Before this commit PTRACE_POKEUSER DR7, enableDR0 PTRACE_POKEUSER DR0, address was perfectly valid, now PTRACE_POKEUSER(DR7) fails if DR0 was not previously initialized by PTRACE_POKEUSER(DR0). Change ptrace_write_dr7() to do ptrace_register_breakpoint(addr => 0) if !bp && !disabled. This fixes watchpoint-zeroaddr from ptrace-tests, see https://bugzilla.redhat.com/show_bug.cgi?id=660204. Signed-off-by: Oleg Nesterov Reported-by: Jan Kratochvil Acked-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 0526368..5c387b3 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -670,13 +670,16 @@ restore: if (!bp) { if (disabled) continue; - /* - * We should have at least an inactive breakpoint at - * this slot. It means the user is writing dr7 without - * having written the address register first. 
- */ - rc = -EINVAL; - break; + + bp = ptrace_register_breakpoint(tsk, + len, type, 0, disabled); + if (IS_ERR(bp)) { + rc = PTR_ERR(bp); + break; + } + + thread->ptrace_bps[i] = bp; + continue; } rc = ptrace_modify_breakpoint(bp, len, type, disabled); -- cgit v0.10.2 From 61e305c716c0737c97bd133313cc90e99a93712e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:01:03 -0700 Subject: ptrace/x86: cleanup ptrace_set_debugreg() ptrace_set_debugreg() is trivial but looks horrible. Kill the unnecessary goto's and return's to cleanup the code. This matches ptrace_get_debugreg() which also needs the trivial whitespace cleanups. Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 5c387b3..7461f50 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -703,7 +703,7 @@ restore: */ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) { - struct thread_struct *thread = &(tsk->thread); + struct thread_struct *thread = &tsk->thread; unsigned long val = 0; if (n < HBP_NUM) { @@ -713,7 +713,7 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) val = bp->hw.info.address; } else if (n == 6) { val = thread->debugreg6; - } else if (n == 7) { + } else if (n == 7) { val = thread->ptrace_dr7; } return val; @@ -761,30 +761,20 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, static int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) { - struct thread_struct *thread = &(tsk->thread); - int rc = 0; - + struct thread_struct *thread = &tsk->thread; /* There are no DR4 or DR5 registers */ - if (n == 4 || n == 5) - return -EIO; + int rc = -EIO; - if (n == 6) { - 
thread->debugreg6 = val; - goto ret_path; - } if (n < HBP_NUM) { rc = ptrace_set_breakpoint_addr(tsk, n, val); - if (rc) - return rc; - } - /* All that's left is DR7 */ - if (n == 7) { + } else if (n == 6) { + thread->debugreg6 = val; + rc = 0; + } else if (n == 7) { rc = ptrace_write_dr7(tsk, val); if (!rc) thread->ptrace_dr7 = val; } - -ret_path: return rc; } -- cgit v0.10.2 From fab840fc2d542fabcab903db8e03589a6702ba5f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:01:05 -0700 Subject: ptrace: PTRACE_DETACH should do flush_ptrace_hw_breakpoint(child) Change ptrace_detach() to call flush_ptrace_hw_breakpoint(child). This frees the slots for non-ptrace PERF_TYPE_BREAKPOINT users, and this ensures that the tracee won't be killed by SIGTRAP triggered by the active breakpoints. Test-case: unsigned long encode_dr7(int drnum, int enable, unsigned int type, unsigned int len) { unsigned long dr7; dr7 = ((len | type) & 0xf) << (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE); if (enable) dr7 |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)); return dr7; } int write_dr(int pid, int dr, unsigned long val) { return ptrace(PTRACE_POKEUSER, pid, offsetof (struct user, u_debugreg[dr]), val); } void func(void) { } int main(void) { int pid, stat; unsigned long dr7; pid = fork(); if (!pid) { assert(ptrace(PTRACE_TRACEME, 0,0,0) == 0); kill(getpid(), SIGHUP); func(); return 0x13; } assert(pid == waitpid(-1, &stat, 0)); assert(WSTOPSIG(stat) == SIGHUP); assert(write_dr(pid, 0, (long)func) == 0); dr7 = encode_dr7(0, 1, DR_RW_EXECUTE, DR_LEN_1); assert(write_dr(pid, 7, dr7) == 0); assert(ptrace(PTRACE_DETACH, pid, 0,0) == 0); assert(pid == waitpid(-1, &stat, 0)); assert(stat == 0x1300); return 0; } Before this patch the child is killed after PTRACE_DETACH. 
Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/ptrace.c b/kernel/ptrace.c index a146ee3..4041f57 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -469,6 +469,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) /* Architecture-specific hardware disable .. */ ptrace_disable(child); clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + flush_ptrace_hw_breakpoint(child); write_lock_irq(&tasklist_lock); /* -- cgit v0.10.2 From f7da04c9e363e479258135ac825734d78aecd2b0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 8 Jul 2013 16:01:06 -0700 Subject: ptrace/x86: flush_ptrace_hw_breakpoint() should clear the virtual debug registers flush_ptrace_hw_breakpoint() destroys the counters set by ptrace, but "leaks" ->debugreg6 and ->ptrace_dr7. The problem is minor, but still it doesn't look right and flush_thread() did this until commit 66cb59172959 ("hw-breakpoints: use the new wrapper routines to access debug registers in process/thread code"). Now that PTRACE_DETACH does flush_ too this makes even more sense.
Signed-off-by: Oleg Nesterov Cc: Benjamin Herrenschmidt Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jan Kratochvil Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mundt Cc: Will Deacon Cc: Prasad Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 02f0763..f66ff16 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -393,6 +393,9 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk) unregister_hw_breakpoint(t->ptrace_bps[i]); t->ptrace_bps[i] = NULL; } + + t->debugreg6 = 0; + t->ptrace_dr7 = 0; } void hw_breakpoint_restore(void) -- cgit v0.10.2 From c103a4dc4a32f53f095b66cd798d648c652f05b4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 8 Jul 2013 16:01:08 -0700 Subject: ipc/shm.c: eliminate ugly 80-col tricks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/ipc/shm.c b/ipc/shm.c index 7e199fa..85dc001 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -491,10 +491,10 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) sprintf (name, "SYSV%08x", key); if (shmflg & SHM_HUGETLB) { - struct hstate *hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) - & SHM_HUGE_MASK); + struct hstate *hs; size_t hugesize; + hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); if (!hs) { error = -EINVAL; goto no_file; diff --git a/mm/mmap.c b/mm/mmap.c index 0718c17..f813111 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1368,9 +1368,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, goto out_fput; } else if (flags & MAP_HUGETLB) { struct user_struct *user = NULL; - struct hstate *hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & - SHM_HUGE_MASK); + struct hstate *hs; + hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK); if (!hs) return -EINVAL; -- cgit v0.10.2 From dbfcd91f06f0e2d5564b2fd184e9c2a43675f9ab Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date:
Mon, 8 Jul 2013 16:01:09 -0700 Subject: ipc: move rcu lock out of ipc_addid This patchset continues the work that began in the sysv ipc semaphore scaling series, see https://lkml.org/lkml/2013/3/20/546 Just like semaphores used to be, sysv shared memory and msg queues also abuse the ipc lock, unnecessarily holding it for operations such as permission and security checks. This patchset mostly deals with mqueues, and while shared mem can be done in a very similar way, I want to get these patches out in the open first. It also does some pending cleanups, mostly focused on the two level locking we have in ipc code, taking care of ipc_addid() and ipcctl_pre_down_nolock() - yes there are still functions that need to be updated as well. This patch: Make all callers explicitly take and release the RCU read lock. This addresses the two level locking seen in newary(), newseg() and newqueue(). For the last two, explicitly unlock the ipc object and the rcu lock, instead of calling the custom shm_unlock and msg_unlock functions. The next patch will deal with the open coded locking for ->perm.lock Signed-off-by: Davidlohr Bueso Cc: Andi Kleen Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/ipc/msg.c b/ipc/msg.c index d0c6d96..996feb8 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -199,9 +199,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params) return retval; } - /* - * ipc_addid() locks msq - */ + /* ipc_addid() locks msq upon success. 
*/ id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni); if (id < 0) { security_msg_queue_free(msq); @@ -218,7 +216,8 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params) INIT_LIST_HEAD(&msq->q_receivers); INIT_LIST_HEAD(&msq->q_senders); - msg_unlock(msq); + spin_unlock(&msq->q_perm.lock); + rcu_read_unlock(); return msq->q_perm.id; } diff --git a/ipc/shm.c b/ipc/shm.c index 85dc001..bd2b14e 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -535,6 +535,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) shp->shm_nattch = 0; shp->shm_file = file; shp->shm_creator = current; + /* * shmid gets reported as "inode#" in /proc/pid/maps. * proc-ps tools use this. Changing this will break them. @@ -543,7 +544,9 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) ns->shm_tot += numpages; error = shp->shm_perm.id; - shm_unlock(shp); + + spin_unlock(&shp->shm_perm.lock); + rcu_read_unlock(); return error; no_id: diff --git a/ipc/util.c b/ipc/util.c index 809ec5e..399821a 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -246,9 +246,8 @@ int ipc_get_maxid(struct ipc_ids *ids) * is returned. The 'new' entry is returned in a locked state on success. * On failure the entry is not locked and a negative err-code is returned. * - * Called with ipc_ids.rw_mutex held as a writer. + * Called with writer ipc_ids.rw_mutex held. */ - int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size) { kuid_t euid; -- cgit v0.10.2 From 1ca7003ab41152d673d9e359632283d05294f3d6 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Jul 2013 16:01:10 -0700 Subject: ipc: introduce ipc object locking helpers Simple helpers around the (kern_ipc_perm *)->lock spinlock. 
Signed-off-by: Davidlohr Bueso Cc: Andi Kleen Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/ipc/util.h b/ipc/util.h index 2b0bdd5..da65e8a 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -159,23 +159,33 @@ static inline int ipc_checkid(struct kern_ipc_perm *ipcp, int uid) return uid / SEQ_MULTIPLIER != ipcp->seq; } -static inline void ipc_lock_by_ptr(struct kern_ipc_perm *perm) +static inline void ipc_lock_object(struct kern_ipc_perm *perm) { - rcu_read_lock(); spin_lock(&perm->lock); } -static inline void ipc_unlock(struct kern_ipc_perm *perm) +static inline void ipc_unlock_object(struct kern_ipc_perm *perm) { spin_unlock(&perm->lock); - rcu_read_unlock(); } -static inline void ipc_lock_object(struct kern_ipc_perm *perm) +static inline void ipc_assert_locked_object(struct kern_ipc_perm *perm) { + assert_spin_locked(&perm->lock); +} + +static inline void ipc_lock_by_ptr(struct kern_ipc_perm *perm) +{ + rcu_read_lock(); spin_lock(&perm->lock); } +static inline void ipc_unlock(struct kern_ipc_perm *perm) +{ + spin_unlock(&perm->lock); + rcu_read_unlock(); +} + struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id); struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id); int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids, -- cgit v0.10.2 From cf9d5d78d05bca96df7618dfc3a5ee4414dcae58 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Jul 2013 16:01:11 -0700 Subject: ipc: close open coded spin lock calls Signed-off-by: Davidlohr Bueso Cc: Andi Kleen Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/ipc/msg.c b/ipc/msg.c index 996feb8..7a3d6aa 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -216,7 +216,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params) INIT_LIST_HEAD(&msq->q_receivers); INIT_LIST_HEAD(&msq->q_senders); - spin_unlock(&msq->q_perm.lock); + ipc_unlock_object(&msq->q_perm); rcu_read_unlock(); return msq->q_perm.id; 
diff --git a/ipc/sem.c b/ipc/sem.c index 70480a3..92ec6c6 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -246,7 +246,7 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, * their critical section while the array lock is held. */ lock_array: - spin_lock(&sma->sem_perm.lock); + ipc_lock_object(&sma->sem_perm); for (i = 0; i < sma->sem_nsems; i++) { struct sem *sem = sma->sem_base + i; spin_unlock_wait(&sem->lock); @@ -259,7 +259,7 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, static inline void sem_unlock(struct sem_array *sma, int locknum) { if (locknum == -1) { - spin_unlock(&sma->sem_perm.lock); + ipc_unlock_object(&sma->sem_perm); } else { struct sem *sem = sma->sem_base + locknum; spin_unlock(&sem->lock); @@ -872,7 +872,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) int i; /* Free the existing undo structures for this semaphore set. */ - assert_spin_locked(&sma->sem_perm.lock); + ipc_assert_locked_object(&sma->sem_perm); list_for_each_entry_safe(un, tu, &sma->list_id, list_id) { list_del(&un->list_id); spin_lock(&un->ulp->lock); @@ -1070,7 +1070,7 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, curr = &sma->sem_base[semnum]; - assert_spin_locked(&sma->sem_perm.lock); + ipc_assert_locked_object(&sma->sem_perm); list_for_each_entry(un, &sma->list_id, list_id) un->semadj[semnum] = 0; @@ -1199,7 +1199,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, for (i = 0; i < nsems; i++) sma->sem_base[i].semval = sem_io[i]; - assert_spin_locked(&sma->sem_perm.lock); + ipc_assert_locked_object(&sma->sem_perm); list_for_each_entry(un, &sma->list_id, list_id) { for (i = 0; i < nsems; i++) un->semadj[i] = 0; @@ -1496,7 +1496,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) new->semid = semid; assert_spin_locked(&ulp->lock); list_add_rcu(&new->list_proc, &ulp->list_proc); - assert_spin_locked(&sma->sem_perm.lock); + 
ipc_assert_locked_object(&sma->sem_perm); list_add(&new->list_id, &sma->list_id); un = new; @@ -1833,7 +1833,7 @@ void exit_sem(struct task_struct *tsk) } /* remove un from the linked lists */ - assert_spin_locked(&sma->sem_perm.lock); + ipc_assert_locked_object(&sma->sem_perm); list_del(&un->list_id); spin_lock(&ulp->lock); diff --git a/ipc/shm.c b/ipc/shm.c index bd2b14e..e7d5107 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -141,7 +141,7 @@ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp) { rcu_read_lock(); - spin_lock(&ipcp->shm_perm.lock); + ipc_lock_object(&ipcp->shm_perm); } static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns, @@ -545,7 +545,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) ns->shm_tot += numpages; error = shp->shm_perm.id; - spin_unlock(&shp->shm_perm.lock); + ipc_unlock_object(&shp->shm_perm); rcu_read_unlock(); return error; diff --git a/ipc/util.h b/ipc/util.h index da65e8a..b6a6a88 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -177,12 +177,12 @@ static inline void ipc_assert_locked_object(struct kern_ipc_perm *perm) static inline void ipc_lock_by_ptr(struct kern_ipc_perm *perm) { rcu_read_lock(); - spin_lock(&perm->lock); + ipc_lock_object(perm); } static inline void ipc_unlock(struct kern_ipc_perm *perm) { - spin_unlock(&perm->lock); + ipc_unlock_object(perm); rcu_read_unlock(); } -- cgit v0.10.2 From 7b4cc5d8411bd4e9d61d8714f53859740cf830c2 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Jul 2013 16:01:12 -0700 Subject: ipc: move locking out of ipcctl_pre_down_nolock This function currently acquires both the rw_mutex and the rcu lock on successful lookups, leaving the callers to explicitly unlock them, creating another two level locking situation. Make the callers (including those that still use ipcctl_pre_down()) explicitly lock and unlock the rwsem and rcu lock. 
Signed-off-by: Davidlohr Bueso Cc: Andi Kleen Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/ipc/msg.c b/ipc/msg.c index 7a3d6aa..f62fa5e 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -407,31 +407,38 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, return -EFAULT; } + down_write(&msg_ids(ns).rw_mutex); + rcu_read_lock(); + ipcp = ipcctl_pre_down(ns, &msg_ids(ns), msqid, cmd, &msqid64.msg_perm, msqid64.msg_qbytes); - if (IS_ERR(ipcp)) - return PTR_ERR(ipcp); + if (IS_ERR(ipcp)) { + err = PTR_ERR(ipcp); + /* the ipc lock is not held upon failure */ + goto out_unlock1; + } msq = container_of(ipcp, struct msg_queue, q_perm); err = security_msg_queue_msgctl(msq, cmd); if (err) - goto out_unlock; + goto out_unlock0; switch (cmd) { case IPC_RMID: + /* freeque unlocks the ipc object and rcu */ freeque(ns, ipcp); goto out_up; case IPC_SET: if (msqid64.msg_qbytes > ns->msg_ctlmnb && !capable(CAP_SYS_RESOURCE)) { err = -EPERM; - goto out_unlock; + goto out_unlock0; } err = ipc_update_perm(&msqid64.msg_perm, ipcp); if (err) - goto out_unlock; + goto out_unlock0; msq->q_qbytes = msqid64.msg_qbytes; @@ -448,8 +455,11 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, default: err = -EINVAL; } -out_unlock: - msg_unlock(msq); + +out_unlock0: + ipc_unlock_object(&msq->q_perm); +out_unlock1: + rcu_read_unlock(); out_up: up_write(&msg_ids(ns).rw_mutex); return err; diff --git a/ipc/sem.c b/ipc/sem.c index 92ec6c6..b4b892b 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1289,39 +1289,44 @@ static int semctl_down(struct ipc_namespace *ns, int semid, return -EFAULT; } + down_write(&sem_ids(ns).rw_mutex); + rcu_read_lock(); + ipcp = ipcctl_pre_down_nolock(ns, &sem_ids(ns), semid, cmd, &semid64.sem_perm, 0); - if (IS_ERR(ipcp)) - return PTR_ERR(ipcp); + if (IS_ERR(ipcp)) { + err = PTR_ERR(ipcp); + /* the ipc lock is not held upon failure */ + goto out_unlock1; + } sma = container_of(ipcp, struct sem_array, 
sem_perm); err = security_sem_semctl(sma, cmd); - if (err) { - rcu_read_unlock(); - goto out_up; - } + if (err) + goto out_unlock1; - switch(cmd){ + switch (cmd) { case IPC_RMID: sem_lock(sma, NULL, -1); + /* freeary unlocks the ipc object and rcu */ freeary(ns, ipcp); goto out_up; case IPC_SET: sem_lock(sma, NULL, -1); err = ipc_update_perm(&semid64.sem_perm, ipcp); if (err) - goto out_unlock; + goto out_unlock0; sma->sem_ctime = get_seconds(); break; default: - rcu_read_unlock(); err = -EINVAL; - goto out_up; + goto out_unlock1; } -out_unlock: +out_unlock0: sem_unlock(sma, -1); +out_unlock1: rcu_read_unlock(); out_up: up_write(&sem_ids(ns).rw_mutex); diff --git a/ipc/shm.c b/ipc/shm.c index e7d5107..c6b4ad5 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -757,31 +757,42 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, return -EFAULT; } + down_write(&shm_ids(ns).rw_mutex); + rcu_read_lock(); + ipcp = ipcctl_pre_down(ns, &shm_ids(ns), shmid, cmd, &shmid64.shm_perm, 0); - if (IS_ERR(ipcp)) - return PTR_ERR(ipcp); + if (IS_ERR(ipcp)) { + err = PTR_ERR(ipcp); + /* the ipc lock is not held upon failure */ + goto out_unlock1; + } shp = container_of(ipcp, struct shmid_kernel, shm_perm); err = security_shm_shmctl(shp, cmd); if (err) - goto out_unlock; + goto out_unlock0; + switch (cmd) { case IPC_RMID: + /* do_shm_rmid unlocks the ipc object and rcu */ do_shm_rmid(ns, ipcp); goto out_up; case IPC_SET: err = ipc_update_perm(&shmid64.shm_perm, ipcp); if (err) - goto out_unlock; + goto out_unlock0; shp->shm_ctim = get_seconds(); break; default: err = -EINVAL; } -out_unlock: - shm_unlock(shp); + +out_unlock0: + ipc_unlock_object(&shp->shm_perm); +out_unlock1: + rcu_read_unlock(); out_up: up_write(&shm_ids(ns).rw_mutex); return err; diff --git a/ipc/util.c b/ipc/util.c index 399821a..a0c139f 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -746,8 +746,10 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out) * It must be called without any lock 
held and * - retrieves the ipc with the given id in the given table. * - performs some audit and permission check, depending on the given cmd - * - returns the ipc with both ipc and rw_mutex locks held in case of success + * - returns the ipc with the ipc lock held in case of success * or an err-code without any lock held otherwise. + * + * Call holding the both the rw_mutex and the rcu read lock. */ struct kern_ipc_perm *ipcctl_pre_down(struct ipc_namespace *ns, struct ipc_ids *ids, int id, int cmd, @@ -772,13 +774,10 @@ struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns, int err = -EPERM; struct kern_ipc_perm *ipcp; - down_write(&ids->rw_mutex); - rcu_read_lock(); - ipcp = ipc_obtain_object_check(ids, id); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); - goto out_up; + goto err; } audit_ipc_obj(ipcp); @@ -789,16 +788,8 @@ struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns, euid = current_euid(); if (uid_eq(euid, ipcp->cuid) || uid_eq(euid, ipcp->uid) || ns_capable(ns->user_ns, CAP_SYS_ADMIN)) - return ipcp; - -out_up: - /* - * Unsuccessful lookup, unlock and return - * the corresponding error. - */ - rcu_read_unlock(); - up_write(&ids->rw_mutex); - + return ipcp; /* successful lookup */ +err: return ERR_PTR(err); } -- cgit v0.10.2 From 15724ecb7e9bab35fc694c666ad563adba820cc3 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Jul 2013 16:01:13 -0700 Subject: ipc,msg: shorten critical region in msgctl_down Instead of holding the ipc lock for the entire function, use the ipcctl_pre_down_nolock and only acquire the lock for specific commands: RMID and SET. 
Signed-off-by: Davidlohr Bueso Cc: Andi Kleen Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/ipc/msg.c b/ipc/msg.c index f62fa5e..de422ff 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -410,11 +410,10 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, down_write(&msg_ids(ns).rw_mutex); rcu_read_lock(); - ipcp = ipcctl_pre_down(ns, &msg_ids(ns), msqid, cmd, - &msqid64.msg_perm, msqid64.msg_qbytes); + ipcp = ipcctl_pre_down_nolock(ns, &msg_ids(ns), msqid, cmd, + &msqid64.msg_perm, msqid64.msg_qbytes); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); - /* the ipc lock is not held upon failure */ goto out_unlock1; } @@ -422,10 +421,11 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, err = security_msg_queue_msgctl(msq, cmd); if (err) - goto out_unlock0; + goto out_unlock1; switch (cmd) { case IPC_RMID: + ipc_lock_object(&msq->q_perm); /* freeque unlocks the ipc object and rcu */ freeque(ns, ipcp); goto out_up; @@ -433,9 +433,10 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, if (msqid64.msg_qbytes > ns->msg_ctlmnb && !capable(CAP_SYS_RESOURCE)) { err = -EPERM; - goto out_unlock0; + goto out_unlock1; } + ipc_lock_object(&msq->q_perm); err = ipc_update_perm(&msqid64.msg_perm, ipcp); if (err) goto out_unlock0; @@ -454,6 +455,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, break; default: err = -EINVAL; + goto out_unlock1; } out_unlock0: -- cgit v0.10.2 From 2cafed30f150f7314f98717b372df8173516cae0 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Jul 2013 16:01:14 -0700 Subject: ipc,msg: introduce msgctl_nolock Similar to semctl, when calling msgctl, the *_INFO and *_STAT commands can be performed without acquiring the ipc object. Add a msgctl_nolock() function and move the logic of *_INFO and *_STAT out of msgctl(). 
This change still takes the lock and it will be properly lockless in the next patch Signed-off-by: Davidlohr Bueso Cc: Andi Kleen Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/ipc/msg.c b/ipc/msg.c index de422ff..f45be81 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -467,17 +467,11 @@ out_up: return err; } -SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf) +static int msgctl_nolock(struct ipc_namespace *ns, int msqid, + int cmd, int version, void __user *buf) { + int err; struct msg_queue *msq; - int err, version; - struct ipc_namespace *ns; - - if (msqid < 0 || cmd < 0) - return -EINVAL; - - version = ipc_parse_version(&cmd); - ns = current->nsproxy->ipc_ns; switch (cmd) { case IPC_INFO: @@ -488,6 +482,7 @@ SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf) if (!buf) return -EFAULT; + /* * We must not return kernel stack data. * due to padding, it's not enough @@ -519,7 +514,8 @@ SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf) return -EFAULT; return (max_id < 0) ? 
0 : max_id; } - case MSG_STAT: /* msqid is an index rather than a msg queue id */ + + case MSG_STAT: case IPC_STAT: { struct msqid64_ds tbuf; @@ -563,19 +559,42 @@ SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf) return -EFAULT; return success_return; } - case IPC_SET: - case IPC_RMID: - err = msgctl_down(ns, msqid, cmd, buf, version); - return err; + default: - return -EINVAL; + return -EINVAL; } + return err; out_unlock: msg_unlock(msq); return err; } +SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf) +{ + int version; + struct ipc_namespace *ns; + + if (msqid < 0 || cmd < 0) + return -EINVAL; + + version = ipc_parse_version(&cmd); + ns = current->nsproxy->ipc_ns; + + switch (cmd) { + case IPC_INFO: + case MSG_INFO: + case MSG_STAT: /* msqid is an index rather than a msg queue id */ + case IPC_STAT: + return msgctl_nolock(ns, msqid, cmd, version, buf); + case IPC_SET: + case IPC_RMID: + return msgctl_down(ns, msqid, cmd, buf, version); + default: + return -EINVAL; + } +} + static int testmsg(struct msg_msg *msg, long type, int mode) { switch(mode) -- cgit v0.10.2 From a5001a0d9768568de5d613c3b3a5b9c7721299da Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Jul 2013 16:01:15 -0700 Subject: ipc,msg: introduce lockless functions to obtain the ipc object Add msq_obtain_object() and msq_obtain_object_check(), which will allow us to get the ipc object without acquiring the lock. Just as with semaphores, these functions are basically wrappers around ipc_obtain_object*(). 
Signed-off-by: Davidlohr Bueso Cc: Andi Kleen Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/ipc/msg.c b/ipc/msg.c index f45be81..c53c137 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -166,6 +166,27 @@ static inline struct msg_queue *msg_lock_check(struct ipc_namespace *ns, return container_of(ipcp, struct msg_queue, q_perm); } +static inline struct msg_queue *msq_obtain_object(struct ipc_namespace *ns, int id) +{ + struct kern_ipc_perm *ipcp = ipc_obtain_object(&msg_ids(ns), id); + + if (IS_ERR(ipcp)) + return ERR_CAST(ipcp); + + return container_of(ipcp, struct msg_queue, q_perm); +} + +static inline struct msg_queue *msq_obtain_object_check(struct ipc_namespace *ns, + int id) +{ + struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&msg_ids(ns), id); + + if (IS_ERR(ipcp)) + return ERR_CAST(ipcp); + + return container_of(ipcp, struct msg_queue, q_perm); +} + static inline void msg_rmid(struct ipc_namespace *ns, struct msg_queue *s) { ipc_rmid(&msg_ids(ns), &s->q_perm); -- cgit v0.10.2 From ac0ba20ea6f2201a1589d6dc26ad1a4f0f967bb8 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Jul 2013 16:01:16 -0700 Subject: ipc,msg: make msgctl_nolock lockless While the INFO cmd doesn't take the ipc lock, the STAT commands do acquire it unnecessarily. We can do the permissions and security checks only holding the rcu lock. This function now mimics semctl_nolock(). 
Signed-off-by: Davidlohr Bueso Cc: Andi Kleen Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/ipc/msg.c b/ipc/msg.c index c53c137..c218328 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -545,17 +545,25 @@ static int msgctl_nolock(struct ipc_namespace *ns, int msqid, if (!buf) return -EFAULT; + memset(&tbuf, 0, sizeof(tbuf)); + + rcu_read_lock(); if (cmd == MSG_STAT) { - msq = msg_lock(ns, msqid); - if (IS_ERR(msq)) - return PTR_ERR(msq); + msq = msq_obtain_object(ns, msqid); + if (IS_ERR(msq)) { + err = PTR_ERR(msq); + goto out_unlock; + } success_return = msq->q_perm.id; } else { - msq = msg_lock_check(ns, msqid); - if (IS_ERR(msq)) - return PTR_ERR(msq); + msq = msq_obtain_object_check(ns, msqid); + if (IS_ERR(msq)) { + err = PTR_ERR(msq); + goto out_unlock; + } success_return = 0; } + err = -EACCES; if (ipcperms(ns, &msq->q_perm, S_IRUGO)) goto out_unlock; @@ -564,8 +572,6 @@ static int msgctl_nolock(struct ipc_namespace *ns, int msqid, if (err) goto out_unlock; - memset(&tbuf, 0, sizeof(tbuf)); - kernel_to_ipc64_perm(&msq->q_perm, &tbuf.msg_perm); tbuf.msg_stime = msq->q_stime; tbuf.msg_rtime = msq->q_rtime; @@ -575,7 +581,8 @@ static int msgctl_nolock(struct ipc_namespace *ns, int msqid, tbuf.msg_qbytes = msq->q_qbytes; tbuf.msg_lspid = msq->q_lspid; tbuf.msg_lrpid = msq->q_lrpid; - msg_unlock(msq); + rcu_read_unlock(); + if (copy_msqid_to_user(buf, &tbuf, version)) return -EFAULT; return success_return; @@ -587,7 +594,7 @@ static int msgctl_nolock(struct ipc_namespace *ns, int msqid, return err; out_unlock: - msg_unlock(msq); + rcu_read_unlock(); return err; } -- cgit v0.10.2 From 3dd1f784ed6603d7ab1043e51e6371235edf2313 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Jul 2013 16:01:17 -0700 Subject: ipc,msg: shorten critical region in msgsnd do_msgsnd() is another function that does too many things with the ipc object lock acquired. Take it only when needed when actually updating msq. 
Signed-off-by: Davidlohr Bueso Cc: Andi Kleen Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/ipc/msg.c b/ipc/msg.c index c218328..f2a1a8f 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -698,10 +698,11 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, msg->m_type = mtype; msg->m_ts = msgsz; - msq = msg_lock_check(ns, msqid); + rcu_read_lock(); + msq = msq_obtain_object_check(ns, msqid); if (IS_ERR(msq)) { err = PTR_ERR(msq); - goto out_free; + goto out_unlock1; } for (;;) { @@ -709,11 +710,11 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, err = -EACCES; if (ipcperms(ns, &msq->q_perm, S_IWUGO)) - goto out_unlock_free; + goto out_unlock1; err = security_msg_queue_msgsnd(msq, msg, msgflg); if (err) - goto out_unlock_free; + goto out_unlock1; if (msgsz + msq->q_cbytes <= msq->q_qbytes && 1 + msq->q_qnum <= msq->q_qbytes) { @@ -723,32 +724,41 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, /* queue full, wait: */ if (msgflg & IPC_NOWAIT) { err = -EAGAIN; - goto out_unlock_free; + goto out_unlock1; } + + ipc_lock_object(&msq->q_perm); ss_add(msq, &s); if (!ipc_rcu_getref(msq)) { err = -EIDRM; - goto out_unlock_free; + goto out_unlock0; } - msg_unlock(msq); + ipc_unlock_object(&msq->q_perm); + rcu_read_unlock(); schedule(); - ipc_lock_by_ptr(&msq->q_perm); + rcu_read_lock(); + ipc_lock_object(&msq->q_perm); + ipc_rcu_putref(msq); if (msq->q_perm.deleted) { err = -EIDRM; - goto out_unlock_free; + goto out_unlock0; } + ss_del(&s); if (signal_pending(current)) { err = -ERESTARTNOHAND; - goto out_unlock_free; + goto out_unlock0; } + + ipc_unlock_object(&msq->q_perm); } + ipc_lock_object(&msq->q_perm); msq->q_lspid = task_tgid_vnr(current); msq->q_stime = get_seconds(); @@ -764,9 +774,10 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, err = 0; msg = NULL; -out_unlock_free: - msg_unlock(msq); -out_free: +out_unlock0: + ipc_unlock_object(&msq->q_perm); +out_unlock1: + rcu_read_unlock(); if 
(msg != NULL) free_msg(msg); return err; -- cgit v0.10.2 From 41a0d523d0f626e9da0dc01de47f1b89058033cf Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Jul 2013 16:01:18 -0700 Subject: ipc,msg: shorten critical region in msgrcv do_msgrcv() is the last msg queue function that abuses the ipc lock Take it only when needed when actually updating msq. Signed-off-by: Davidlohr Bueso Cc: Andi Kleen Cc: Rik van Riel Tested-by: Sedat Dilek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/ipc/msg.c b/ipc/msg.c index f2a1a8f..a3c0dc4 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -885,21 +885,19 @@ static struct msg_msg *find_msg(struct msg_queue *msq, long *msgtyp, int mode) return ERR_PTR(-EAGAIN); } - -long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, - int msgflg, +long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgflg, long (*msg_handler)(void __user *, struct msg_msg *, size_t)) { - struct msg_queue *msq; - struct msg_msg *msg; int mode; + struct msg_queue *msq; struct ipc_namespace *ns; - struct msg_msg *copy = NULL; + struct msg_msg *msg, *copy = NULL; ns = current->nsproxy->ipc_ns; if (msqid < 0 || (long) bufsz < 0) return -EINVAL; + if (msgflg & MSG_COPY) { copy = prepare_copy(buf, min_t(size_t, bufsz, ns->msg_ctlmax)); if (IS_ERR(copy)) @@ -907,8 +905,10 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, } mode = convert_mode(&msgtyp, msgflg); - msq = msg_lock_check(ns, msqid); + rcu_read_lock(); + msq = msq_obtain_object_check(ns, msqid); if (IS_ERR(msq)) { + rcu_read_unlock(); free_copy(copy); return PTR_ERR(msq); } @@ -918,10 +918,10 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, msg = ERR_PTR(-EACCES); if (ipcperms(ns, &msq->q_perm, S_IRUGO)) - goto out_unlock; + goto out_unlock1; + ipc_lock_object(&msq->q_perm); msg = find_msg(msq, &msgtyp, mode); - if (!IS_ERR(msg)) { /* * Found a suitable message. 
@@ -929,7 +929,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, */ if ((bufsz < msg->m_ts) && !(msgflg & MSG_NOERROR)) { msg = ERR_PTR(-E2BIG); - goto out_unlock; + goto out_unlock0; } /* * If we are copying, then do not unlink message and do @@ -937,8 +937,9 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, */ if (msgflg & MSG_COPY) { msg = copy_msg(msg, copy); - goto out_unlock; + goto out_unlock0; } + list_del(&msg->m_list); msq->q_qnum--; msq->q_rtime = get_seconds(); @@ -947,14 +948,16 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, atomic_sub(msg->m_ts, &ns->msg_bytes); atomic_dec(&ns->msg_hdrs); ss_wakeup(&msq->q_senders, 0); - msg_unlock(msq); - break; + + goto out_unlock0; } + /* No message waiting. Wait for a message */ if (msgflg & IPC_NOWAIT) { msg = ERR_PTR(-ENOMSG); - goto out_unlock; + goto out_unlock0; } + list_add_tail(&msr_d.r_list, &msq->q_receivers); msr_d.r_tsk = current; msr_d.r_msgtype = msgtyp; @@ -965,8 +968,9 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, msr_d.r_maxsize = bufsz; msr_d.r_msg = ERR_PTR(-EAGAIN); current->state = TASK_INTERRUPTIBLE; - msg_unlock(msq); + ipc_unlock_object(&msq->q_perm); + rcu_read_unlock(); schedule(); /* Lockless receive, part 1: @@ -977,7 +981,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, * Prior to destruction, expunge_all(-EIRDM) changes r_msg. * Thus if r_msg is -EAGAIN, then the queue not yet destroyed. * rcu_read_lock() prevents preemption between reading r_msg - * and the spin_lock() inside ipc_lock_by_ptr(). + * and acquiring the q_perm.lock in ipc_lock_object(). */ rcu_read_lock(); @@ -996,32 +1000,34 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, * If there is a message or an error then accept it without * locking. 
*/ - if (msg != ERR_PTR(-EAGAIN)) { - rcu_read_unlock(); - break; - } + if (msg != ERR_PTR(-EAGAIN)) + goto out_unlock1; /* Lockless receive, part 3: * Acquire the queue spinlock. */ - ipc_lock_by_ptr(&msq->q_perm); - rcu_read_unlock(); + ipc_lock_object(&msq->q_perm); /* Lockless receive, part 4: * Repeat test after acquiring the spinlock. */ msg = (struct msg_msg*)msr_d.r_msg; if (msg != ERR_PTR(-EAGAIN)) - goto out_unlock; + goto out_unlock0; list_del(&msr_d.r_list); if (signal_pending(current)) { msg = ERR_PTR(-ERESTARTNOHAND); -out_unlock: - msg_unlock(msq); - break; + goto out_unlock0; } + + ipc_unlock_object(&msq->q_perm); } + +out_unlock0: + ipc_unlock_object(&msq->q_perm); +out_unlock1: + rcu_read_unlock(); if (IS_ERR(msg)) { free_copy(copy); return PTR_ERR(msg); -- cgit v0.10.2 From 9ad66ae65fc8d3e7e3344310fb0aa835910264fe Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Jul 2013 16:01:19 -0700 Subject: ipc: remove unused functions We can now drop the msg_lock and msg_lock_check functions along with a bogus comment introduced previously in semctl_down. Signed-off-by: Davidlohr Bueso Cc: Andi Kleen Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/ipc/msg.c b/ipc/msg.c index a3c0dc4..bd60d7e 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -141,31 +141,6 @@ void __init msg_init(void) IPC_MSG_IDS, sysvipc_msg_proc_show); } -/* - * msg_lock_(check_) routines are called in the paths where the rw_mutex - * is not held. 
- */ -static inline struct msg_queue *msg_lock(struct ipc_namespace *ns, int id) -{ - struct kern_ipc_perm *ipcp = ipc_lock(&msg_ids(ns), id); - - if (IS_ERR(ipcp)) - return (struct msg_queue *)ipcp; - - return container_of(ipcp, struct msg_queue, q_perm); -} - -static inline struct msg_queue *msg_lock_check(struct ipc_namespace *ns, - int id) -{ - struct kern_ipc_perm *ipcp = ipc_lock_check(&msg_ids(ns), id); - - if (IS_ERR(ipcp)) - return (struct msg_queue *)ipcp; - - return container_of(ipcp, struct msg_queue, q_perm); -} - static inline struct msg_queue *msq_obtain_object(struct ipc_namespace *ns, int id) { struct kern_ipc_perm *ipcp = ipc_obtain_object(&msg_ids(ns), id); diff --git a/ipc/sem.c b/ipc/sem.c index b4b892b..d3ad357 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1296,7 +1296,6 @@ static int semctl_down(struct ipc_namespace *ns, int semid, &semid64.sem_perm, 0); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); - /* the ipc lock is not held upon failure */ goto out_unlock1; } -- cgit v0.10.2 From 196aa0132fc7261f34b10ae1bfb44abc1bc69b3c Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Mon, 8 Jul 2013 16:01:20 -0700 Subject: ipc/util.c, ipc_rcu_alloc: cacheline align allocation Enforce that ipc_rcu_alloc returns a cacheline aligned pointer on SMP. Rationale: The SysV sem code tries to move the main spinlock into a separate cacheline (____cacheline_aligned_in_smp). This works only if ipc_rcu_alloc returns cacheline aligned pointers. vmalloc and kmalloc return cacheline aligned pointers, the implementation of ipc_rcu_alloc breaks that.
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Manfred Spraul Cc: Rik van Riel Cc: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/ipc/util.c b/ipc/util.c index a0c139f..4704223 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -468,9 +468,7 @@ void ipc_free(void* ptr, int size) struct ipc_rcu { struct rcu_head rcu; atomic_t refcount; - /* "void *" makes sure alignment of following data is sane. */ - void *data[0]; -}; +} ____cacheline_aligned_in_smp; /** * ipc_rcu_alloc - allocate ipc and rcu space @@ -488,12 +486,14 @@ void *ipc_rcu_alloc(int size) if (unlikely(!out)) return NULL; atomic_set(&out->refcount, 1); - return out->data; + return out + 1; } int ipc_rcu_getref(void *ptr) { - return atomic_inc_not_zero(&container_of(ptr, struct ipc_rcu, data)->refcount); + struct ipc_rcu *p = ((struct ipc_rcu *)ptr) - 1; + + return atomic_inc_not_zero(&p->refcount); } /** @@ -507,7 +507,7 @@ static void ipc_schedule_free(struct rcu_head *head) void ipc_rcu_putref(void *ptr) { - struct ipc_rcu *p = container_of(ptr, struct ipc_rcu, data); + struct ipc_rcu *p = ((struct ipc_rcu *)ptr) - 1; if (!atomic_dec_and_test(&p->refcount)) return; -- cgit v0.10.2 From f5c936c0f267ec58641451cf8b8d39b4c207ee4d Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Mon, 8 Jul 2013 16:01:22 -0700 Subject: ipc/sem.c: cacheline align the semaphore structures As now each semaphore has its own spinlock and parallel operations are possible, give each semaphore its own cacheline. 
On an i3 laptop, this gives up to 28% better performance: #semscale 10 | grep "interleave 2" - before: Cpus 1, interleave 2 delay 0: 36109234 in 10 secs Cpus 2, interleave 2 delay 0: 55276317 in 10 secs Cpus 3, interleave 2 delay 0: 62411025 in 10 secs Cpus 4, interleave 2 delay 0: 81963928 in 10 secs -after: Cpus 1, interleave 2 delay 0: 35527306 in 10 secs Cpus 2, interleave 2 delay 0: 70922909 in 10 secs <<< + 28% Cpus 3, interleave 2 delay 0: 80518538 in 10 secs Cpus 4, interleave 2 delay 0: 89115148 in 10 secs <<< + 8.7% i3, with 2 cores and with hyperthreading enabled. Interleave 2 in order to use the full cores first. HT partially hides the delay from cacheline thrashing, thus the improvement is "only" 8.7% if 4 threads are running. Signed-off-by: Manfred Spraul Cc: Rik van Riel Cc: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/ipc/sem.c b/ipc/sem.c index d3ad357..8498b67 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -96,7 +96,7 @@ struct sem { int sempid; /* pid of last operation */ spinlock_t lock; /* spinlock for fine-grained semtimedop */ struct list_head sem_pending; /* pending single-sop operations */ -}; +} ____cacheline_aligned_in_smp; /* One queue for each sleeping process in the system. */ struct sem_queue { -- cgit v0.10.2 From 1a82e9e1d0f1b45f47a97c9e2349020536ff8987 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Mon, 8 Jul 2013 16:01:23 -0700 Subject: ipc/sem: separate wait-for-zero and alter tasks into separate queues Introduce separate queues for operations that do not modify the semaphore values. Advantages: - Simpler logic in check_restart(). - Faster update_queue(): Right now, all wait-for-zero operations are always tested, even if the semaphore value is not 0.
- wait-for-zero gets again priority, as in linux <=3.0.9 Signed-off-by: Manfred Spraul Cc: Rik van Riel Cc: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/sem.h b/include/linux/sem.h index 53d4265..55e17f6 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -15,7 +15,10 @@ struct sem_array { time_t sem_otime; /* last semop time */ time_t sem_ctime; /* last change time */ struct sem *sem_base; /* ptr to first semaphore in array */ - struct list_head sem_pending; /* pending operations to be processed */ + struct list_head pending_alter; /* pending operations */ + /* that alter the array */ + struct list_head pending_const; /* pending complex operations */ + /* that do not alter semvals */ struct list_head list_id; /* undo requests on this array */ int sem_nsems; /* no. of semaphores in array */ int complex_count; /* pending complex operations */ diff --git a/ipc/sem.c b/ipc/sem.c index 8498b67..4d7f88c 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -95,7 +95,10 @@ struct sem { int semval; /* current value */ int sempid; /* pid of last operation */ spinlock_t lock; /* spinlock for fine-grained semtimedop */ - struct list_head sem_pending; /* pending single-sop operations */ + struct list_head pending_alter; /* pending single-sop operations */ + /* that alter the semaphore */ + struct list_head pending_const; /* pending single-sop operations */ + /* that do not alter the semaphore*/ } ____cacheline_aligned_in_smp; /* One queue for each sleeping process in the system. */ @@ -152,7 +155,7 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it); /* * linked list protection: * sem_undo.id_next, - * sem_array.sem_pending{,last}, + * sem_array.pending{_alter,_cont}, * sem_array.sem_undo: sem_lock() for read/write * sem_undo.proc_next: only "current" is allowed to read/write that field. 
* @@ -337,7 +340,7 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) * Without the check/retry algorithm a lockless wakeup is possible: * - queue.status is initialized to -EINTR before blocking. * - wakeup is performed by - * * unlinking the queue entry from sma->sem_pending + * * unlinking the queue entry from the pending list * * setting queue.status to IN_WAKEUP * This is the notification for the blocked thread that a * result value is imminent. @@ -418,12 +421,14 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) sma->sem_base = (struct sem *) &sma[1]; for (i = 0; i < nsems; i++) { - INIT_LIST_HEAD(&sma->sem_base[i].sem_pending); + INIT_LIST_HEAD(&sma->sem_base[i].pending_alter); + INIT_LIST_HEAD(&sma->sem_base[i].pending_const); spin_lock_init(&sma->sem_base[i].lock); } sma->complex_count = 0; - INIT_LIST_HEAD(&sma->sem_pending); + INIT_LIST_HEAD(&sma->pending_alter); + INIT_LIST_HEAD(&sma->pending_const); INIT_LIST_HEAD(&sma->list_id); sma->sem_nsems = nsems; sma->sem_ctime = get_seconds(); @@ -609,60 +614,132 @@ static void unlink_queue(struct sem_array *sma, struct sem_queue *q) * update_queue is O(N^2) when it restarts scanning the whole queue of * waiting operations. Therefore this function checks if the restart is * really necessary. It is called after a previously waiting operation - * was completed. + * modified the array. + * Note that wait-for-zero operations are handled without restart. */ static int check_restart(struct sem_array *sma, struct sem_queue *q) { - struct sem *curr; - struct sem_queue *h; - - /* if the operation didn't modify the array, then no restart */ - if (q->alter == 0) - return 0; - - /* pending complex operations are too difficult to analyse */ - if (sma->complex_count) + /* pending complex alter operations are too difficult to analyse */ + if (!list_empty(&sma->pending_alter)) return 1; /* we were a sleeping complex operation. 
Too difficult */ if (q->nsops > 1) return 1; - curr = sma->sem_base + q->sops[0].sem_num; + /* It is impossible that someone waits for the new value: + * - complex operations always restart. + * - wait-for-zero are handled seperately. + * - q is a previously sleeping simple operation that + * altered the array. It must be a decrement, because + * simple increments never sleep. + * - If there are older (higher priority) decrements + * in the queue, then they have observed the original + * semval value and couldn't proceed. The operation + * decremented to value - thus they won't proceed either. + */ + return 0; +} - /* No-one waits on this queue */ - if (list_empty(&curr->sem_pending)) - return 0; +/** + * wake_const_ops(sma, semnum, pt) - Wake up non-alter tasks + * @sma: semaphore array. + * @semnum: semaphore that was modified. + * @pt: list head for the tasks that must be woken up. + * + * wake_const_ops must be called after a semaphore in a semaphore array + * was set to 0. If complex const operations are pending, wake_const_ops must + * be called with semnum = -1, as well as with the number of each modified + * semaphore. + * The tasks that must be woken up are added to @pt. The return code + * is stored in q->pid. + * The function returns 1 if at least one operation was completed successfully. + */ +static int wake_const_ops(struct sem_array *sma, int semnum, + struct list_head *pt) +{ + struct sem_queue *q; + struct list_head *walk; + struct list_head *pending_list; + int semop_completed = 0; + + if (semnum == -1) + pending_list = &sma->pending_const; + else + pending_list = &sma->sem_base[semnum].pending_const; - /* the new semaphore value */ - if (curr->semval) { - /* It is impossible that someone waits for the new value: - * - q is a previously sleeping simple operation that - * altered the array. It must be a decrement, because - * simple increments never sleep. - * - The value is not 0, thus wait-for-zero won't proceed. 
- * - If there are older (higher priority) decrements - * in the queue, then they have observed the original - * semval value and couldn't proceed. The operation - * decremented to value - thus they won't proceed either. + walk = pending_list->next; + while (walk != pending_list) { + int error; + + q = container_of(walk, struct sem_queue, list); + walk = walk->next; + + error = try_atomic_semop(sma, q->sops, q->nsops, + q->undo, q->pid); + + if (error <= 0) { + /* operation completed, remove from queue & wakeup */ + + unlink_queue(sma, q); + + wake_up_sem_queue_prepare(pt, q, error); + if (error == 0) + semop_completed = 1; + } + } + return semop_completed; +} + +/** + * do_smart_wakeup_zero(sma, sops, nsops, pt) - wakeup all wait for zero tasks + * @sma: semaphore array + * @sops: operations that were performed + * @nsops: number of operations + * @pt: list head of the tasks that must be woken up. + * + * do_smart_wakeup_zero() checks all required queue for wait-for-zero + * operations, based on the actual changes that were performed on the + * semaphore array. + * The function returns 1 if at least one operation was completed successfully. + */ +static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, + int nsops, struct list_head *pt) +{ + int i; + int semop_completed = 0; + int got_zero = 0; + + /* first: the per-semaphore queues, if known */ + if (sops) { + for (i = 0; i < nsops; i++) { + int num = sops[i].sem_num; + + if (sma->sem_base[num].semval == 0) { + got_zero = 1; + semop_completed |= wake_const_ops(sma, num, pt); + } + } + } else { + /* + * No sops means modified semaphores not known. + * Assume all were changed. */ - BUG_ON(q->sops[0].sem_op >= 0); - return 0; + for (i = 0; i < sma->sem_nsems; i++) { + if (sma->sem_base[i].semval == 0) { + got_zero = 1; + semop_completed |= wake_const_ops(sma, i, pt); + } + } } /* - * semval is 0. Check if there are wait-for-zero semops. 
- * They must be the first entries in the per-semaphore queue + * If one of the modified semaphores got 0, + * then check the global queue, too. */ - h = list_first_entry(&curr->sem_pending, struct sem_queue, list); - BUG_ON(h->nsops != 1); - BUG_ON(h->sops[0].sem_num != q->sops[0].sem_num); + if (got_zero) + semop_completed |= wake_const_ops(sma, -1, pt); - /* Yes, there is a wait-for-zero semop. Restart */ - if (h->sops[0].sem_op == 0) - return 1; - - /* Again - no-one is waiting for the new value. */ - return 0; + return semop_completed; } @@ -678,6 +755,8 @@ static int check_restart(struct sem_array *sma, struct sem_queue *q) * semaphore. * The tasks that must be woken up are added to @pt. The return code * is stored in q->pid. + * The function internally checks if const operations can now succeed. + * * The function return 1 if at least one semop was completed successfully. */ static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt) @@ -688,9 +767,9 @@ static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt) int semop_completed = 0; if (semnum == -1) - pending_list = &sma->sem_pending; + pending_list = &sma->pending_alter; else - pending_list = &sma->sem_base[semnum].sem_pending; + pending_list = &sma->sem_base[semnum].pending_alter; again: walk = pending_list->next; @@ -702,13 +781,12 @@ again: /* If we are scanning the single sop, per-semaphore list of * one semaphore and that semaphore is 0, then it is not - * necessary to scan the "alter" entries: simple increments + * necessary to scan further: simple increments * that affect only one entry succeed immediately and cannot * be in the per semaphore pending queue, and decrements * cannot be successful if the value is already 0. 
*/ - if (semnum != -1 && sma->sem_base[semnum].semval == 0 && - q->alter) + if (semnum != -1 && sma->sem_base[semnum].semval == 0) break; error = try_atomic_semop(sma, q->sops, q->nsops, @@ -724,6 +802,7 @@ again: restart = 0; } else { semop_completed = 1; + do_smart_wakeup_zero(sma, q->sops, q->nsops, pt); restart = check_restart(sma, q); } @@ -742,8 +821,8 @@ again: * @otime: force setting otime * @pt: list head of the tasks that must be woken up. * - * do_smart_update() does the required called to update_queue, based on the - * actual changes that were performed on the semaphore array. + * do_smart_update() does the required calls to update_queue and wakeup_zero, + * based on the actual changes that were performed on the semaphore array. * Note that the function does not do the actual wake-up: the caller is * responsible for calling wake_up_sem_queue_do(@pt). * It is safe to perform this call after dropping all locks. @@ -754,6 +833,8 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop int i; int progress; + otime |= do_smart_wakeup_zero(sma, sops, nsops, pt); + progress = 1; retry_global: if (sma->complex_count) { @@ -813,14 +894,14 @@ static int count_semncnt (struct sem_array * sma, ushort semnum) struct sem_queue * q; semncnt = 0; - list_for_each_entry(q, &sma->sem_base[semnum].sem_pending, list) { + list_for_each_entry(q, &sma->sem_base[semnum].pending_alter, list) { struct sembuf * sops = q->sops; BUG_ON(sops->sem_num != semnum); if ((sops->sem_op < 0) && !(sops->sem_flg & IPC_NOWAIT)) semncnt++; } - list_for_each_entry(q, &sma->sem_pending, list) { + list_for_each_entry(q, &sma->pending_alter, list) { struct sembuf * sops = q->sops; int nsops = q->nsops; int i; @@ -839,14 +920,14 @@ static int count_semzcnt (struct sem_array * sma, ushort semnum) struct sem_queue * q; semzcnt = 0; - list_for_each_entry(q, &sma->sem_base[semnum].sem_pending, list) { + list_for_each_entry(q, &sma->sem_base[semnum].pending_const, list) { 
struct sembuf * sops = q->sops; BUG_ON(sops->sem_num != semnum); if ((sops->sem_op == 0) && !(sops->sem_flg & IPC_NOWAIT)) semzcnt++; } - list_for_each_entry(q, &sma->sem_pending, list) { + list_for_each_entry(q, &sma->pending_const, list) { struct sembuf * sops = q->sops; int nsops = q->nsops; int i; @@ -884,13 +965,22 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) /* Wake up all pending processes and let them fail with EIDRM. */ INIT_LIST_HEAD(&tasks); - list_for_each_entry_safe(q, tq, &sma->sem_pending, list) { + list_for_each_entry_safe(q, tq, &sma->pending_const, list) { + unlink_queue(sma, q); + wake_up_sem_queue_prepare(&tasks, q, -EIDRM); + } + + list_for_each_entry_safe(q, tq, &sma->pending_alter, list) { unlink_queue(sma, q); wake_up_sem_queue_prepare(&tasks, q, -EIDRM); } for (i = 0; i < sma->sem_nsems; i++) { struct sem *sem = sma->sem_base + i; - list_for_each_entry_safe(q, tq, &sem->sem_pending, list) { + list_for_each_entry_safe(q, tq, &sem->pending_const, list) { + unlink_queue(sma, q); + wake_up_sem_queue_prepare(&tasks, q, -EIDRM); + } + list_for_each_entry_safe(q, tq, &sem->pending_alter, list) { unlink_queue(sma, q); wake_up_sem_queue_prepare(&tasks, q, -EIDRM); } @@ -1658,14 +1748,15 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, curr = &sma->sem_base[sops->sem_num]; if (alter) - list_add_tail(&queue.list, &curr->sem_pending); + list_add_tail(&queue.list, &curr->pending_alter); else - list_add(&queue.list, &curr->sem_pending); + list_add_tail(&queue.list, &curr->pending_const); } else { if (alter) - list_add_tail(&queue.list, &sma->sem_pending); + list_add_tail(&queue.list, &sma->pending_alter); else - list_add(&queue.list, &sma->sem_pending); + list_add_tail(&queue.list, &sma->pending_const); + sma->complex_count++; } -- cgit v0.10.2 From f269f40ad5aeee229ed70044926f44318abe41ef Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Mon, 8 Jul 2013 16:01:24 -0700 Subject: ipc/sem.c: always 
use only one queue for alter operations There are two places that can contain alter operations: - the global queue: sma->pending_alter - the per-semaphore queues: sma->sem_base[].pending_alter. Since one of the queues must be processed first, this causes an odd prioritization of the wakeups: complex operations have priority over simple ops. The patch restores the behavior of linux <=3.0.9: The longest waiting operation has the highest priority. This is done by using only one queue: - if there are complex ops, then sma->pending_alter is used. - otherwise, the per-semaphore queues are used. As a side effect, do_smart_update() becomes much simpler: no more goto logic. Signed-off-by: Manfred Spraul Cc: Rik van Riel Cc: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/ipc/sem.c b/ipc/sem.c index 4d7f88c..6291257 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -192,6 +192,53 @@ void __init sem_init (void) IPC_SEM_IDS, sysvipc_sem_proc_show); } +/** + * unmerge_queues - unmerge queues, if possible. + * @sma: semaphore array + * + * The function unmerges the wait queues if complex_count is 0. + * It must be called prior to dropping the global semaphore array lock. + */ +static void unmerge_queues(struct sem_array *sma) +{ + struct sem_queue *q, *tq; + + /* complex operations still around? */ + if (sma->complex_count) + return; + /* + * We will switch back to simple mode. + * Move all pending operation back into the per-semaphore + * queues. + */ + list_for_each_entry_safe(q, tq, &sma->pending_alter, list) { + struct sem *curr; + curr = &sma->sem_base[q->sops[0].sem_num]; + + list_add_tail(&q->list, &curr->pending_alter); + } + INIT_LIST_HEAD(&sma->pending_alter); +} + +/** + * merge_queues - Merge single semop queues into global queue + * @sma: semaphore array + * + * This function merges all per-semaphore queues into the global queue.
+ * It is necessary to achieve FIFO ordering for the pending single-sop + * operations when a multi-semop operation must sleep. + * Only the alter operations must be moved, the const operations can stay. + */ +static void merge_queues(struct sem_array *sma) +{ + int i; + for (i = 0; i < sma->sem_nsems; i++) { + struct sem *sem = sma->sem_base + i; + + list_splice_init(&sem->pending_alter, &sma->pending_alter); + } +} + /* * If the request contains only one semaphore operation, and there are * no complex transactions pending, lock only the semaphore involved. @@ -262,6 +309,7 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, static inline void sem_unlock(struct sem_array *sma, int locknum) { if (locknum == -1) { + unmerge_queues(sma); ipc_unlock_object(&sma->sem_perm); } else { struct sem *sem = sma->sem_base + locknum; @@ -831,49 +879,38 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop int otime, struct list_head *pt) { int i; - int progress; otime |= do_smart_wakeup_zero(sma, sops, nsops, pt); - progress = 1; -retry_global: - if (sma->complex_count) { - if (update_queue(sma, -1, pt)) { - progress = 1; - otime = 1; - sops = NULL; - } - } - if (!progress) - goto done; - - if (!sops) { - /* No semops; something special is going on. */ - for (i = 0; i < sma->sem_nsems; i++) { - if (update_queue(sma, i, pt)) { - otime = 1; - progress = 1; + if (!list_empty(&sma->pending_alter)) { + /* semaphore array uses the global queue - just process it. */ + otime |= update_queue(sma, -1, pt); + } else { + if (!sops) { + /* + * No sops, thus the modified semaphores are not + * known. Check all. + */ + for (i = 0; i < sma->sem_nsems; i++) + otime |= update_queue(sma, i, pt); + } else { + /* + * Check the semaphores that were increased: + * - No complex ops, thus all sleeping ops are + * decrease. 
+ * - if we decreased the value, then any sleeping + * semaphore ops wont be able to run: If the + * previous value was too small, then the new + * value will be too small, too. + */ + for (i = 0; i < nsops; i++) { + if (sops[i].sem_op > 0) { + otime |= update_queue(sma, + sops[i].sem_num, pt); + } } } - goto done_checkretry; - } - - /* Check the semaphores that were modified. */ - for (i = 0; i < nsops; i++) { - if (sops[i].sem_op > 0 || - (sops[i].sem_op < 0 && - sma->sem_base[sops[i].sem_num].semval == 0)) - if (update_queue(sma, sops[i].sem_num, pt)) { - otime = 1; - progress = 1; - } - } -done_checkretry: - if (progress) { - progress = 0; - goto retry_global; } -done: if (otime) sma->sem_otime = get_seconds(); } @@ -1747,11 +1784,22 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, struct sem *curr; curr = &sma->sem_base[sops->sem_num]; - if (alter) - list_add_tail(&queue.list, &curr->pending_alter); - else + if (alter) { + if (sma->complex_count) { + list_add_tail(&queue.list, + &sma->pending_alter); + } else { + + list_add_tail(&queue.list, + &curr->pending_alter); + } + } else { list_add_tail(&queue.list, &curr->pending_const); + } } else { + if (!sma->complex_count) + merge_queues(sma); + if (alter) list_add_tail(&queue.list, &sma->pending_alter); else -- cgit v0.10.2 From d12e1e50e47e0900dbbf52237b7e171f4f15ea1e Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Mon, 8 Jul 2013 16:01:25 -0700 Subject: ipc/sem.c: replace shared sem_otime with per-semaphore value sem_otime contains the time of the last semaphore operation that completed successfully. Every operation updates this value, thus access from multiple cpus can cause thrashing. Therefore the patch replaces the variable with a per-semaphore variable. The per-array sem_otime is only calculated when required. No performance improvement on a single-socket i3 - only important for larger systems. 
Signed-off-by: Manfred Spraul Cc: Rik van Riel Cc: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/sem.h b/include/linux/sem.h index 55e17f6..976ce3a 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -12,7 +12,6 @@ struct task_struct; struct sem_array { struct kern_ipc_perm ____cacheline_aligned_in_smp sem_perm; /* permissions .. see ipc.h */ - time_t sem_otime; /* last semop time */ time_t sem_ctime; /* last change time */ struct sem *sem_base; /* ptr to first semaphore in array */ struct list_head pending_alter; /* pending operations */ diff --git a/ipc/sem.c b/ipc/sem.c index 6291257..51352e1 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -99,6 +99,7 @@ struct sem { /* that alter the semaphore */ struct list_head pending_const; /* pending single-sop operations */ /* that do not alter the semaphore*/ + time_t sem_otime; /* candidate for sem_otime */ } ____cacheline_aligned_in_smp; /* One queue for each sleeping process in the system. 
*/ @@ -911,8 +912,14 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop } } } - if (otime) - sma->sem_otime = get_seconds(); + if (otime) { + if (sops == NULL) { + sma->sem_base[0].sem_otime = get_seconds(); + } else { + sma->sem_base[sops[0].sem_num].sem_otime = + get_seconds(); + } + } } @@ -1058,6 +1065,21 @@ static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in, } } +static time_t get_semotime(struct sem_array *sma) +{ + int i; + time_t res; + + res = sma->sem_base[0].sem_otime; + for (i = 1; i < sma->sem_nsems; i++) { + time_t to = sma->sem_base[i].sem_otime; + + if (to > res) + res = to; + } + return res; +} + static int semctl_nolock(struct ipc_namespace *ns, int semid, int cmd, int version, void __user *p) { @@ -1131,9 +1153,9 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid, goto out_unlock; kernel_to_ipc64_perm(&sma->sem_perm, &tbuf.sem_perm); - tbuf.sem_otime = sma->sem_otime; - tbuf.sem_ctime = sma->sem_ctime; - tbuf.sem_nsems = sma->sem_nsems; + tbuf.sem_otime = get_semotime(sma); + tbuf.sem_ctime = sma->sem_ctime; + tbuf.sem_nsems = sma->sem_nsems; rcu_read_unlock(); if (copy_semid_to_user(p, &tbuf, version)) return -EFAULT; @@ -2025,6 +2047,9 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it) { struct user_namespace *user_ns = seq_user_ns(s); struct sem_array *sma = it; + time_t sem_otime; + + sem_otime = get_semotime(sma); return seq_printf(s, "%10d %10d %4o %10u %5u %5u %5u %5u %10lu %10lu\n", @@ -2036,7 +2061,7 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it) from_kgid_munged(user_ns, sma->sem_perm.gid), from_kuid_munged(user_ns, sma->sem_perm.cuid), from_kgid_munged(user_ns, sma->sem_perm.cgid), - sma->sem_otime, + sem_otime, sma->sem_ctime); } #endif -- cgit v0.10.2 From 758a6ba39ef6df4cdc615e5edd7bd86eab81a5f7 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Mon, 8 Jul 2013 16:01:26 -0700 Subject: ipc/sem.c: rename try_atomic_semop() to 
perform_atomic_semop(), docu update Cleanup: Some minor points that I noticed while writing the previous patches 1) The name try_atomic_semop() is misleading: The function performs the operation (if it is possible). 2) Some documentation updates. No real code change, a rename and documentation changes. Signed-off-by: Manfred Spraul Cc: Rik van Riel Cc: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/ipc/sem.c b/ipc/sem.c index 51352e1..4108889 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -154,12 +154,15 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it); #define SEMOPM_FAST 64 /* ~ 372 bytes on stack */ /* - * linked list protection: + * Locking: * sem_undo.id_next, + * sem_array.complex_count, * sem_array.pending{_alter,_cont}, - * sem_array.sem_undo: sem_lock() for read/write + * sem_array.sem_undo: global sem_lock() for read/write * sem_undo.proc_next: only "current" is allowed to read/write that field. * + * sem_array.sem_base[i].pending_{const,alter}: + * global or semaphore sem_lock() for read/write */ #define sc_semmsl sem_ctls[0] @@ -536,12 +539,19 @@ SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg) return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params); } -/* - * Determine whether a sequence of semaphore operations would succeed - * all at once. Return 0 if yes, 1 if need to sleep, else return error code. +/** perform_atomic_semop - Perform (if possible) a semaphore operation + * @sma: semaphore array + * @sops: array with operations that should be checked + * @nsems: number of sops + * @un: undo array + * @pid: pid that did the change + * + * Returns 0 if the operation was possible. + * Returns 1 if the operation is impossible, the caller must sleep. + * Negative values are error codes. 
*/ -static int try_atomic_semop (struct sem_array * sma, struct sembuf * sops, +static int perform_atomic_semop(struct sem_array *sma, struct sembuf *sops, int nsops, struct sem_undo *un, int pid) { int result, sem_op; @@ -724,8 +734,8 @@ static int wake_const_ops(struct sem_array *sma, int semnum, q = container_of(walk, struct sem_queue, list); walk = walk->next; - error = try_atomic_semop(sma, q->sops, q->nsops, - q->undo, q->pid); + error = perform_atomic_semop(sma, q->sops, q->nsops, + q->undo, q->pid); if (error <= 0) { /* operation completed, remove from queue & wakeup */ @@ -838,7 +848,7 @@ again: if (semnum != -1 && sma->sem_base[semnum].semval == 0) break; - error = try_atomic_semop(sma, q->sops, q->nsops, + error = perform_atomic_semop(sma, q->sops, q->nsops, q->undo, q->pid); /* Does q->sleeper still need to sleep? */ @@ -1686,7 +1696,6 @@ static int get_queue_result(struct sem_queue *q) return error; } - SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, unsigned, nsops, const struct timespec __user *, timeout) { @@ -1784,7 +1793,8 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, if (un && un->semid == -1) goto out_unlock_free; - error = try_atomic_semop (sma, sops, nsops, un, task_tgid_vnr(current)); + error = perform_atomic_semop(sma, sops, nsops, un, + task_tgid_vnr(current)); if (error <= 0) { if (alter && error == 0) do_smart_update(sma, sops, nsops, 1, &tasks); -- cgit v0.10.2 From 026dadad6b44f0469a475efb4cae48269d8848bd Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 8 Jul 2013 16:01:27 -0700 Subject: mwave: fix info leak in mwave_ioctl() Smatch complains that on 64 bit systems, there is a hole in the MW_ABILITIES struct between ->component_count and ->component_list[]. It leaks stack information from the mwave_ioctl() function. I've added a memset() to initialize the struct to zero. 
Signed-off-by: Dan Carpenter Cc: Greg KH Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/char/mwave/tp3780i.c b/drivers/char/mwave/tp3780i.c index c689697..04e6d6a 100644 --- a/drivers/char/mwave/tp3780i.c +++ b/drivers/char/mwave/tp3780i.c @@ -479,6 +479,7 @@ int tp3780I_QueryAbilities(THINKPAD_BD_DATA * pBDData, MW_ABILITIES * pAbilities PRINTK_2(TRACE_TP3780I, "tp3780i::tp3780I_QueryAbilities entry pBDData %p\n", pBDData); + memset(pAbilities, 0, sizeof(*pAbilities)); /* fill out standard constant fields */ pAbilities->instr_per_sec = pBDData->rDspSettings.uIps; pAbilities->data_size = pBDData->rDspSettings.uDStoreSize; -- cgit v0.10.2 From 1d04f3c6ab6bbdc6187ba44b8a667a785b63c4f2 Mon Sep 17 00:00:00 2001 From: Philippe De Muyter Date: Mon, 8 Jul 2013 16:01:28 -0700 Subject: partitions/msdos.c: end-of-line whitespace and semicolon cleanup Signed-off-by: Philippe De Muyter Cc: Karel Zak Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 7681cd2..9bf19e6 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -90,7 +90,7 @@ static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') ret = 1; put_dev_sector(sect); - }; + } return ret; } @@ -142,7 +142,7 @@ static void parse_extended(struct parsed_partitions *state, return; if (!msdos_magic_present(data + 510)) - goto done; + goto done; p = (struct partition *) (data + 0x1be); @@ -155,7 +155,7 @@ static void parse_extended(struct parsed_partitions *state, * and OS/2 seems to use all four entries. 
*/ - /* + /* * First process the data partition(s) */ for (i=0; i<4; i++, p++) { @@ -263,7 +263,7 @@ static void parse_solaris_x86(struct parsed_partitions *state, } #if defined(CONFIG_BSD_DISKLABEL) -/* +/* * Create devices for BSD partitions listed in a disklabel, under a * dos-like partition. See parse_extended() for more information. */ @@ -294,7 +294,7 @@ static void parse_bsd(struct parsed_partitions *state, if (state->next == state->limit) break; - if (p->p_fstype == BSD_FS_UNUSED) + if (p->p_fstype == BSD_FS_UNUSED) continue; bsd_start = le32_to_cpu(p->p_offset); bsd_size = le32_to_cpu(p->p_size); @@ -441,7 +441,7 @@ static struct { {NEW_SOLARIS_X86_PARTITION, parse_solaris_x86}, {0, NULL}, }; - + int msdos_partition(struct parsed_partitions *state) { sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; -- cgit v0.10.2 From 6ceea22bbbc84fcf6bf0913bb3db8a657e9002f6 Mon Sep 17 00:00:00 2001 From: Philippe De Muyter Date: Mon, 8 Jul 2013 16:01:29 -0700 Subject: partitions: add aix lvm partition support files Add partitions/aix.h and partitions/aix.c. AIX LVM permits to make "logical volumes" which are made of multiple slices of multiple disks. The new code allows only access to the "logical volumes" which are made of one slice on the probed disk, a slice being a contiguous disk area. The code also detects "logical volumes" made of multiple slices on the probed disk, but can not describe them to the partition layer, because the partition layer generic code does not support that. When such non-contiguous "logical volumes" are detected, a diagnostic message is printed. 
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Philippe De Muyter Cc: Karel Zak Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig index 75a54e1..4cebb2f 100644 --- a/block/partitions/Kconfig +++ b/block/partitions/Kconfig @@ -68,6 +68,17 @@ config ACORN_PARTITION_RISCIX of machines called RISCiX. If you say 'Y' here, Linux will be able to read disks partitioned under RISCiX. +config AIX_PARTITION + bool "AIX basic partition table support" if PARTITION_ADVANCED + help + Say Y here if you would like to be able to read the hard disk + partition table format used by IBM or Motorola PowerPC machines + running AIX. AIX actually uses a Logical Volume Manager, where + "logical volumes" can be spread across one or multiple disks, + but this driver works only for the simple case of partitions which + are contiguous. + Otherwise, say N. + config OSF_PARTITION bool "Alpha OSF partition support" if PARTITION_ADVANCED default y if ALPHA diff --git a/block/partitions/Makefile b/block/partitions/Makefile index 03af8ea..2be4d7b 100644 --- a/block/partitions/Makefile +++ b/block/partitions/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_BLOCK) := check.o obj-$(CONFIG_ACORN_PARTITION) += acorn.o obj-$(CONFIG_AMIGA_PARTITION) += amiga.o obj-$(CONFIG_ATARI_PARTITION) += atari.o +obj-$(CONFIG_AIX_PARTITION) += aix.o obj-$(CONFIG_MAC_PARTITION) += mac.o obj-$(CONFIG_LDM_PARTITION) += ldm.o obj-$(CONFIG_MSDOS_PARTITION) += msdos.o diff --git a/block/partitions/aix.c b/block/partitions/aix.c new file mode 100644 index 0000000..43be471 --- /dev/null +++ b/block/partitions/aix.c @@ -0,0 +1,293 @@ +/* + * fs/partitions/aix.c + * + * Copyright (C) 2012-2013 Philippe De Muyter + */ + +#include "check.h" +#include "aix.h" + +struct lvm_rec { + char lvm_id[4]; /* "_LVM" */ + char reserved4[16]; + __be32 lvmarea_len; + __be32 vgda_len; + __be32 vgda_psn[2]; + char reserved36[10]; + __be16 pp_size; /* 
log2(pp_size) */ + char reserved46[12]; + __be16 version; + }; + +struct vgda { + __be32 secs; + __be32 usec; + char reserved8[16]; + __be16 numlvs; + __be16 maxlvs; + __be16 pp_size; + __be16 numpvs; + __be16 total_vgdas; + __be16 vgda_size; + }; + +struct lvd { + __be16 lv_ix; + __be16 res2; + __be16 res4; + __be16 maxsize; + __be16 lv_state; + __be16 mirror; + __be16 mirror_policy; + __be16 num_lps; + __be16 res10[8]; + }; + +struct lvname { + char name[64]; + }; + +struct ppe { + __be16 lv_ix; + unsigned short res2; + unsigned short res4; + __be16 lp_ix; + unsigned short res8[12]; + }; + +struct pvd { + char reserved0[16]; + __be16 pp_count; + char reserved18[2]; + __be32 psn_part1; + char reserved24[8]; + struct ppe ppe[1016]; + }; + +#define LVM_MAXLVS 256 + +/** + * last_lba(): return number of last logical block of device + * @bdev: block device + * + * Description: Returns last LBA value on success, 0 on error. + * This is stored (by sd and ide-geometry) in + * the part[0] entry for this disk, and is the number of + * physical sectors available on the disk. + */ +static u64 last_lba(struct block_device *bdev) +{ + if (!bdev || !bdev->bd_inode) + return 0; + return (bdev->bd_inode->i_size >> 9) - 1ULL; +} + +/** + * read_lba(): Read bytes from disk, starting at given LBA + * @state + * @lba + * @buffer + * @count + * + * Description: Reads @count bytes from @state->bdev into @buffer. + * Returns number of bytes read on success, 0 on error. 
+ */ +static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, + size_t count) +{ + size_t totalreadcount = 0; + + if (!buffer || lba + count / 512 > last_lba(state->bdev)) + return 0; + + while (count) { + int copied = 512; + Sector sect; + unsigned char *data = read_part_sector(state, lba++, &sect); + if (!data) + break; + if (copied > count) + copied = count; + memcpy(buffer, data, copied); + put_dev_sector(sect); + buffer += copied; + totalreadcount += copied; + count -= copied; + } + return totalreadcount; +} + +/** + * alloc_pvd(): reads physical volume descriptor + * @state + * @lba + * + * Description: Returns pvd on success, NULL on error. + * Allocates space for pvd and fill it with disk blocks at @lba + * Notes: remember to free pvd when you're done! + */ +static struct pvd *alloc_pvd(struct parsed_partitions *state, u32 lba) +{ + size_t count = sizeof(struct pvd); + struct pvd *p; + + p = kmalloc(count, GFP_KERNEL); + if (!p) + return NULL; + + if (read_lba(state, lba, (u8 *) p, count) < count) { + kfree(p); + return NULL; + } + return p; +} + +/** + * alloc_lvn(): reads logical volume names + * @state + * @lba + * + * Description: Returns lvn on success, NULL on error. + * Allocates space for lvn and fill it with disk blocks at @lba + * Notes: remember to free lvn when you're done!
+ */ +static struct lvname *alloc_lvn(struct parsed_partitions *state, u32 lba) +{ + size_t count = sizeof(struct lvname) * LVM_MAXLVS; + struct lvname *p; + + p = kmalloc(count, GFP_KERNEL); + if (!p) + return NULL; + + if (read_lba(state, lba, (u8 *) p, count) < count) { + kfree(p); + return NULL; + } + return p; +} + +int aix_partition(struct parsed_partitions *state) +{ + int ret = 0; + Sector sect; + unsigned char *d; + u32 pp_bytes_size; + u32 pp_blocks_size = 0; + u32 vgda_sector = 0; + u32 vgda_len = 0; + int numlvs = 0; + struct pvd *pvd; + struct lv_info { + unsigned short pps_per_lv; + unsigned short pps_found; + unsigned char lv_is_contiguous; + } *lvip; + struct lvname *n = NULL; + + d = read_part_sector(state, 7, &sect); + if (d) { + struct lvm_rec *p = (struct lvm_rec *)d; + u16 lvm_version = be16_to_cpu(p->version); + char tmp[64]; + + if (lvm_version == 1) { + int pp_size_log2 = be16_to_cpu(p->pp_size); + + pp_bytes_size = 1 << pp_size_log2; + pp_blocks_size = pp_bytes_size / 512; + snprintf(tmp, sizeof(tmp), + " AIX LVM header version %u found\n", + lvm_version); + vgda_len = be32_to_cpu(p->vgda_len); + vgda_sector = be32_to_cpu(p->vgda_psn[0]); + } else { + snprintf(tmp, sizeof(tmp), + " unsupported AIX LVM version %d found\n", + lvm_version); + } + strlcat(state->pp_buf, tmp, PAGE_SIZE); + put_dev_sector(sect); + } + if (vgda_sector && (d = read_part_sector(state, vgda_sector, &sect))) { + struct vgda *p = (struct vgda *)d; + + numlvs = be16_to_cpu(p->numlvs); + put_dev_sector(sect); + } + lvip = kzalloc(sizeof(struct lv_info) * state->limit, GFP_KERNEL); + if (!lvip) + return 0; + if (numlvs && (d = read_part_sector(state, vgda_sector + 1, &sect))) { + struct lvd *p = (struct lvd *)d; + int i; + + n = alloc_lvn(state, vgda_sector + vgda_len - 33); + if (n) { + int foundlvs = 0; + + for (i = 0; foundlvs < numlvs && i < state->limit; i += 1) { + lvip[i].pps_per_lv = be16_to_cpu(p[i].num_lps); + if (lvip[i].pps_per_lv) + foundlvs += 1; + } + } +
put_dev_sector(sect); + } + pvd = alloc_pvd(state, vgda_sector + 17); + if (pvd) { + int numpps = be16_to_cpu(pvd->pp_count); + int psn_part1 = be32_to_cpu(pvd->psn_part1); + int i; + int cur_lv_ix = -1; + int next_lp_ix = 1; + int lp_ix; + + for (i = 0; i < numpps; i += 1) { + struct ppe *p = pvd->ppe + i; + unsigned int lv_ix; + + lp_ix = be16_to_cpu(p->lp_ix); + if (!lp_ix) { + next_lp_ix = 1; + continue; + } + lv_ix = be16_to_cpu(p->lv_ix) - 1; + if (lv_ix > state->limit) { + cur_lv_ix = -1; + continue; + } + lvip[lv_ix].pps_found += 1; + if (lp_ix == 1) { + cur_lv_ix = lv_ix; + next_lp_ix = 1; + } else if (lv_ix != cur_lv_ix || lp_ix != next_lp_ix) { + next_lp_ix = 1; + continue; + } + if (lp_ix == lvip[lv_ix].pps_per_lv) { + char tmp[70]; + + put_partition(state, lv_ix + 1, + (i + 1 - lp_ix) * pp_blocks_size + psn_part1, + lvip[lv_ix].pps_per_lv * pp_blocks_size); + snprintf(tmp, sizeof(tmp), " <%s>\n", + n[lv_ix].name); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + lvip[lv_ix].lv_is_contiguous = 1; + ret = 1; + next_lp_ix = 1; + } else + next_lp_ix += 1; + } + for (i = 0; i < state->limit; i += 1) + if (lvip[i].pps_found && !lvip[i].lv_is_contiguous) + pr_warn("partition %s (%u pp's found) is " + "not contiguous\n", + n[i].name, lvip[i].pps_found); + kfree(pvd); + } + kfree(n); + kfree(lvip); + return ret; +} diff --git a/block/partitions/aix.h b/block/partitions/aix.h new file mode 100644 index 0000000..e0c66a9 --- /dev/null +++ b/block/partitions/aix.h @@ -0,0 +1 @@ +extern int aix_partition(struct parsed_partitions *state); -- cgit v0.10.2 From f8f066033b015a744065f6c7ed83741b4760376b Mon Sep 17 00:00:00 2001 From: Philippe De Muyter Date: Mon, 8 Jul 2013 16:01:30 -0700 Subject: partitions/msdos: enumerate also AIX LVM partitions Graft AIX partitions enumeration into partitions/msdos.c There is already a AIX disks detection logic in msdos.c. When an AIX disk has been found, and if configured to, call the aix partitions recognizer. 
This avoids removal of AIX disks protection from msdos.c, avoids code duplication, and ensures that AIX partitions enumeration is called before plain msdos partitions enumeration. Signed-off-by: Philippe De Muyter Cc: Karel Zak Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 9bf19e6..9123f25 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -23,6 +23,7 @@ #include "check.h" #include "msdos.h" #include "efi.h" +#include "aix.h" /* * Many architectures don't like unaligned accesses, while @@ -462,8 +463,12 @@ int msdos_partition(struct parsed_partitions *state) */ if (aix_magic_present(state, data)) { put_dev_sector(sect); +#ifdef CONFIG_AIX_PARTITION + return aix_partition(state); +#else strlcat(state->pp_buf, " [AIX]", PAGE_SIZE); return 0; +#endif } if (!msdos_magic_present(data + 510)) { -- cgit v0.10.2 From 0efbee70890c992f31a7b294ac654ff6c62d51c5 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Mon, 8 Jul 2013 16:01:31 -0700 Subject: reboot: remove -stable friendly PF_THREAD_BOUND define Remove the prior patch's #define for easier backporting to the stable releases. Signed-off-by: Robin Holt Cc: H. Peter Anvin Cc: Russ Anderson Cc: Robin Holt Cc: Russell King Cc: Guan Xuetao Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/sys.c b/kernel/sys.c index 071de90..b882440 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -362,11 +362,6 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); -/* Add backwards compatibility for stable trees. 
*/ -#ifndef PF_NO_SETAFFINITY -#define PF_NO_SETAFFINITY PF_THREAD_BOUND -#endif - static void migrate_to_reboot_cpu(void) { /* The boot cpu is always logical cpu 0 */ -- cgit v0.10.2 From 15d94b82565ebfb0cf27830b96e6cf5ed2d12a9a Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Mon, 8 Jul 2013 16:01:32 -0700 Subject: reboot: move shutdown/reboot related functions to kernel/reboot.c This patch is preparatory. It moves reboot related syscall, etc functions from kernel/sys.c to kernel/reboot.c. Signed-off-by: Robin Holt Cc: H. Peter Anvin Cc: Russ Anderson Cc: Robin Holt Cc: Russell King Cc: Guan Xuetao Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/Makefile b/kernel/Makefile index 271fd31..470839d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,7 +9,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ - notifier.o ksysfs.o cred.o \ + notifier.o ksysfs.o cred.o reboot.o \ async.o range.o groups.o lglock.o smpboot.o ifdef CONFIG_FUNCTION_TRACER diff --git a/kernel/reboot.c b/kernel/reboot.c new file mode 100644 index 0000000..37d2636 --- /dev/null +++ b/kernel/reboot.c @@ -0,0 +1,346 @@ +/* + * linux/kernel/reboot.c + * + * Copyright (C) 2013 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * this indicates whether you can reboot with ctrl-alt-del: the default is yes + */ + +int C_A_D = 1; +struct pid *cad_pid; +EXPORT_SYMBOL(cad_pid); + +/* + * If set, this is used for preparing the system to power off. + */ + +void (*pm_power_off_prepare)(void); + +/** + * emergency_restart - reboot the system + * + * Without shutting down any hardware or taking any locks + * reboot the system. This is called when we know we are in + * trouble so this is our best effort to reboot. 
This is + * safe to call in interrupt context. + */ +void emergency_restart(void) +{ + kmsg_dump(KMSG_DUMP_EMERG); + machine_emergency_restart(); +} +EXPORT_SYMBOL_GPL(emergency_restart); + +void kernel_restart_prepare(char *cmd) +{ + blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); + system_state = SYSTEM_RESTART; + usermodehelper_disable(); + device_shutdown(); +} + +/** + * register_reboot_notifier - Register function to be called at reboot time + * @nb: Info about notifier function to be called + * + * Registers a function with the list of functions + * to be called at reboot time. + * + * Currently always returns zero, as blocking_notifier_chain_register() + * always returns zero. + */ +int register_reboot_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&reboot_notifier_list, nb); +} +EXPORT_SYMBOL(register_reboot_notifier); + +/** + * unregister_reboot_notifier - Unregister previously registered reboot notifier + * @nb: Hook to be unregistered + * + * Unregisters a previously registered reboot + * notifier function. + * + * Returns zero on success, or %-ENOENT on failure. + */ +int unregister_reboot_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); +} +EXPORT_SYMBOL(unregister_reboot_notifier); + +static void migrate_to_reboot_cpu(void) +{ + /* The boot cpu is always logical cpu 0 */ + int cpu = 0; + + cpu_hotplug_disable(); + + /* Make certain the cpu I'm about to reboot on is online */ + if (!cpu_online(cpu)) + cpu = cpumask_first(cpu_online_mask); + + /* Prevent races with other tasks migrating this task */ + current->flags |= PF_NO_SETAFFINITY; + + /* Make certain I only run on the appropriate processor */ + set_cpus_allowed_ptr(current, cpumask_of(cpu)); +} + +/** + * kernel_restart - reboot the system + * @cmd: pointer to buffer containing command to execute for restart + * or %NULL + * + * Shutdown everything and perform a clean reboot. 
+ * This is not safe to call in interrupt context. + */ +void kernel_restart(char *cmd) +{ + kernel_restart_prepare(cmd); + migrate_to_reboot_cpu(); + syscore_shutdown(); + if (!cmd) + printk(KERN_EMERG "Restarting system.\n"); + else + printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); + kmsg_dump(KMSG_DUMP_RESTART); + machine_restart(cmd); +} +EXPORT_SYMBOL_GPL(kernel_restart); + +static void kernel_shutdown_prepare(enum system_states state) +{ + blocking_notifier_call_chain(&reboot_notifier_list, + (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); + system_state = state; + usermodehelper_disable(); + device_shutdown(); +} +/** + * kernel_halt - halt the system + * + * Shutdown everything and perform a clean system halt. + */ +void kernel_halt(void) +{ + kernel_shutdown_prepare(SYSTEM_HALT); + migrate_to_reboot_cpu(); + syscore_shutdown(); + printk(KERN_EMERG "System halted.\n"); + kmsg_dump(KMSG_DUMP_HALT); + machine_halt(); +} + +EXPORT_SYMBOL_GPL(kernel_halt); + +/** + * kernel_power_off - power_off the system + * + * Shutdown everything and perform a clean system power_off. + */ +void kernel_power_off(void) +{ + kernel_shutdown_prepare(SYSTEM_POWER_OFF); + if (pm_power_off_prepare) + pm_power_off_prepare(); + migrate_to_reboot_cpu(); + syscore_shutdown(); + printk(KERN_EMERG "Power down.\n"); + kmsg_dump(KMSG_DUMP_POWEROFF); + machine_power_off(); +} +EXPORT_SYMBOL_GPL(kernel_power_off); + +static DEFINE_MUTEX(reboot_mutex); + +/* + * Reboot system call: for obvious reasons only root may call it, + * and even root needs to set up some magic numbers in the registers + * so that some mistake won't make this reboot the whole machine. + * You can also set the meaning of the ctrl-alt-del-key here. + * + * reboot doesn't sync: do that yourself before calling this. 
+ */ +SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, + void __user *, arg) +{ + struct pid_namespace *pid_ns = task_active_pid_ns(current); + char buffer[256]; + int ret = 0; + + /* We only trust the superuser with rebooting the system. */ + if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT)) + return -EPERM; + + /* For safety, we require "magic" arguments. */ + if (magic1 != LINUX_REBOOT_MAGIC1 || + (magic2 != LINUX_REBOOT_MAGIC2 && + magic2 != LINUX_REBOOT_MAGIC2A && + magic2 != LINUX_REBOOT_MAGIC2B && + magic2 != LINUX_REBOOT_MAGIC2C)) + return -EINVAL; + + /* + * If pid namespaces are enabled and the current task is in a child + * pid_namespace, the command is handled by reboot_pid_ns() which will + * call do_exit(). + */ + ret = reboot_pid_ns(pid_ns, cmd); + if (ret) + return ret; + + /* Instead of trying to make the power_off code look like + * halt when pm_power_off is not set do it the easy way. + */ + if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) + cmd = LINUX_REBOOT_CMD_HALT; + + mutex_lock(&reboot_mutex); + switch (cmd) { + case LINUX_REBOOT_CMD_RESTART: + kernel_restart(NULL); + break; + + case LINUX_REBOOT_CMD_CAD_ON: + C_A_D = 1; + break; + + case LINUX_REBOOT_CMD_CAD_OFF: + C_A_D = 0; + break; + + case LINUX_REBOOT_CMD_HALT: + kernel_halt(); + do_exit(0); + panic("cannot halt"); + + case LINUX_REBOOT_CMD_POWER_OFF: + kernel_power_off(); + do_exit(0); + break; + + case LINUX_REBOOT_CMD_RESTART2: + if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { + ret = -EFAULT; + break; + } + buffer[sizeof(buffer) - 1] = '\0'; + + kernel_restart(buffer); + break; + +#ifdef CONFIG_KEXEC + case LINUX_REBOOT_CMD_KEXEC: + ret = kernel_kexec(); + break; +#endif + +#ifdef CONFIG_HIBERNATION + case LINUX_REBOOT_CMD_SW_SUSPEND: + ret = hibernate(); + break; +#endif + + default: + ret = -EINVAL; + break; + } + mutex_unlock(&reboot_mutex); + return ret; +} + +static void deferred_cad(struct work_struct *dummy) +{ + 
kernel_restart(NULL); +} + +/* + * This function gets called by ctrl-alt-del - ie the keyboard interrupt. + * As it's called within an interrupt, it may NOT sync: the only choice + * is whether to reboot at once, or just ignore the ctrl-alt-del. + */ +void ctrl_alt_del(void) +{ + static DECLARE_WORK(cad_work, deferred_cad); + + if (C_A_D) + schedule_work(&cad_work); + else + kill_cad_pid(SIGINT, 1); +} + +char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; + +static int __orderly_poweroff(bool force) +{ + char **argv; + static char *envp[] = { + "HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL + }; + int ret; + + argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); + if (argv) { + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + argv_free(argv); + } else { + printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", + __func__, poweroff_cmd); + ret = -ENOMEM; + } + + if (ret && force) { + printk(KERN_WARNING "Failed to start orderly shutdown: " + "forcing the issue\n"); + /* + * I guess this should try to kick off some daemon to sync and + * poweroff asap. Or not even bother syncing if we're doing an + * emergency shutdown? + */ + emergency_sync(); + kernel_power_off(); + } + + return ret; +} + +static bool poweroff_force; + +static void poweroff_work_func(struct work_struct *work) +{ + __orderly_poweroff(poweroff_force); +} + +static DECLARE_WORK(poweroff_work, poweroff_work_func); + +/** + * orderly_poweroff - Trigger an orderly system poweroff + * @force: force poweroff if command execution fails + * + * This may be called from any context to trigger a system shutdown. + * If the orderly shutdown fails, it will force an immediate shutdown. 
+ */ +int orderly_poweroff(bool force) +{ + if (force) /* do not override the pending "true" */ + poweroff_force = true; + schedule_work(&poweroff_work); + return 0; +} +EXPORT_SYMBOL_GPL(orderly_poweroff); diff --git a/kernel/sys.c b/kernel/sys.c index b882440..771129b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -116,20 +116,6 @@ EXPORT_SYMBOL(fs_overflowuid); EXPORT_SYMBOL(fs_overflowgid); /* - * this indicates whether you can reboot with ctrl-alt-del: the default is yes - */ - -int C_A_D = 1; -struct pid *cad_pid; -EXPORT_SYMBOL(cad_pid); - -/* - * If set, this is used for preparing the system to power off. - */ - -void (*pm_power_off_prepare)(void); - -/* * Returns true if current's euid is same as p's uid or euid, * or has CAP_SYS_NICE to p's user_ns. * @@ -308,261 +294,6 @@ out_unlock: return retval; } -/** - * emergency_restart - reboot the system - * - * Without shutting down any hardware or taking any locks - * reboot the system. This is called when we know we are in - * trouble so this is our best effort to reboot. This is - * safe to call in interrupt context. - */ -void emergency_restart(void) -{ - kmsg_dump(KMSG_DUMP_EMERG); - machine_emergency_restart(); -} -EXPORT_SYMBOL_GPL(emergency_restart); - -void kernel_restart_prepare(char *cmd) -{ - blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); - system_state = SYSTEM_RESTART; - usermodehelper_disable(); - device_shutdown(); -} - -/** - * register_reboot_notifier - Register function to be called at reboot time - * @nb: Info about notifier function to be called - * - * Registers a function with the list of functions - * to be called at reboot time. - * - * Currently always returns zero, as blocking_notifier_chain_register() - * always returns zero. 
- */ -int register_reboot_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_register(&reboot_notifier_list, nb); -} -EXPORT_SYMBOL(register_reboot_notifier); - -/** - * unregister_reboot_notifier - Unregister previously registered reboot notifier - * @nb: Hook to be unregistered - * - * Unregisters a previously registered reboot - * notifier function. - * - * Returns zero on success, or %-ENOENT on failure. - */ -int unregister_reboot_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); -} -EXPORT_SYMBOL(unregister_reboot_notifier); - -static void migrate_to_reboot_cpu(void) -{ - /* The boot cpu is always logical cpu 0 */ - int cpu = 0; - - cpu_hotplug_disable(); - - /* Make certain the cpu I'm about to reboot on is online */ - if (!cpu_online(cpu)) - cpu = cpumask_first(cpu_online_mask); - - /* Prevent races with other tasks migrating this task */ - current->flags |= PF_NO_SETAFFINITY; - - /* Make certain I only run on the appropriate processor */ - set_cpus_allowed_ptr(current, cpumask_of(cpu)); -} - -/** - * kernel_restart - reboot the system - * @cmd: pointer to buffer containing command to execute for restart - * or %NULL - * - * Shutdown everything and perform a clean reboot. - * This is not safe to call in interrupt context. 
- */ -void kernel_restart(char *cmd) -{ - kernel_restart_prepare(cmd); - migrate_to_reboot_cpu(); - syscore_shutdown(); - if (!cmd) - printk(KERN_EMERG "Restarting system.\n"); - else - printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); - kmsg_dump(KMSG_DUMP_RESTART); - machine_restart(cmd); -} -EXPORT_SYMBOL_GPL(kernel_restart); - -static void kernel_shutdown_prepare(enum system_states state) -{ - blocking_notifier_call_chain(&reboot_notifier_list, - (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); - system_state = state; - usermodehelper_disable(); - device_shutdown(); -} -/** - * kernel_halt - halt the system - * - * Shutdown everything and perform a clean system halt. - */ -void kernel_halt(void) -{ - kernel_shutdown_prepare(SYSTEM_HALT); - migrate_to_reboot_cpu(); - syscore_shutdown(); - printk(KERN_EMERG "System halted.\n"); - kmsg_dump(KMSG_DUMP_HALT); - machine_halt(); -} - -EXPORT_SYMBOL_GPL(kernel_halt); - -/** - * kernel_power_off - power_off the system - * - * Shutdown everything and perform a clean system power_off. - */ -void kernel_power_off(void) -{ - kernel_shutdown_prepare(SYSTEM_POWER_OFF); - if (pm_power_off_prepare) - pm_power_off_prepare(); - migrate_to_reboot_cpu(); - syscore_shutdown(); - printk(KERN_EMERG "Power down.\n"); - kmsg_dump(KMSG_DUMP_POWEROFF); - machine_power_off(); -} -EXPORT_SYMBOL_GPL(kernel_power_off); - -static DEFINE_MUTEX(reboot_mutex); - -/* - * Reboot system call: for obvious reasons only root may call it, - * and even root needs to set up some magic numbers in the registers - * so that some mistake won't make this reboot the whole machine. - * You can also set the meaning of the ctrl-alt-del-key here. - * - * reboot doesn't sync: do that yourself before calling this. 
- */ -SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, - void __user *, arg) -{ - struct pid_namespace *pid_ns = task_active_pid_ns(current); - char buffer[256]; - int ret = 0; - - /* We only trust the superuser with rebooting the system. */ - if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT)) - return -EPERM; - - /* For safety, we require "magic" arguments. */ - if (magic1 != LINUX_REBOOT_MAGIC1 || - (magic2 != LINUX_REBOOT_MAGIC2 && - magic2 != LINUX_REBOOT_MAGIC2A && - magic2 != LINUX_REBOOT_MAGIC2B && - magic2 != LINUX_REBOOT_MAGIC2C)) - return -EINVAL; - - /* - * If pid namespaces are enabled and the current task is in a child - * pid_namespace, the command is handled by reboot_pid_ns() which will - * call do_exit(). - */ - ret = reboot_pid_ns(pid_ns, cmd); - if (ret) - return ret; - - /* Instead of trying to make the power_off code look like - * halt when pm_power_off is not set do it the easy way. - */ - if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) - cmd = LINUX_REBOOT_CMD_HALT; - - mutex_lock(&reboot_mutex); - switch (cmd) { - case LINUX_REBOOT_CMD_RESTART: - kernel_restart(NULL); - break; - - case LINUX_REBOOT_CMD_CAD_ON: - C_A_D = 1; - break; - - case LINUX_REBOOT_CMD_CAD_OFF: - C_A_D = 0; - break; - - case LINUX_REBOOT_CMD_HALT: - kernel_halt(); - do_exit(0); - panic("cannot halt.\n"); - - case LINUX_REBOOT_CMD_POWER_OFF: - kernel_power_off(); - do_exit(0); - break; - - case LINUX_REBOOT_CMD_RESTART2: - if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { - ret = -EFAULT; - break; - } - buffer[sizeof(buffer) - 1] = '\0'; - - kernel_restart(buffer); - break; - -#ifdef CONFIG_KEXEC - case LINUX_REBOOT_CMD_KEXEC: - ret = kernel_kexec(); - break; -#endif - -#ifdef CONFIG_HIBERNATION - case LINUX_REBOOT_CMD_SW_SUSPEND: - ret = hibernate(); - break; -#endif - - default: - ret = -EINVAL; - break; - } - mutex_unlock(&reboot_mutex); - return ret; -} - -static void deferred_cad(struct work_struct *dummy) -{ - 
kernel_restart(NULL); -} - -/* - * This function gets called by ctrl-alt-del - ie the keyboard interrupt. - * As it's called within an interrupt, it may NOT sync: the only choice - * is whether to reboot at once, or just ignore the ctrl-alt-del. - */ -void ctrl_alt_del(void) -{ - static DECLARE_WORK(cad_work, deferred_cad); - - if (C_A_D) - schedule_work(&cad_work); - else - kill_cad_pid(SIGINT, 1); -} - /* * Unprivileged users may change the real gid to the effective gid * or vice versa. (BSD-style) @@ -2287,68 +2018,6 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, return err ? -EFAULT : 0; } -char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; - -static int __orderly_poweroff(bool force) -{ - char **argv; - static char *envp[] = { - "HOME=/", - "PATH=/sbin:/bin:/usr/sbin:/usr/bin", - NULL - }; - int ret; - - argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); - if (argv) { - ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); - argv_free(argv); - } else { - printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", - __func__, poweroff_cmd); - ret = -ENOMEM; - } - - if (ret && force) { - printk(KERN_WARNING "Failed to start orderly shutdown: " - "forcing the issue\n"); - /* - * I guess this should try to kick off some daemon to sync and - * poweroff asap. Or not even bother syncing if we're doing an - * emergency shutdown? - */ - emergency_sync(); - kernel_power_off(); - } - - return ret; -} - -static bool poweroff_force; - -static void poweroff_work_func(struct work_struct *work) -{ - __orderly_poweroff(poweroff_force); -} - -static DECLARE_WORK(poweroff_work, poweroff_work_func); - -/** - * orderly_poweroff - Trigger an orderly system poweroff - * @force: force poweroff if command execution fails - * - * This may be called from any context to trigger a system shutdown. - * If the orderly shutdown fails, it will force an immediate shutdown. 
- */ -int orderly_poweroff(bool force) -{ - if (force) /* do not override the pending "true" */ - poweroff_force = true; - schedule_work(&poweroff_work); - return 0; -} -EXPORT_SYMBOL_GPL(orderly_poweroff); - /** * do_sysinfo - fill in sysinfo struct * @info: pointer to buffer to fill -- cgit v0.10.2 From 972ee83df88a7fd84c228a31b4f9611299898984 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Mon, 8 Jul 2013 16:01:34 -0700 Subject: reboot: checkpatch.pl the new kernel/reboot.c file Get the new file to pass scripts/checkpatch.pl Signed-off-by: Robin Holt Cc: H. Peter Anvin Cc: Russ Anderson Cc: Robin Holt Cc: Russell King Cc: Guan Xuetao Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 23b3630..c6eba21 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -26,7 +26,7 @@ extern void machine_shutdown(void); struct pt_regs; extern void machine_crash_shutdown(struct pt_regs *); -/* +/* * Architecture independent implemenations of sys_reboot commands. */ diff --git a/kernel/reboot.c b/kernel/reboot.c index 37d2636..abb6a04 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -4,6 +4,8 @@ * Copyright (C) 2013 Linus Torvalds */ +#define pr_fmt(fmt) "reboot: " fmt + #include #include #include @@ -114,9 +116,9 @@ void kernel_restart(char *cmd) migrate_to_reboot_cpu(); syscore_shutdown(); if (!cmd) - printk(KERN_EMERG "Restarting system.\n"); + pr_emerg("Restarting system\n"); else - printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); + pr_emerg("Restarting system with command '%s'\n", cmd); kmsg_dump(KMSG_DUMP_RESTART); machine_restart(cmd); } @@ -125,7 +127,7 @@ EXPORT_SYMBOL_GPL(kernel_restart); static void kernel_shutdown_prepare(enum system_states state) { blocking_notifier_call_chain(&reboot_notifier_list, - (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); + (state == SYSTEM_HALT) ? 
SYS_HALT : SYS_POWER_OFF, NULL); system_state = state; usermodehelper_disable(); device_shutdown(); @@ -140,11 +142,10 @@ void kernel_halt(void) kernel_shutdown_prepare(SYSTEM_HALT); migrate_to_reboot_cpu(); syscore_shutdown(); - printk(KERN_EMERG "System halted.\n"); + pr_emerg("System halted\n"); kmsg_dump(KMSG_DUMP_HALT); machine_halt(); } - EXPORT_SYMBOL_GPL(kernel_halt); /** @@ -159,7 +160,7 @@ void kernel_power_off(void) pm_power_off_prepare(); migrate_to_reboot_cpu(); syscore_shutdown(); - printk(KERN_EMERG "Power down.\n"); + pr_emerg("Power down\n"); kmsg_dump(KMSG_DUMP_POWEROFF); machine_power_off(); } @@ -188,10 +189,10 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, /* For safety, we require "magic" arguments. */ if (magic1 != LINUX_REBOOT_MAGIC1 || - (magic2 != LINUX_REBOOT_MAGIC2 && - magic2 != LINUX_REBOOT_MAGIC2A && + (magic2 != LINUX_REBOOT_MAGIC2 && + magic2 != LINUX_REBOOT_MAGIC2A && magic2 != LINUX_REBOOT_MAGIC2B && - magic2 != LINUX_REBOOT_MAGIC2C)) + magic2 != LINUX_REBOOT_MAGIC2C)) return -EINVAL; /* @@ -234,7 +235,8 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, break; case LINUX_REBOOT_CMD_RESTART2: - if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { + ret = strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1); + if (ret < 0) { ret = -EFAULT; break; } @@ -300,14 +302,11 @@ static int __orderly_poweroff(bool force) ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); argv_free(argv); } else { - printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", - __func__, poweroff_cmd); ret = -ENOMEM; } if (ret && force) { - printk(KERN_WARNING "Failed to start orderly shutdown: " - "forcing the issue\n"); + pr_warn("Failed to start orderly shutdown: forcing the issue\n"); /* * I guess this should try to kick off some daemon to sync and * poweroff asap. 
Or not even bother syncing if we're doing an -- cgit v0.10.2 From edf2b1394611fef7806d4af72179dc3ac101f275 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Mon, 8 Jul 2013 16:01:35 -0700 Subject: reboot: x86: prepare reboot_mode for moving to generic kernel code Prepare for moving the parsing of reboot= to the generic kernel code by making reboot_mode into a more generic form. Signed-off-by: Robin Holt Cc: H. Peter Anvin Cc: Miguel Boton Cc: Russ Anderson Cc: Robin Holt Cc: Russell King Cc: Guan Xuetao Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 76fa1e9..f770340 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -36,7 +36,7 @@ void (*pm_power_off)(void); EXPORT_SYMBOL(pm_power_off); static const struct desc_ptr no_idt = {}; -static int reboot_mode; +static enum reboot_mode reboot_mode; enum reboot_type reboot_type = BOOT_ACPI; int reboot_force; @@ -88,11 +88,11 @@ static int __init reboot_setup(char *str) switch (*str) { case 'w': - reboot_mode = 0x1234; + reboot_mode = REBOOT_WARM; break; case 'c': - reboot_mode = 0; + reboot_mode = REBOOT_COLD; break; #ifdef CONFIG_SMP @@ -536,6 +536,7 @@ static void native_machine_emergency_restart(void) int i; int attempt = 0; int orig_reboot_type = reboot_type; + unsigned short mode; if (reboot_emergency) emergency_vmx_disable_all(); @@ -543,7 +544,8 @@ static void native_machine_emergency_restart(void) tboot_shutdown(TB_SHUTDOWN_REBOOT); /* Tell the BIOS if we want cold or warm reboot */ - *((unsigned short *)__va(0x472)) = reboot_mode; + mode = reboot_mode == REBOOT_WARM ? 0x1234 : 0; + *((unsigned short *)__va(0x472)) = mode; for (;;) { /* Could also try the reset bit in the Hammer NB */ @@ -585,7 +587,7 @@ static void native_machine_emergency_restart(void) case BOOT_EFI: if (efi_enabled(EFI_RUNTIME_SERVICES)) - efi.reset_system(reboot_mode ? + efi.reset_system(reboot_mode == REBOOT_WARM ?
EFI_RESET_WARM : EFI_RESET_COLD, EFI_SUCCESS, 0, NULL); diff --git a/include/linux/reboot.h b/include/linux/reboot.h index c6eba21..37d56c3 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -10,6 +10,11 @@ #define SYS_HALT 0x0002 /* Notify of system halt */ #define SYS_POWER_OFF 0x0003 /* Notify of system power off */ +enum reboot_mode { + REBOOT_COLD = 0, + REBOOT_WARM, +}; + extern int register_reboot_notifier(struct notifier_block *); extern int unregister_reboot_notifier(struct notifier_block *); -- cgit v0.10.2 From c97a7008517abb7c805fbdd49410032a652def26 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Mon, 8 Jul 2013 16:01:36 -0700 Subject: reboot: unicore32: prepare reboot_mode for moving to generic kernel code Prepare for moving the parsing of reboot= to the generic kernel code by making reboot_mode into a more generic form. Signed-off-by: Robin Holt Cc: Guan Xuetao Cc: Russ Anderson Cc: Robin Holt Cc: Russell King Cc: H. Peter Anvin Acked-by: Guan Xuetao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c index c944769..93dd035 100644 --- a/arch/unicore32/kernel/process.c +++ b/arch/unicore32/kernel/process.c @@ -51,14 +51,14 @@ void arch_cpu_idle(void) local_irq_enable(); } -static char reboot_mode = 'h'; +static enum reboot_mode reboot_mode = REBOOT_HARD; int __init reboot_setup(char *str) { - reboot_mode = str[0]; + if ('s' == str[0]) + reboot_mode = REBOOT_SOFT; return 1; } - __setup("reboot=", reboot_setup); void machine_halt(void) @@ -88,7 +88,7 @@ void machine_restart(char *cmd) * we may need it to insert some 1:1 mappings so that * soft boot works. */ - setup_mm_for_reboot(reboot_mode); + setup_mm_for_reboot(); /* Clean and invalidate caches */ flush_cache_all(); @@ -102,7 +102,7 @@ void machine_restart(char *cmd) /* * Now handle reboot code. 
*/ - if (reboot_mode == 's') { + if (reboot_mode == REBOOT_SOFT) { /* Jump into ROM at address 0xffff0000 */ cpu_reset(VECTORS_BASE); } else { diff --git a/arch/unicore32/kernel/setup.h b/arch/unicore32/kernel/setup.h index 30f749d..f5c51b8 100644 --- a/arch/unicore32/kernel/setup.h +++ b/arch/unicore32/kernel/setup.h @@ -22,7 +22,7 @@ extern void puv3_ps2_init(void); extern void pci_puv3_preinit(void); extern void __init puv3_init_gpio(void); -extern void setup_mm_for_reboot(char mode); +extern void setup_mm_for_reboot(void); extern char __stubs_start[], __stubs_end[]; extern char __vectors_start[], __vectors_end[]; diff --git a/arch/unicore32/mm/mmu.c b/arch/unicore32/mm/mmu.c index 43c20b4..4f5a532 100644 --- a/arch/unicore32/mm/mmu.c +++ b/arch/unicore32/mm/mmu.c @@ -445,7 +445,7 @@ void __init paging_init(void) * the user-mode pages. This will then ensure that we have predictable * results when turning the mmu off */ -void setup_mm_for_reboot(char mode) +void setup_mm_for_reboot(void) { unsigned long base_pmdval; pgd_t *pgd; diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 37d56c3..ca29a6f 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -13,6 +13,8 @@ enum reboot_mode { REBOOT_COLD = 0, REBOOT_WARM, + REBOOT_HARD, + REBOOT_SOFT, }; extern int register_reboot_notifier(struct notifier_block *); -- cgit v0.10.2 From 58591942789abe1ea18e3fb1e8d8502c70060c29 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Mon, 8 Jul 2013 16:01:38 -0700 Subject: reboot: arm: remove unused restart_mode fields from some arm subarchs These restart_mode fields are not used at all. Remove them to make moving the reboot= cmdline options to the general kernel easier. Signed-off-by: Robin Holt Cc: Russell King Cc: Russ Anderson Cc: Robin Holt Cc: H. 
Peter Anvin Cc: Guan Xuetao Acked-by: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/arm/mach-ebsa110/core.c b/arch/arm/mach-ebsa110/core.c index 8a53f34..41d2d90 100644 --- a/arch/arm/mach-ebsa110/core.c +++ b/arch/arm/mach-ebsa110/core.c @@ -321,7 +321,6 @@ MACHINE_START(EBSA110, "EBSA110") .atag_offset = 0x400, .reserve_lp0 = 1, .reserve_lp2 = 1, - .restart_mode = 's', .map_io = ebsa110_map_io, .init_early = ebsa110_init_early, .init_irq = ebsa110_init_irq, diff --git a/arch/arm/mach-pxa/mioa701.c b/arch/arm/mach-pxa/mioa701.c index 654b0ac..e6b0a93 100644 --- a/arch/arm/mach-pxa/mioa701.c +++ b/arch/arm/mach-pxa/mioa701.c @@ -761,7 +761,6 @@ static void mioa701_machine_exit(void) MACHINE_START(MIOA701, "MIO A701") .atag_offset = 0x100, - .restart_mode = 's', .map_io = &pxa27x_map_io, .nr_irqs = PXA_NR_IRQS, .init_irq = &pxa27x_init_irq, diff --git a/arch/arm/mach-pxa/spitz.c b/arch/arm/mach-pxa/spitz.c index 362726c..c3c0042 100644 --- a/arch/arm/mach-pxa/spitz.c +++ b/arch/arm/mach-pxa/spitz.c @@ -979,7 +979,6 @@ static void __init spitz_fixup(struct tag *tags, char **cmdline, #ifdef CONFIG_MACH_SPITZ MACHINE_START(SPITZ, "SHARP Spitz") - .restart_mode = 'g', .fixup = spitz_fixup, .map_io = pxa27x_map_io, .nr_irqs = PXA_NR_IRQS, @@ -993,7 +992,6 @@ MACHINE_END #ifdef CONFIG_MACH_BORZOI MACHINE_START(BORZOI, "SHARP Borzoi") - .restart_mode = 'g', .fixup = spitz_fixup, .map_io = pxa27x_map_io, .nr_irqs = PXA_NR_IRQS, @@ -1007,7 +1005,6 @@ MACHINE_END #ifdef CONFIG_MACH_AKITA MACHINE_START(AKITA, "SHARP Akita") - .restart_mode = 'g', .fixup = spitz_fixup, .map_io = pxa27x_map_io, .nr_irqs = PXA_NR_IRQS, diff --git a/arch/arm/mach-pxa/tosa.c b/arch/arm/mach-pxa/tosa.c index 3d91d2e..a41992f 100644 --- a/arch/arm/mach-pxa/tosa.c +++ b/arch/arm/mach-pxa/tosa.c @@ -969,7 +969,6 @@ static void __init fixup_tosa(struct tag *tags, char **cmdline, } MACHINE_START(TOSA, "SHARP Tosa") - .restart_mode = 'g', .fixup = fixup_tosa, 
.map_io = pxa25x_map_io, .nr_irqs = TOSA_NR_IRQS, -- cgit v0.10.2 From 16d6d5b00ee75307bab7e4ede9452c97b28f30e2 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Mon, 8 Jul 2013 16:01:39 -0700 Subject: reboot: arm: prepare reboot_mode for moving to generic kernel code Prepare for moving the parsing of reboot= to the generic kernel code by making reboot_mode into a more generic form. Signed-off-by: Robin Holt Cc: Russell King Cc: Russ Anderson Cc: Robin Holt Cc: H. Peter Anvin Cc: Guan Xuetao Acked-by: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/arm/include/asm/mach/arch.h b/arch/arm/include/asm/mach/arch.h index 75bf079..fdf62b4 100644 --- a/arch/arm/include/asm/mach/arch.h +++ b/arch/arm/include/asm/mach/arch.h @@ -11,6 +11,7 @@ #include #ifndef __ASSEMBLY__ +#include struct tag; struct meminfo; @@ -43,7 +44,7 @@ struct machine_desc { unsigned char reserve_lp0 :1; /* never has lp0 */ unsigned char reserve_lp1 :1; /* never has lp1 */ unsigned char reserve_lp2 :1; /* never has lp2 */ - char restart_mode; /* default restart mode */ + enum reboot_mode reboot_mode; /* default restart mode */ struct smp_operations *smp; /* SMP operations */ bool (*smp_init)(void); void (*fixup)(struct tag *, char **, diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 7f1efcd..2d54406 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -175,14 +175,14 @@ void arch_cpu_idle(void) default_idle(); } -static char reboot_mode = 'h'; +enum reboot_mode reboot_mode = REBOOT_HARD; -int __init reboot_setup(char *str) +static int __init reboot_setup(char *str) { - reboot_mode = str[0]; + if ('s' == str[0]) + reboot_mode = REBOOT_SOFT; return 1; } - __setup("reboot=", reboot_setup); /* diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 9b65327..63af9a7 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -74,7 +74,7 @@ __setup("fpe=", fpe_setup); extern void 
paging_init(struct machine_desc *desc); extern void sanity_check_meminfo(void); -extern void reboot_setup(char *str); +extern enum reboot_mode reboot_mode; extern void setup_dma_zone(struct machine_desc *desc); unsigned int processor_id; @@ -861,8 +861,8 @@ void __init setup_arch(char **cmdline_p) setup_dma_zone(mdesc); - if (mdesc->restart_mode) - reboot_setup(&mdesc->restart_mode); + if (mdesc->reboot_mode != REBOOT_HARD) + reboot_mode = mdesc->reboot_mode; init_mm.start_code = (unsigned long) _text; init_mm.end_code = (unsigned long) _etext; diff --git a/arch/arm/mach-footbridge/cats-hw.c b/arch/arm/mach-footbridge/cats-hw.c index 6987a09..9669cc0 100644 --- a/arch/arm/mach-footbridge/cats-hw.c +++ b/arch/arm/mach-footbridge/cats-hw.c @@ -86,7 +86,7 @@ fixup_cats(struct tag *tags, char **cmdline, struct meminfo *mi) MACHINE_START(CATS, "Chalice-CATS") /* Maintainer: Philip Blundell */ .atag_offset = 0x100, - .restart_mode = 's', + .reboot_mode = REBOOT_SOFT, .fixup = fixup_cats, .map_io = footbridge_map_io, .init_irq = footbridge_init_irq, -- cgit v0.10.2 From 7b6d864b48d95e6ea1df7df64475b9cb9616dcf9 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Mon, 8 Jul 2013 16:01:40 -0700 Subject: reboot: arm: change reboot_mode to use enum reboot_mode Preparing to move the parsing of reboot= to generic kernel code forces the change in reboot_mode handling to use the enum. [akpm@linux-foundation.org: fix arch/arm/mach-socfpga/socfpga.c] Signed-off-by: Robin Holt Cc: Russell King Cc: Russ Anderson Cc: Robin Holt Cc: H. 
Peter Anvin Cc: Guan Xuetao Acked-by: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/arm/include/asm/hardware/iop3xx.h b/arch/arm/include/asm/hardware/iop3xx.h index ed94b1a..423744b 100644 --- a/arch/arm/include/asm/hardware/iop3xx.h +++ b/arch/arm/include/asm/hardware/iop3xx.h @@ -223,11 +223,12 @@ extern int iop3xx_get_init_atu(void); #ifndef __ASSEMBLY__ #include +#include void iop3xx_map_io(void); void iop_init_cp6_handler(void); void iop_init_time(unsigned long tickrate); -void iop3xx_restart(char, const char *); +void iop3xx_restart(enum reboot_mode, const char *); static inline u32 read_tmr0(void) { diff --git a/arch/arm/include/asm/mach/arch.h b/arch/arm/include/asm/mach/arch.h index fdf62b4..441efc4 100644 --- a/arch/arm/include/asm/mach/arch.h +++ b/arch/arm/include/asm/mach/arch.h @@ -59,7 +59,7 @@ struct machine_desc { #ifdef CONFIG_MULTI_IRQ_HANDLER void (*handle_irq)(struct pt_regs *); #endif - void (*restart)(char, const char *); + void (*restart)(enum reboot_mode, const char *); }; /* diff --git a/arch/arm/include/asm/system_misc.h b/arch/arm/include/asm/system_misc.h index 21a23e3..a3d61ad 100644 --- a/arch/arm/include/asm/system_misc.h +++ b/arch/arm/include/asm/system_misc.h @@ -6,11 +6,12 @@ #include #include #include +#include extern void cpu_init(void); void soft_restart(unsigned long); -extern void (*arm_pm_restart)(char str, const char *cmd); +extern void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd); extern void (*arm_pm_idle)(void); #define UDBG_UNDEFINED (1 << 0) diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 2d54406..b7fdd86 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -113,7 +114,7 @@ void soft_restart(unsigned long addr) BUG(); } -static void null_restart(char mode, const char *cmd) +static void null_restart(enum reboot_mode reboot_mode, const char 
*cmd) { } @@ -123,7 +124,7 @@ static void null_restart(char mode, const char *cmd) void (*pm_power_off)(void); EXPORT_SYMBOL(pm_power_off); -void (*arm_pm_restart)(char str, const char *cmd) = null_restart; +void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd) = null_restart; EXPORT_SYMBOL_GPL(arm_pm_restart); /* diff --git a/arch/arm/mach-at91/at91rm9200.c b/arch/arm/mach-at91/at91rm9200.c index 9eb5743..4aad93d 100644 --- a/arch/arm/mach-at91/at91rm9200.c +++ b/arch/arm/mach-at91/at91rm9200.c @@ -11,6 +11,7 @@ */ #include +#include #include #include @@ -304,7 +305,7 @@ static void at91rm9200_idle(void) at91_pmc_write(AT91_PMC_SCDR, AT91_PMC_PCK); } -static void at91rm9200_restart(char mode, const char *cmd) +static void at91rm9200_restart(enum reboot_mode reboot_mode, const char *cmd) { /* * Perform a hardware reset with the use of the Watchdog timer. diff --git a/arch/arm/mach-at91/generic.h b/arch/arm/mach-at91/generic.h index f6de36a..dc6e2f5 100644 --- a/arch/arm/mach-at91/generic.h +++ b/arch/arm/mach-at91/generic.h @@ -10,6 +10,7 @@ #include #include +#include /* Map io */ extern void __init at91_map_io(void); @@ -60,8 +61,8 @@ extern void at91sam9_idle(void); /* reset */ extern void at91_ioremap_rstc(u32 base_addr); -extern void at91sam9_alt_restart(char, const char *); -extern void at91sam9g45_restart(char, const char *); +extern void at91sam9_alt_restart(enum reboot_mode, const char *); +extern void at91sam9g45_restart(enum reboot_mode, const char *); /* shutdown */ extern void at91_ioremap_shdwc(u32 base_addr); diff --git a/arch/arm/mach-bcm2835/bcm2835.c b/arch/arm/mach-bcm2835/bcm2835.c index 740fa9e..40686d7 100644 --- a/arch/arm/mach-bcm2835/bcm2835.c +++ b/arch/arm/mach-bcm2835/bcm2835.c @@ -53,7 +53,7 @@ static void bcm2835_setup_restart(void) WARN(!wdt_regs, "failed to remap watchdog regs"); } -static void bcm2835_restart(char mode, const char *cmd) +static void bcm2835_restart(enum reboot_mode mode, const char *cmd) { u32 val; 
@@ -91,7 +91,7 @@ static void bcm2835_power_off(void) writel_relaxed(val, wdt_regs + PM_RSTS); /* Continue with normal reset mechanism */ - bcm2835_restart(0, ""); + bcm2835_restart(REBOOT_HARD, ""); } static struct map_desc io_map __initdata = { diff --git a/arch/arm/mach-clps711x/common.c b/arch/arm/mach-clps711x/common.c index f6d1746..4ca2f3c 100644 --- a/arch/arm/mach-clps711x/common.c +++ b/arch/arm/mach-clps711x/common.c @@ -384,7 +384,7 @@ void __init clps711x_timer_init(void) setup_irq(IRQ_TC2OI, &clps711x_timer_irq); } -void clps711x_restart(char mode, const char *cmd) +void clps711x_restart(enum reboot_mode mode, const char *cmd) { soft_restart(0); } diff --git a/arch/arm/mach-clps711x/common.h b/arch/arm/mach-clps711x/common.h index 2a22f4c..9a6767b 100644 --- a/arch/arm/mach-clps711x/common.h +++ b/arch/arm/mach-clps711x/common.h @@ -4,6 +4,8 @@ * Common bits. */ +#include + #define CLPS711X_NR_IRQS (33) #define CLPS711X_NR_GPIO (4 * 8 + 3) #define CLPS711X_GPIO(prt, bit) ((prt) * 8 + (bit)) @@ -12,5 +14,5 @@ extern void clps711x_map_io(void); extern void clps711x_init_irq(void); extern void clps711x_timer_init(void); extern void clps711x_handle_irq(struct pt_regs *regs); -extern void clps711x_restart(char mode, const char *cmd); +extern void clps711x_restart(enum reboot_mode mode, const char *cmd); extern void clps711x_init_early(void); diff --git a/arch/arm/mach-cns3xxx/core.h b/arch/arm/mach-cns3xxx/core.h index b23b17b..5218b61 100644 --- a/arch/arm/mach-cns3xxx/core.h +++ b/arch/arm/mach-cns3xxx/core.h @@ -11,6 +11,8 @@ #ifndef __CNS3XXX_CORE_H #define __CNS3XXX_CORE_H +#include + extern void cns3xxx_timer_init(void); #ifdef CONFIG_CACHE_L2X0 @@ -22,6 +24,6 @@ static inline void cns3xxx_l2x0_init(void) {} void __init cns3xxx_map_io(void); void __init cns3xxx_init_irq(void); void cns3xxx_power_off(void); -void cns3xxx_restart(char, const char *); +void cns3xxx_restart(enum reboot_mode, const char *); #endif /* __CNS3XXX_CORE_H */ diff --git 
a/arch/arm/mach-cns3xxx/pm.c b/arch/arm/mach-cns3xxx/pm.c index 79e3d47..fb38c72 100644 --- a/arch/arm/mach-cns3xxx/pm.c +++ b/arch/arm/mach-cns3xxx/pm.c @@ -89,7 +89,7 @@ void cns3xxx_pwr_soft_rst(unsigned int block) } EXPORT_SYMBOL(cns3xxx_pwr_soft_rst); -void cns3xxx_restart(char mode, const char *cmd) +void cns3xxx_restart(enum reboot_mode mode, const char *cmd) { /* * To reset, we hit the on-board reset register diff --git a/arch/arm/mach-davinci/devices-da8xx.c b/arch/arm/mach-davinci/devices-da8xx.c index eb254fe..71a46a3 100644 --- a/arch/arm/mach-davinci/devices-da8xx.c +++ b/arch/arm/mach-davinci/devices-da8xx.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -366,7 +367,7 @@ static struct platform_device da8xx_wdt_device = { .resource = da8xx_watchdog_resources, }; -void da8xx_restart(char mode, const char *cmd) +void da8xx_restart(enum reboot_mode mode, const char *cmd) { struct device *dev; diff --git a/arch/arm/mach-davinci/devices.c b/arch/arm/mach-davinci/devices.c index 90b83d0..111573c 100644 --- a/arch/arm/mach-davinci/devices.c +++ b/arch/arm/mach-davinci/devices.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -307,7 +308,7 @@ struct platform_device davinci_wdt_device = { .resource = wdt_resources, }; -void davinci_restart(char mode, const char *cmd) +void davinci_restart(enum reboot_mode mode, const char *cmd) { davinci_watchdog_reset(&davinci_wdt_device); } diff --git a/arch/arm/mach-davinci/include/mach/common.h b/arch/arm/mach-davinci/include/mach/common.h index b124b77..cce316b 100644 --- a/arch/arm/mach-davinci/include/mach/common.h +++ b/arch/arm/mach-davinci/include/mach/common.h @@ -14,6 +14,7 @@ #include #include +#include extern void davinci_timer_init(void); @@ -81,7 +82,7 @@ extern struct davinci_soc_info davinci_soc_info; extern void davinci_common_init(struct davinci_soc_info *soc_info); extern void davinci_init_ide(void); -void davinci_restart(char mode, const char *cmd); 
+void davinci_restart(enum reboot_mode mode, const char *cmd); void davinci_init_late(void); #ifdef CONFIG_DAVINCI_RESET_CLOCKS diff --git a/arch/arm/mach-davinci/include/mach/da8xx.h b/arch/arm/mach-davinci/include/mach/da8xx.h index 3c797e2..7b41a5e 100644 --- a/arch/arm/mach-davinci/include/mach/da8xx.h +++ b/arch/arm/mach-davinci/include/mach/da8xx.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -106,7 +107,7 @@ int da850_register_vpif_display (struct vpif_display_config *display_config); int da850_register_vpif_capture (struct vpif_capture_config *capture_config); -void da8xx_restart(char mode, const char *cmd); +void da8xx_restart(enum reboot_mode mode, const char *cmd); void da8xx_rproc_reserve_cma(void); int da8xx_register_rproc(void); diff --git a/arch/arm/mach-davinci/include/mach/tnetv107x.h b/arch/arm/mach-davinci/include/mach/tnetv107x.h index 366e975..16314c6 100644 --- a/arch/arm/mach-davinci/include/mach/tnetv107x.h +++ b/arch/arm/mach-davinci/include/mach/tnetv107x.h @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -54,7 +55,7 @@ extern struct platform_device tnetv107x_serial_device; extern void tnetv107x_init(void); extern void tnetv107x_devices_init(struct tnetv107x_device_info *); extern void tnetv107x_irq_init(void); -void tnetv107x_restart(char mode, const char *cmd); +void tnetv107x_restart(enum reboot_mode mode, const char *cmd); #endif diff --git a/arch/arm/mach-davinci/tnetv107x.c b/arch/arm/mach-davinci/tnetv107x.c index 3b2a70d..4545667 100644 --- a/arch/arm/mach-davinci/tnetv107x.c +++ b/arch/arm/mach-davinci/tnetv107x.c @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -730,7 +731,7 @@ static void tnetv107x_watchdog_reset(struct platform_device *pdev) __raw_writel(1, &regs->kick); } -void tnetv107x_restart(char mode, const char *cmd) +void tnetv107x_restart(enum reboot_mode mode, const char *cmd) { tnetv107x_watchdog_reset(&tnetv107x_wdt_device); } diff --git 
a/arch/arm/mach-dove/common.c b/arch/arm/mach-dove/common.c index 2a9443d..00247c7 100644 --- a/arch/arm/mach-dove/common.c +++ b/arch/arm/mach-dove/common.c @@ -381,7 +381,7 @@ void __init dove_init(void) dove_xor1_init(); } -void dove_restart(char mode, const char *cmd) +void dove_restart(enum reboot_mode mode, const char *cmd) { /* * Enable soft reset to assert RSTOUTn. diff --git a/arch/arm/mach-dove/common.h b/arch/arm/mach-dove/common.h index e863479..1d72522 100644 --- a/arch/arm/mach-dove/common.h +++ b/arch/arm/mach-dove/common.h @@ -11,6 +11,8 @@ #ifndef __ARCH_DOVE_COMMON_H #define __ARCH_DOVE_COMMON_H +#include + struct mv643xx_eth_platform_data; struct mv_sata_platform_data; @@ -42,6 +44,6 @@ void dove_spi1_init(void); void dove_i2c_init(void); void dove_sdio0_init(void); void dove_sdio1_init(void); -void dove_restart(char, const char *); +void dove_restart(enum reboot_mode, const char *); #endif diff --git a/arch/arm/mach-ebsa110/core.c b/arch/arm/mach-ebsa110/core.c index 41d2d90..68ac934 100644 --- a/arch/arm/mach-ebsa110/core.c +++ b/arch/arm/mach-ebsa110/core.c @@ -311,7 +311,7 @@ static int __init ebsa110_init(void) arch_initcall(ebsa110_init); -static void ebsa110_restart(char mode, const char *cmd) +static void ebsa110_restart(enum reboot_mode mode, const char *cmd) { soft_restart(0x80000000); } diff --git a/arch/arm/mach-ep93xx/core.c b/arch/arm/mach-ep93xx/core.c index c49ed3d..df8612f 100644 --- a/arch/arm/mach-ep93xx/core.c +++ b/arch/arm/mach-ep93xx/core.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -921,7 +922,7 @@ void __init ep93xx_init_devices(void) gpio_led_register_device(-1, &ep93xx_led_data); } -void ep93xx_restart(char mode, const char *cmd) +void ep93xx_restart(enum reboot_mode mode, const char *cmd) { /* * Set then clear the SWRST bit to initiate a software reset diff --git a/arch/arm/mach-ep93xx/include/mach/platform.h b/arch/arm/mach-ep93xx/include/mach/platform.h index a14e1b3..e256e0b 100644 
--- a/arch/arm/mach-ep93xx/include/mach/platform.h +++ b/arch/arm/mach-ep93xx/include/mach/platform.h @@ -4,6 +4,8 @@ #ifndef __ASSEMBLY__ +#include + struct i2c_gpio_platform_data; struct i2c_board_info; struct spi_board_info; @@ -55,7 +57,7 @@ void ep93xx_ide_release_gpio(struct platform_device *pdev); void ep93xx_init_devices(void); extern void ep93xx_timer_init(void); -void ep93xx_restart(char, const char *); +void ep93xx_restart(enum reboot_mode, const char *); void ep93xx_init_late(void); #ifdef CONFIG_CRUNCH diff --git a/arch/arm/mach-exynos/common.c b/arch/arm/mach-exynos/common.c index 2c655db..164685b 100644 --- a/arch/arm/mach-exynos/common.c +++ b/arch/arm/mach-exynos/common.c @@ -285,12 +285,12 @@ static struct map_desc exynos5440_iodesc0[] __initdata = { }, }; -void exynos4_restart(char mode, const char *cmd) +void exynos4_restart(enum reboot_mode mode, const char *cmd) { __raw_writel(0x1, S5P_SWRESET); } -void exynos5_restart(char mode, const char *cmd) +void exynos5_restart(enum reboot_mode mode, const char *cmd) { struct device_node *np; u32 val; diff --git a/arch/arm/mach-exynos/common.h b/arch/arm/mach-exynos/common.h index 38d45fd..3e156bc 100644 --- a/arch/arm/mach-exynos/common.h +++ b/arch/arm/mach-exynos/common.h @@ -12,6 +12,7 @@ #ifndef __ARCH_ARM_MACH_EXYNOS_COMMON_H #define __ARCH_ARM_MACH_EXYNOS_COMMON_H +#include #include void mct_init(void __iomem *base, int irq_g0, int irq_l0, int irq_l1); @@ -20,8 +21,8 @@ extern unsigned long xxti_f, xusbxti_f; struct map_desc; void exynos_init_io(void); -void exynos4_restart(char mode, const char *cmd); -void exynos5_restart(char mode, const char *cmd); +void exynos4_restart(enum reboot_mode mode, const char *cmd); +void exynos5_restart(enum reboot_mode mode, const char *cmd); void exynos_init_late(void); /* ToDo: remove these after migrating legacy exynos4 platforms to dt */ diff --git a/arch/arm/mach-footbridge/common.c b/arch/arm/mach-footbridge/common.c index a42b369..2739ca2 100644 --- 
a/arch/arm/mach-footbridge/common.c +++ b/arch/arm/mach-footbridge/common.c @@ -198,9 +198,9 @@ void __init footbridge_map_io(void) } } -void footbridge_restart(char mode, const char *cmd) +void footbridge_restart(enum reboot_mode mode, const char *cmd) { - if (mode == 's') { + if (mode == REBOOT_SOFT) { /* Jump into the ROM */ soft_restart(0x41000000); } else { diff --git a/arch/arm/mach-footbridge/common.h b/arch/arm/mach-footbridge/common.h index a846e50..56607b3 100644 --- a/arch/arm/mach-footbridge/common.h +++ b/arch/arm/mach-footbridge/common.h @@ -1,3 +1,4 @@ +#include extern void footbridge_timer_init(void); extern void isa_timer_init(void); @@ -8,4 +9,4 @@ extern void footbridge_map_io(void); extern void footbridge_init_irq(void); extern void isa_init_irq(unsigned int irq); -extern void footbridge_restart(char, const char *); +extern void footbridge_restart(enum reboot_mode, const char *); diff --git a/arch/arm/mach-footbridge/netwinder-hw.c b/arch/arm/mach-footbridge/netwinder-hw.c index 90ea23f..1fd2cf0 100644 --- a/arch/arm/mach-footbridge/netwinder-hw.c +++ b/arch/arm/mach-footbridge/netwinder-hw.c @@ -634,9 +634,9 @@ fixup_netwinder(struct tag *tags, char **cmdline, struct meminfo *mi) #endif } -static void netwinder_restart(char mode, const char *cmd) +static void netwinder_restart(enum reboot_mode mode, const char *cmd) { - if (mode == 's') { + if (mode == REBOOT_SOFT) { /* Jump into the ROM */ soft_restart(0x41000000); } else { diff --git a/arch/arm/mach-highbank/core.h b/arch/arm/mach-highbank/core.h index 3f65206..aea1ec5 100644 --- a/arch/arm/mach-highbank/core.h +++ b/arch/arm/mach-highbank/core.h @@ -1,8 +1,10 @@ #ifndef __HIGHBANK_CORE_H #define __HIGHBANK_CORE_H +#include + extern void highbank_set_cpu_jump(int cpu, void *jump_addr); -extern void highbank_restart(char, const char *); +extern void highbank_restart(enum reboot_mode, const char *); extern void __iomem *scu_base_addr; #ifdef CONFIG_PM_SLEEP diff --git 
a/arch/arm/mach-highbank/system.c b/arch/arm/mach-highbank/system.c index 37d8384..2df5870 100644 --- a/arch/arm/mach-highbank/system.c +++ b/arch/arm/mach-highbank/system.c @@ -15,13 +15,14 @@ */ #include #include +#include #include "core.h" #include "sysregs.h" -void highbank_restart(char mode, const char *cmd) +void highbank_restart(enum reboot_mode mode, const char *cmd) { - if (mode == 'h') + if (mode == REBOOT_HARD) highbank_set_pwr_hard_reset(); else highbank_set_pwr_soft_reset(); diff --git a/arch/arm/mach-imx/common.h b/arch/arm/mach-imx/common.h index ee78847..cb6c838 100644 --- a/arch/arm/mach-imx/common.h +++ b/arch/arm/mach-imx/common.h @@ -11,6 +11,8 @@ #ifndef __ASM_ARCH_MXC_COMMON_H__ #define __ASM_ARCH_MXC_COMMON_H__ +#include + struct platform_device; struct pt_regs; struct clk; @@ -71,7 +73,7 @@ extern int mx53_clocks_init_dt(void); extern struct platform_device *mxc_register_gpio(char *name, int id, resource_size_t iobase, resource_size_t iosize, int irq, int irq_high); extern void mxc_set_cpu_type(unsigned int type); -extern void mxc_restart(char, const char *); +extern void mxc_restart(enum reboot_mode, const char *); extern void mxc_arch_reset_init(void __iomem *); extern void mxc_arch_reset_init_dt(void); extern int mx53_revision(void); diff --git a/arch/arm/mach-imx/mach-imx6q.c b/arch/arm/mach-imx/mach-imx6q.c index f596522..7be13f8 100644 --- a/arch/arm/mach-imx/mach-imx6q.c +++ b/arch/arm/mach-imx/mach-imx6q.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -67,7 +68,7 @@ static void __init imx6q_init_revision(void) mxc_set_cpu_type(rev >> 16 & 0xff); } -static void imx6q_restart(char mode, const char *cmd) +static void imx6q_restart(enum reboot_mode mode, const char *cmd) { struct device_node *np; void __iomem *wdog_base; diff --git a/arch/arm/mach-imx/system.c b/arch/arm/mach-imx/system.c index 7cdc79a..6fe81bb 100644 --- a/arch/arm/mach-imx/system.c +++ b/arch/arm/mach-imx/system.c @@ -37,7 +37,7 
@@ static struct clk *wdog_clk; /* * Reset the system. It is called by machine_restart(). */ -void mxc_restart(char mode, const char *cmd) +void mxc_restart(enum reboot_mode mode, const char *cmd) { unsigned int wcr_enable; diff --git a/arch/arm/mach-integrator/common.h b/arch/arm/mach-integrator/common.h index 72516658b..ad0ac55 100644 --- a/arch/arm/mach-integrator/common.h +++ b/arch/arm/mach-integrator/common.h @@ -1,7 +1,8 @@ +#include #include extern struct amba_pl010_data ap_uart_data; void integrator_init_early(void); int integrator_init(bool is_cp); void integrator_reserve(void); -void integrator_restart(char, const char *); +void integrator_restart(enum reboot_mode, const char *); void integrator_init_sysfs(struct device *parent, u32 id); diff --git a/arch/arm/mach-integrator/core.c b/arch/arm/mach-integrator/core.c index 81461d2..4cdfd73 100644 --- a/arch/arm/mach-integrator/core.c +++ b/arch/arm/mach-integrator/core.c @@ -124,7 +124,7 @@ void __init integrator_reserve(void) /* * To reset, we hit the on-board reset register in the system FPGA */ -void integrator_restart(char mode, const char *cmd) +void integrator_restart(enum reboot_mode mode, const char *cmd) { cm_control(CM_CTRL_RESET, CM_CTRL_RESET); } diff --git a/arch/arm/mach-iop13xx/include/mach/iop13xx.h b/arch/arm/mach-iop13xx/include/mach/iop13xx.h index 7480f58..17b4027 100644 --- a/arch/arm/mach-iop13xx/include/mach/iop13xx.h +++ b/arch/arm/mach-iop13xx/include/mach/iop13xx.h @@ -2,6 +2,9 @@ #define _IOP13XX_HW_H_ #ifndef __ASSEMBLY__ + +#include + /* The ATU offsets can change based on the strapping */ extern u32 iop13xx_atux_pmmr_offset; extern u32 iop13xx_atue_pmmr_offset; @@ -11,7 +14,7 @@ void iop13xx_map_io(void); void iop13xx_platform_init(void); void iop13xx_add_tpmi_devices(void); void iop13xx_init_irq(void); -void iop13xx_restart(char, const char *); +void iop13xx_restart(enum reboot_mode, const char *); /* CPUID CP6 R0 Page 0 */ static inline int iop13xx_cpu_id(void) diff --git 
a/arch/arm/mach-iop13xx/setup.c b/arch/arm/mach-iop13xx/setup.c index 1c5bd76..96e6c7a 100644 --- a/arch/arm/mach-iop13xx/setup.c +++ b/arch/arm/mach-iop13xx/setup.c @@ -594,7 +594,7 @@ __setup("iop13xx_init_adma", iop13xx_init_adma_setup); __setup("iop13xx_init_uart", iop13xx_init_uart_setup); __setup("iop13xx_init_i2c", iop13xx_init_i2c_setup); -void iop13xx_restart(char mode, const char *cmd) +void iop13xx_restart(enum reboot_mode mode, const char *cmd) { /* * Reset the internal bus (warning both cores are reset) diff --git a/arch/arm/mach-iop32x/n2100.c b/arch/arm/mach-iop32x/n2100.c index ea0984a..0691443 100644 --- a/arch/arm/mach-iop32x/n2100.c +++ b/arch/arm/mach-iop32x/n2100.c @@ -286,7 +286,7 @@ static void n2100_power_off(void) ; } -static void n2100_restart(char mode, const char *cmd) +static void n2100_restart(enum reboot_mode mode, const char *cmd) { gpio_line_set(N2100_HARDWARE_RESET, GPIO_LOW); gpio_line_config(N2100_HARDWARE_RESET, GPIO_OUT); diff --git a/arch/arm/mach-ixp4xx/common.c b/arch/arm/mach-ixp4xx/common.c index 1f6c1fb..5327dec 100644 --- a/arch/arm/mach-ixp4xx/common.c +++ b/arch/arm/mach-ixp4xx/common.c @@ -531,9 +531,9 @@ static void __init ixp4xx_clockevent_init(void) 0xf, 0xfffffffe); } -void ixp4xx_restart(char mode, const char *cmd) +void ixp4xx_restart(enum reboot_mode mode, const char *cmd) { - if ( 1 && mode == 's') { + if ( 1 && mode == REBOOT_SOFT) { /* Jump into ROM at address 0 */ soft_restart(0); } else { diff --git a/arch/arm/mach-ixp4xx/dsmg600-setup.c b/arch/arm/mach-ixp4xx/dsmg600-setup.c index 5d413f8..686ef34 100644 --- a/arch/arm/mach-ixp4xx/dsmg600-setup.c +++ b/arch/arm/mach-ixp4xx/dsmg600-setup.c @@ -27,6 +27,7 @@ #include #include +#include #include #include #include diff --git a/arch/arm/mach-ixp4xx/include/mach/platform.h b/arch/arm/mach-ixp4xx/include/mach/platform.h index db5afb6..4c4c6a6 100644 --- a/arch/arm/mach-ixp4xx/include/mach/platform.h +++ b/arch/arm/mach-ixp4xx/include/mach/platform.h @@ -13,6 
+13,8 @@ #ifndef __ASSEMBLY__ +#include + #include #ifndef __ARMEB__ @@ -123,7 +125,7 @@ extern void ixp4xx_init_early(void); extern void ixp4xx_init_irq(void); extern void ixp4xx_sys_init(void); extern void ixp4xx_timer_init(void); -extern void ixp4xx_restart(char, const char *); +extern void ixp4xx_restart(enum reboot_mode, const char *); extern void ixp4xx_pci_preinit(void); struct pci_sys_data; extern int ixp4xx_setup(int nr, struct pci_sys_data *sys); diff --git a/arch/arm/mach-kirkwood/common.c b/arch/arm/mach-kirkwood/common.c index 7c72c72..e9238b5 100644 --- a/arch/arm/mach-kirkwood/common.c +++ b/arch/arm/mach-kirkwood/common.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -722,7 +723,7 @@ void __init kirkwood_init(void) #endif } -void kirkwood_restart(char mode, const char *cmd) +void kirkwood_restart(enum reboot_mode mode, const char *cmd) { /* * Enable soft reset to assert RSTOUTn. diff --git a/arch/arm/mach-kirkwood/common.h b/arch/arm/mach-kirkwood/common.h index 1c09f3f..fcf3ba6 100644 --- a/arch/arm/mach-kirkwood/common.h +++ b/arch/arm/mach-kirkwood/common.h @@ -11,6 +11,8 @@ #ifndef __ARCH_KIRKWOOD_COMMON_H #define __ARCH_KIRKWOOD_COMMON_H +#include + struct dsa_platform_data; struct mv643xx_eth_platform_data; struct mv_sata_platform_data; @@ -53,7 +55,7 @@ void kirkwood_audio_init(void); void kirkwood_cpuidle_init(void); void kirkwood_cpufreq_init(void); -void kirkwood_restart(char, const char *); +void kirkwood_restart(enum reboot_mode, const char *); void kirkwood_clk_init(void); /* board init functions for boards not fully converted to fdt */ diff --git a/arch/arm/mach-ks8695/generic.h b/arch/arm/mach-ks8695/generic.h index 6e97ce4..43253f8 100644 --- a/arch/arm/mach-ks8695/generic.h +++ b/arch/arm/mach-ks8695/generic.h @@ -12,5 +12,5 @@ extern __init void ks8695_map_io(void); extern __init void ks8695_init_irq(void); -extern void ks8695_restart(char, const char *); +extern void ks8695_restart(enum 
reboot_mode, const char *); extern void ks8695_timer_init(void); diff --git a/arch/arm/mach-ks8695/time.c b/arch/arm/mach-ks8695/time.c index c272a386..426c976 100644 --- a/arch/arm/mach-ks8695/time.c +++ b/arch/arm/mach-ks8695/time.c @@ -154,11 +154,11 @@ void __init ks8695_timer_init(void) setup_irq(KS8695_IRQ_TIMER1, &ks8695_timer_irq); } -void ks8695_restart(char mode, const char *cmd) +void ks8695_restart(enum reboot_mode reboot_mode, const char *cmd) { unsigned int reg; - if (mode == 's') + if (reboot_mode == REBOOT_SOFT) soft_restart(0); /* disable timer0 */ diff --git a/arch/arm/mach-lpc32xx/common.c b/arch/arm/mach-lpc32xx/common.c index 0d4db8c..d7aa54c 100644 --- a/arch/arm/mach-lpc32xx/common.c +++ b/arch/arm/mach-lpc32xx/common.c @@ -207,11 +207,11 @@ void __init lpc32xx_map_io(void) iotable_init(lpc32xx_io_desc, ARRAY_SIZE(lpc32xx_io_desc)); } -void lpc23xx_restart(char mode, const char *cmd) +void lpc23xx_restart(enum reboot_mode mode, const char *cmd) { switch (mode) { - case 's': - case 'h': + case REBOOT_SOFT: + case REBOOT_HARD: lpc32xx_watchdog_reset(); break; diff --git a/arch/arm/mach-lpc32xx/common.h b/arch/arm/mach-lpc32xx/common.h index e0b2606..1cd8853 100644 --- a/arch/arm/mach-lpc32xx/common.h +++ b/arch/arm/mach-lpc32xx/common.h @@ -21,6 +21,7 @@ #include #include +#include /* * Other arch specific structures and functions @@ -29,7 +30,7 @@ extern void lpc32xx_timer_init(void); extern void __init lpc32xx_init_irq(void); extern void __init lpc32xx_map_io(void); extern void __init lpc32xx_serial_init(void); -extern void lpc23xx_restart(char, const char *); +extern void lpc23xx_restart(enum reboot_mode, const char *); /* diff --git a/arch/arm/mach-mmp/common.c b/arch/arm/mach-mmp/common.c index 9292b79..c03b4ab 100644 --- a/arch/arm/mach-mmp/common.c +++ b/arch/arm/mach-mmp/common.c @@ -47,7 +47,7 @@ void __init mmp_map_io(void) mmp_chip_id = __raw_readl(MMP_CHIPID); } -void mmp_restart(char mode, const char *cmd) +void mmp_restart(enum 
reboot_mode mode, const char *cmd) { soft_restart(0); } diff --git a/arch/arm/mach-mmp/common.h b/arch/arm/mach-mmp/common.h index 0bdc50b..991d7e9 100644 --- a/arch/arm/mach-mmp/common.h +++ b/arch/arm/mach-mmp/common.h @@ -1,10 +1,11 @@ +#include #define ARRAY_AND_SIZE(x) (x), ARRAY_SIZE(x) extern void timer_init(int irq); extern void __init icu_init_irq(void); extern void __init mmp_map_io(void); -extern void mmp_restart(char, const char *); +extern void mmp_restart(enum reboot_mode, const char *); extern void __init pxa168_clk_init(void); extern void __init pxa910_clk_init(void); extern void __init mmp2_clk_init(void); diff --git a/arch/arm/mach-mmp/include/mach/pxa168.h b/arch/arm/mach-mmp/include/mach/pxa168.h index 7ed1df2..459c2d0 100644 --- a/arch/arm/mach-mmp/include/mach/pxa168.h +++ b/arch/arm/mach-mmp/include/mach/pxa168.h @@ -1,9 +1,11 @@ #ifndef __ASM_MACH_PXA168_H #define __ASM_MACH_PXA168_H +#include + extern void pxa168_timer_init(void); extern void __init pxa168_init_irq(void); -extern void pxa168_restart(char, const char *); +extern void pxa168_restart(enum reboot_mode, const char *); extern void pxa168_clear_keypad_wakeup(void); #include diff --git a/arch/arm/mach-mmp/pxa168.c b/arch/arm/mach-mmp/pxa168.c index a30dcf3..144e997 100644 --- a/arch/arm/mach-mmp/pxa168.c +++ b/arch/arm/mach-mmp/pxa168.c @@ -172,7 +172,7 @@ int __init pxa168_add_usb_host(struct mv_usb_platform_data *pdata) return platform_device_register(&pxa168_device_usb_host); } -void pxa168_restart(char mode, const char *cmd) +void pxa168_restart(enum reboot_mode mode, const char *cmd) { soft_restart(0xffff0000); } diff --git a/arch/arm/mach-mv78xx0/common.c b/arch/arm/mach-mv78xx0/common.c index 749a7f8..75062ef 100644 --- a/arch/arm/mach-mv78xx0/common.c +++ b/arch/arm/mach-mv78xx0/common.c @@ -413,7 +413,7 @@ void __init mv78xx0_init(void) clk_init(); } -void mv78xx0_restart(char mode, const char *cmd) +void mv78xx0_restart(enum reboot_mode mode, const char *cmd) { /* * 
Enable soft reset to assert RSTOUTn. diff --git a/arch/arm/mach-mv78xx0/common.h b/arch/arm/mach-mv78xx0/common.h index 5e9485b..6889af2 100644 --- a/arch/arm/mach-mv78xx0/common.h +++ b/arch/arm/mach-mv78xx0/common.h @@ -11,6 +11,8 @@ #ifndef __ARCH_MV78XX0_COMMON_H #define __ARCH_MV78XX0_COMMON_H +#include + struct mv643xx_eth_platform_data; struct mv_sata_platform_data; @@ -45,7 +47,7 @@ void mv78xx0_uart1_init(void); void mv78xx0_uart2_init(void); void mv78xx0_uart3_init(void); void mv78xx0_i2c_init(void); -void mv78xx0_restart(char, const char *); +void mv78xx0_restart(enum reboot_mode, const char *); extern void mv78xx0_timer_init(void); diff --git a/arch/arm/mach-mvebu/common.h b/arch/arm/mach-mvebu/common.h index 98defd5..e366010 100644 --- a/arch/arm/mach-mvebu/common.h +++ b/arch/arm/mach-mvebu/common.h @@ -17,7 +17,9 @@ #define ARMADA_XP_MAX_CPUS 4 -void mvebu_restart(char mode, const char *cmd); +#include + +void mvebu_restart(enum reboot_mode mode, const char *cmd); void armada_370_xp_init_irq(void); void armada_370_xp_handle_irq(struct pt_regs *regs); diff --git a/arch/arm/mach-mvebu/system-controller.c b/arch/arm/mach-mvebu/system-controller.c index b8079df..f875124 100644 --- a/arch/arm/mach-mvebu/system-controller.c +++ b/arch/arm/mach-mvebu/system-controller.c @@ -26,6 +26,7 @@ #include #include #include +#include static void __iomem *system_controller_base; @@ -63,7 +64,7 @@ static struct of_device_id of_system_controller_table[] = { { /* end of list */ }, }; -void mvebu_restart(char mode, const char *cmd) +void mvebu_restart(enum reboot_mode mode, const char *cmd) { if (!system_controller_base) { pr_err("Cannot restart, system-controller not available: check the device tree\n"); diff --git a/arch/arm/mach-mxs/mach-mxs.c b/arch/arm/mach-mxs/mach-mxs.c index 7fa611c..6298adb 100644 --- a/arch/arm/mach-mxs/mach-mxs.c +++ b/arch/arm/mach-mxs/mach-mxs.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -500,7 +501,7 
@@ static void __init mxs_machine_init(void) /* * Reset the system. It is called by machine_restart(). */ -static void mxs_restart(char mode, const char *cmd) +static void mxs_restart(enum reboot_mode mode, const char *cmd) { struct device_node *np; void __iomem *reset_addr; diff --git a/arch/arm/mach-netx/generic.c b/arch/arm/mach-netx/generic.c index 1504b68..db25b0c 100644 --- a/arch/arm/mach-netx/generic.c +++ b/arch/arm/mach-netx/generic.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -187,7 +188,7 @@ static int __init netx_init(void) subsys_initcall(netx_init); -void netx_restart(char mode, const char *cmd) +void netx_restart(enum reboot_mode mode, const char *cmd) { writel(NETX_SYSTEM_RES_CR_FIRMW_RES_EN | NETX_SYSTEM_RES_CR_FIRMW_RES, NETX_SYSTEM_RES_CR); diff --git a/arch/arm/mach-netx/generic.h b/arch/arm/mach-netx/generic.h index 768b26b..bb2ce47 100644 --- a/arch/arm/mach-netx/generic.h +++ b/arch/arm/mach-netx/generic.h @@ -17,8 +17,10 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include + extern void __init netx_map_io(void); extern void __init netx_init_irq(void); -extern void netx_restart(char, const char *); +extern void netx_restart(enum reboot_mode, const char *); extern void netx_timer_init(void); diff --git a/arch/arm/mach-nomadik/cpu-8815.c b/arch/arm/mach-nomadik/cpu-8815.c index 2df209e..13e0df9 100644 --- a/arch/arm/mach-nomadik/cpu-8815.c +++ b/arch/arm/mach-nomadik/cpu-8815.c @@ -103,7 +103,7 @@ static void __init cpu8815_map_io(void) iotable_init(cpu8815_io_desc, ARRAY_SIZE(cpu8815_io_desc)); } -static void cpu8815_restart(char mode, const char *cmd) +static void cpu8815_restart(enum reboot_mode mode, const char *cmd) { void __iomem *srcbase = ioremap(NOMADIK_SRC_BASE, SZ_4K); diff --git a/arch/arm/mach-omap1/board-voiceblue.c b/arch/arm/mach-omap1/board-voiceblue.c index 6c116e1..4677a9c 100644 --- a/arch/arm/mach-omap1/board-voiceblue.c +++ 
b/arch/arm/mach-omap1/board-voiceblue.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -215,7 +216,7 @@ void voiceblue_wdt_ping(void) gpio_set_value(0, wdt_gpio_state); } -static void voiceblue_restart(char mode, const char *cmd) +static void voiceblue_restart(enum reboot_mode mode, const char *cmd) { /* * Workaround for 5912/1611b bug mentioned in sprz209d.pdf p. 28 diff --git a/arch/arm/mach-omap1/common.h b/arch/arm/mach-omap1/common.h index 14f7e99..abec019 100644 --- a/arch/arm/mach-omap1/common.h +++ b/arch/arm/mach-omap1/common.h @@ -28,6 +28,7 @@ #include #include +#include #include @@ -70,7 +71,7 @@ static inline int omap_serial_wakeup_init(void) void omap1_init_early(void); void omap1_init_irq(void); void omap1_init_late(void); -void omap1_restart(char, const char *); +void omap1_restart(enum reboot_mode, const char *); extern void __init omap_check_revision(void); diff --git a/arch/arm/mach-omap1/reset.c b/arch/arm/mach-omap1/reset.c index 5eebd7e..72bf4bf 100644 --- a/arch/arm/mach-omap1/reset.c +++ b/arch/arm/mach-omap1/reset.c @@ -3,6 +3,7 @@ */ #include #include +#include #include @@ -22,7 +23,7 @@ #define OMAP_EXTWARM_RST_SRC_ID_SHIFT 5 -void omap1_restart(char mode, const char *cmd) +void omap1_restart(enum reboot_mode mode, const char *cmd) { /* * Workaround for 5912/1611b bug mentioned in sprz209d.pdf p. 28 diff --git a/arch/arm/mach-omap2/am33xx-restart.c b/arch/arm/mach-omap2/am33xx-restart.c index 88e4fa8..1eae962 100644 --- a/arch/arm/mach-omap2/am33xx-restart.c +++ b/arch/arm/mach-omap2/am33xx-restart.c @@ -6,6 +6,7 @@ * published by the Free Software Foundation. */ #include +#include #include "common.h" #include "prm-regbits-33xx.h" @@ -19,7 +20,7 @@ * Resets the SoC. For @cmd, see the 'reboot' syscall in * kernel/sys.c. No return value. 
*/ -void am33xx_restart(char mode, const char *cmd) +void am33xx_restart(enum reboot_mode mode, const char *cmd) { /* TODO: Handle mode and cmd if necessary */ diff --git a/arch/arm/mach-omap2/common.h b/arch/arm/mach-omap2/common.h index 72cab3f..dfcc182 100644 --- a/arch/arm/mach-omap2/common.h +++ b/arch/arm/mach-omap2/common.h @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -119,33 +120,33 @@ static inline void omap_soc_device_init(void) #endif #if defined(CONFIG_SOC_OMAP2420) || defined(CONFIG_SOC_OMAP2430) -void omap2xxx_restart(char mode, const char *cmd); +void omap2xxx_restart(enum reboot_mode mode, const char *cmd); #else -static inline void omap2xxx_restart(char mode, const char *cmd) +static inline void omap2xxx_restart(enum reboot_mode mode, const char *cmd) { } #endif #ifdef CONFIG_SOC_AM33XX -void am33xx_restart(char mode, const char *cmd); +void am33xx_restart(enum reboot_mode mode, const char *cmd); #else -static inline void am33xx_restart(char mode, const char *cmd) +static inline void am33xx_restart(enum reboot_mode mode, const char *cmd) { } #endif #ifdef CONFIG_ARCH_OMAP3 -void omap3xxx_restart(char mode, const char *cmd); +void omap3xxx_restart(enum reboot_mode mode, const char *cmd); #else -static inline void omap3xxx_restart(char mode, const char *cmd) +static inline void omap3xxx_restart(enum reboot_mode mode, const char *cmd) { } #endif #if defined(CONFIG_ARCH_OMAP4) || defined(CONFIG_SOC_OMAP5) -void omap44xx_restart(char mode, const char *cmd); +void omap44xx_restart(enum reboot_mode mode, const char *cmd); #else -static inline void omap44xx_restart(char mode, const char *cmd) +static inline void omap44xx_restart(enum reboot_mode mode, const char *cmd) { } #endif diff --git a/arch/arm/mach-omap2/omap2-restart.c b/arch/arm/mach-omap2/omap2-restart.c index 719b716..68423e2 100644 --- a/arch/arm/mach-omap2/omap2-restart.c +++ b/arch/arm/mach-omap2/omap2-restart.c @@ -31,7 +31,7 @@ static struct clk 
*reset_virt_prcm_set_ck, *reset_sys_ck; * Set the DPLL to bypass so that reboot completes successfully. No * return value. */ -void omap2xxx_restart(char mode, const char *cmd) +void omap2xxx_restart(enum reboot_mode mode, const char *cmd) { u32 rate; diff --git a/arch/arm/mach-omap2/omap3-restart.c b/arch/arm/mach-omap2/omap3-restart.c index 923c582..5de2a0c 100644 --- a/arch/arm/mach-omap2/omap3-restart.c +++ b/arch/arm/mach-omap2/omap3-restart.c @@ -12,6 +12,7 @@ */ #include #include +#include #include "iomap.h" #include "common.h" @@ -28,7 +29,7 @@ * Resets the SoC. For @cmd, see the 'reboot' syscall in * kernel/sys.c. No return value. */ -void omap3xxx_restart(char mode, const char *cmd) +void omap3xxx_restart(enum reboot_mode mode, const char *cmd) { omap3_ctrl_write_boot_mode((cmd ? (u8)*cmd : 0)); omap3xxx_prm_dpll3_reset(); /* never returns */ diff --git a/arch/arm/mach-omap2/omap4-common.c b/arch/arm/mach-omap2/omap4-common.c index 38cd3a6..5791143 100644 --- a/arch/arm/mach-omap2/omap4-common.c +++ b/arch/arm/mach-omap2/omap4-common.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm/mach-omap2/omap4-restart.c b/arch/arm/mach-omap2/omap4-restart.c index f90e02e..41dfd7d 100644 --- a/arch/arm/mach-omap2/omap4-restart.c +++ b/arch/arm/mach-omap2/omap4-restart.c @@ -8,6 +8,7 @@ */ #include +#include #include "prminst44xx.h" /** @@ -18,7 +19,7 @@ * Resets the SoC. For @cmd, see the 'reboot' syscall in * kernel/sys.c. No return value. 
*/ -void omap44xx_restart(char mode, const char *cmd) +void omap44xx_restart(enum reboot_mode mode, const char *cmd) { /* XXX Should save 'cmd' into scratchpad for use after reboot */ omap4_prminst_global_warm_sw_reset(); /* never returns */ diff --git a/arch/arm/mach-orion5x/common.c b/arch/arm/mach-orion5x/common.c index f8a6db9..b41599f 100644 --- a/arch/arm/mach-orion5x/common.c +++ b/arch/arm/mach-orion5x/common.c @@ -347,7 +347,7 @@ void __init orion5x_init(void) orion5x_wdt_init(); } -void orion5x_restart(char mode, const char *cmd) +void orion5x_restart(enum reboot_mode mode, const char *cmd) { /* * Enable and issue soft reset diff --git a/arch/arm/mach-orion5x/common.h b/arch/arm/mach-orion5x/common.h index cdaa01f..a909afb 100644 --- a/arch/arm/mach-orion5x/common.h +++ b/arch/arm/mach-orion5x/common.h @@ -1,6 +1,8 @@ #ifndef __ARCH_ORION5X_COMMON_H #define __ARCH_ORION5X_COMMON_H +#include + struct dsa_platform_data; struct mv643xx_eth_platform_data; struct mv_sata_platform_data; @@ -29,7 +31,7 @@ void orion5x_spi_init(void); void orion5x_uart0_init(void); void orion5x_uart1_init(void); void orion5x_xor_init(void); -void orion5x_restart(char, const char *); +void orion5x_restart(enum reboot_mode, const char *); /* * PCIe/PCI functions. 
diff --git a/arch/arm/mach-orion5x/ls-chl-setup.c b/arch/arm/mach-orion5x/ls-chl-setup.c index 24f4e14..6234977 100644 --- a/arch/arm/mach-orion5x/ls-chl-setup.c +++ b/arch/arm/mach-orion5x/ls-chl-setup.c @@ -139,7 +139,7 @@ static struct mv_sata_platform_data lschl_sata_data = { static void lschl_power_off(void) { - orion5x_restart('h', NULL); + orion5x_restart(REBOOT_HARD, NULL); } /***************************************************************************** diff --git a/arch/arm/mach-orion5x/ls_hgl-setup.c b/arch/arm/mach-orion5x/ls_hgl-setup.c index fc653bb..fe04c4b 100644 --- a/arch/arm/mach-orion5x/ls_hgl-setup.c +++ b/arch/arm/mach-orion5x/ls_hgl-setup.c @@ -185,7 +185,7 @@ static struct mv_sata_platform_data ls_hgl_sata_data = { static void ls_hgl_power_off(void) { - orion5x_restart('h', NULL); + orion5x_restart(REBOOT_HARD, NULL); } diff --git a/arch/arm/mach-orion5x/lsmini-setup.c b/arch/arm/mach-orion5x/lsmini-setup.c index 18e66e6..ca4dbe9 100644 --- a/arch/arm/mach-orion5x/lsmini-setup.c +++ b/arch/arm/mach-orion5x/lsmini-setup.c @@ -185,7 +185,7 @@ static struct mv_sata_platform_data lsmini_sata_data = { static void lsmini_power_off(void) { - orion5x_restart('h', NULL); + orion5x_restart(REBOOT_HARD, NULL); } diff --git a/arch/arm/mach-picoxcell/common.c b/arch/arm/mach-picoxcell/common.c index b13f51b..ec79fea 100644 --- a/arch/arm/mach-picoxcell/common.c +++ b/arch/arm/mach-picoxcell/common.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -63,7 +64,7 @@ static const char *picoxcell_dt_match[] = { NULL }; -static void picoxcell_wdt_restart(char mode, const char *cmd) +static void picoxcell_wdt_restart(enum reboot_mode mode, const char *cmd) { /* * Configure the watchdog to reset with the shortest possible timeout diff --git a/arch/arm/mach-prima2/common.h b/arch/arm/mach-prima2/common.h index 81135cd..a630485 100644 --- a/arch/arm/mach-prima2/common.h +++ b/arch/arm/mach-prima2/common.h @@ -10,6 +10,8 @@ #define 
__MACH_PRIMA2_COMMON_H__ #include +#include + #include #include @@ -22,7 +24,7 @@ extern void sirfsoc_cpu_die(unsigned int cpu); extern void __init sirfsoc_of_irq_init(void); extern void __init sirfsoc_of_clk_init(void); -extern void sirfsoc_restart(char, const char *); +extern void sirfsoc_restart(enum reboot_mode, const char *); extern asmlinkage void __exception_irq_entry sirfsoc_handle_irq(struct pt_regs *regs); #ifndef CONFIG_DEBUG_LL diff --git a/arch/arm/mach-prima2/rstc.c b/arch/arm/mach-prima2/rstc.c index d5e0cbc..ccb5339 100644 --- a/arch/arm/mach-prima2/rstc.c +++ b/arch/arm/mach-prima2/rstc.c @@ -13,6 +13,7 @@ #include #include #include +#include void __iomem *sirfsoc_rstc_base; static DEFINE_MUTEX(rstc_lock); @@ -84,7 +85,7 @@ int sirfsoc_reset_device(struct device *dev) #define SIRFSOC_SYS_RST_BIT BIT(31) -void sirfsoc_restart(char mode, const char *cmd) +void sirfsoc_restart(enum reboot_mode mode, const char *cmd) { writel(SIRFSOC_SYS_RST_BIT, sirfsoc_rstc_base); } diff --git a/arch/arm/mach-pxa/corgi.c b/arch/arm/mach-pxa/corgi.c index a5b8fead..f162f1b 100644 --- a/arch/arm/mach-pxa/corgi.c +++ b/arch/arm/mach-pxa/corgi.c @@ -663,16 +663,16 @@ static void corgi_poweroff(void) /* Green LED off tells the bootloader to halt */ gpio_set_value(CORGI_GPIO_LED_GREEN, 0); - pxa_restart('h', NULL); + pxa_restart(REBOOT_HARD, NULL); } -static void corgi_restart(char mode, const char *cmd) +static void corgi_restart(enum reboot_mode mode, const char *cmd) { if (!machine_is_corgi()) /* Green LED on tells the bootloader to reboot */ gpio_set_value(CORGI_GPIO_LED_GREEN, 1); - pxa_restart('h', cmd); + pxa_restart(REBOOT_HARD, cmd); } static void __init corgi_init(void) diff --git a/arch/arm/mach-pxa/generic.h b/arch/arm/mach-pxa/generic.h index fd7ea39..8963984 100644 --- a/arch/arm/mach-pxa/generic.h +++ b/arch/arm/mach-pxa/generic.h @@ -9,6 +9,8 @@ * published by the Free Software Foundation. 
*/ +#include + struct irq_data; extern void pxa_timer_init(void); @@ -56,4 +58,4 @@ void __init pxa_set_btuart_info(void *info); void __init pxa_set_stuart_info(void *info); void __init pxa_set_hwuart_info(void *info); -void pxa_restart(char, const char *); +void pxa_restart(enum reboot_mode, const char *); diff --git a/arch/arm/mach-pxa/mioa701.c b/arch/arm/mach-pxa/mioa701.c index e6b0a93..acc9d3c 100644 --- a/arch/arm/mach-pxa/mioa701.c +++ b/arch/arm/mach-pxa/mioa701.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -696,13 +697,13 @@ static void mioa701_machine_exit(void); static void mioa701_poweroff(void) { mioa701_machine_exit(); - pxa_restart('s', NULL); + pxa_restart(REBOOT_SOFT, NULL); } -static void mioa701_restart(char c, const char *cmd) +static void mioa701_restart(enum reboot_mode c, const char *cmd) { mioa701_machine_exit(); - pxa_restart('s', cmd); + pxa_restart(REBOOT_SOFT, cmd); } static struct gpio global_gpios[] = { diff --git a/arch/arm/mach-pxa/poodle.c b/arch/arm/mach-pxa/poodle.c index 50ccd5f..711d37e 100644 --- a/arch/arm/mach-pxa/poodle.c +++ b/arch/arm/mach-pxa/poodle.c @@ -422,7 +422,7 @@ static struct i2c_board_info __initdata poodle_i2c_devices[] = { static void poodle_poweroff(void) { - pxa_restart('h', NULL); + pxa_restart(REBOOT_HARD, NULL); } static void __init poodle_init(void) diff --git a/arch/arm/mach-pxa/reset.c b/arch/arm/mach-pxa/reset.c index 3fab583..0d5dd64 100644 --- a/arch/arm/mach-pxa/reset.c +++ b/arch/arm/mach-pxa/reset.c @@ -83,7 +83,7 @@ static void do_hw_reset(void) writel_relaxed(readl_relaxed(OSCR) + 368640, OSMR3); } -void pxa_restart(char mode, const char *cmd) +void pxa_restart(enum reboot_mode mode, const char *cmd) { local_irq_disable(); local_fiq_disable(); @@ -91,14 +91,14 @@ void pxa_restart(char mode, const char *cmd) clear_reset_status(RESET_STATUS_ALL); switch (mode) { - case 's': + case REBOOT_SOFT: /* Jump into ROM at address 0 */ soft_restart(0); break; - 
case 'g': + case REBOOT_GPIO: do_gpio_reset(); break; - case 'h': + case REBOOT_HARD: default: do_hw_reset(); break; diff --git a/arch/arm/mach-pxa/spitz.c b/arch/arm/mach-pxa/spitz.c index c3c0042..2125df0 100644 --- a/arch/arm/mach-pxa/spitz.c +++ b/arch/arm/mach-pxa/spitz.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -924,10 +925,10 @@ static inline void spitz_i2c_init(void) {} ******************************************************************************/ static void spitz_poweroff(void) { - pxa_restart('g', NULL); + pxa_restart(REBOOT_GPIO, NULL); } -static void spitz_restart(char mode, const char *cmd) +static void spitz_restart(enum reboot_mode mode, const char *cmd) { uint32_t msc0 = __raw_readl(MSC0); /* Bootloader magic for a reboot */ diff --git a/arch/arm/mach-pxa/tosa.c b/arch/arm/mach-pxa/tosa.c index a41992f..0206b91 100644 --- a/arch/arm/mach-pxa/tosa.c +++ b/arch/arm/mach-pxa/tosa.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -911,10 +912,10 @@ static struct platform_device *devices[] __initdata = { static void tosa_poweroff(void) { - pxa_restart('g', NULL); + pxa_restart(REBOOT_GPIO, NULL); } -static void tosa_restart(char mode, const char *cmd) +static void tosa_restart(enum reboot_mode mode, const char *cmd) { uint32_t msc0 = __raw_readl(MSC0); diff --git a/arch/arm/mach-realview/realview_eb.c b/arch/arm/mach-realview/realview_eb.c index 5b1c8bf..c85ddb2 100644 --- a/arch/arm/mach-realview/realview_eb.c +++ b/arch/arm/mach-realview/realview_eb.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -418,7 +419,7 @@ static void __init realview_eb_timer_init(void) realview_eb_twd_init(); } -static void realview_eb_restart(char mode, const char *cmd) +static void realview_eb_restart(enum reboot_mode mode, const char *cmd) { void __iomem *reset_ctrl = __io_address(REALVIEW_SYS_RESETCTL); void __iomem *lock_ctrl = __io_address(REALVIEW_SYS_LOCK); diff --git 
a/arch/arm/mach-realview/realview_pb1176.c b/arch/arm/mach-realview/realview_pb1176.c index d5e83a1..c5eade7 100644 --- a/arch/arm/mach-realview/realview_pb1176.c +++ b/arch/arm/mach-realview/realview_pb1176.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -329,7 +330,7 @@ static void __init realview_pb1176_timer_init(void) realview_timer_init(IRQ_DC1176_TIMER0); } -static void realview_pb1176_restart(char mode, const char *cmd) +static void realview_pb1176_restart(enum reboot_mode mode, const char *cmd) { void __iomem *reset_ctrl = __io_address(REALVIEW_SYS_RESETCTL); void __iomem *lock_ctrl = __io_address(REALVIEW_SYS_LOCK); diff --git a/arch/arm/mach-realview/realview_pb11mp.c b/arch/arm/mach-realview/realview_pb11mp.c index c3cfe21..f4b0962 100644 --- a/arch/arm/mach-realview/realview_pb11mp.c +++ b/arch/arm/mach-realview/realview_pb11mp.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -316,7 +317,7 @@ static void __init realview_pb11mp_timer_init(void) realview_pb11mp_twd_init(); } -static void realview_pb11mp_restart(char mode, const char *cmd) +static void realview_pb11mp_restart(enum reboot_mode mode, const char *cmd) { void __iomem *reset_ctrl = __io_address(REALVIEW_SYS_RESETCTL); void __iomem *lock_ctrl = __io_address(REALVIEW_SYS_LOCK); diff --git a/arch/arm/mach-realview/realview_pba8.c b/arch/arm/mach-realview/realview_pba8.c index dde652a..10a3e1d 100644 --- a/arch/arm/mach-realview/realview_pba8.c +++ b/arch/arm/mach-realview/realview_pba8.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -264,7 +265,7 @@ static void __init realview_pba8_timer_init(void) realview_timer_init(IRQ_PBA8_TIMER0_1); } -static void realview_pba8_restart(char mode, const char *cmd) +static void realview_pba8_restart(enum reboot_mode mode, const char *cmd) { void __iomem *reset_ctrl = __io_address(REALVIEW_SYS_RESETCTL); void __iomem *lock_ctrl = __io_address(REALVIEW_SYS_LOCK); diff --git 
a/arch/arm/mach-realview/realview_pbx.c b/arch/arm/mach-realview/realview_pbx.c index 54f0185..9d75493 100644 --- a/arch/arm/mach-realview/realview_pbx.c +++ b/arch/arm/mach-realview/realview_pbx.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -344,7 +345,7 @@ static void realview_pbx_fixup(struct tag *tags, char **from, #endif } -static void realview_pbx_restart(char mode, const char *cmd) +static void realview_pbx_restart(enum reboot_mode mode, const char *cmd) { void __iomem *reset_ctrl = __io_address(REALVIEW_SYS_RESETCTL); void __iomem *lock_ctrl = __io_address(REALVIEW_SYS_LOCK); diff --git a/arch/arm/mach-rpc/riscpc.c b/arch/arm/mach-rpc/riscpc.c index a302cf5..09d602b 100644 --- a/arch/arm/mach-rpc/riscpc.c +++ b/arch/arm/mach-rpc/riscpc.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -201,7 +202,7 @@ static int __init rpc_init(void) arch_initcall(rpc_init); -static void rpc_restart(char mode, const char *cmd) +static void rpc_restart(enum reboot_mode mode, const char *cmd) { iomd_writeb(0, IOMD_ROMCR0); diff --git a/arch/arm/mach-s3c24xx/common.h b/arch/arm/mach-s3c24xx/common.h index 307c371..84b2806 100644 --- a/arch/arm/mach-s3c24xx/common.h +++ b/arch/arm/mach-s3c24xx/common.h @@ -12,6 +12,8 @@ #ifndef __ARCH_ARM_MACH_S3C24XX_COMMON_H #define __ARCH_ARM_MACH_S3C24XX_COMMON_H __FILE__ +#include + struct s3c2410_uartcfg; #ifdef CONFIG_CPU_S3C2410 @@ -20,7 +22,7 @@ extern int s3c2410a_init(void); extern void s3c2410_map_io(void); extern void s3c2410_init_uarts(struct s3c2410_uartcfg *cfg, int no); extern void s3c2410_init_clocks(int xtal); -extern void s3c2410_restart(char mode, const char *cmd); +extern void s3c2410_restart(enum reboot_mode mode, const char *cmd); extern void s3c2410_init_irq(void); #else #define s3c2410_init_clocks NULL @@ -36,7 +38,7 @@ extern void s3c2412_map_io(void); extern void s3c2412_init_uarts(struct s3c2410_uartcfg *cfg, int no); extern void s3c2412_init_clocks(int 
xtal); extern int s3c2412_baseclk_add(void); -extern void s3c2412_restart(char mode, const char *cmd); +extern void s3c2412_restart(enum reboot_mode mode, const char *cmd); extern void s3c2412_init_irq(void); #else #define s3c2412_init_clocks NULL @@ -51,7 +53,7 @@ extern void s3c2416_map_io(void); extern void s3c2416_init_uarts(struct s3c2410_uartcfg *cfg, int no); extern void s3c2416_init_clocks(int xtal); extern int s3c2416_baseclk_add(void); -extern void s3c2416_restart(char mode, const char *cmd); +extern void s3c2416_restart(enum reboot_mode mode, const char *cmd); extern void s3c2416_init_irq(void); extern struct syscore_ops s3c2416_irq_syscore_ops; @@ -66,7 +68,7 @@ extern struct syscore_ops s3c2416_irq_syscore_ops; extern void s3c244x_map_io(void); extern void s3c244x_init_uarts(struct s3c2410_uartcfg *cfg, int no); extern void s3c244x_init_clocks(int xtal); -extern void s3c244x_restart(char mode, const char *cmd); +extern void s3c244x_restart(enum reboot_mode mode, const char *cmd); #else #define s3c244x_init_clocks NULL #define s3c244x_init_uarts NULL @@ -96,7 +98,7 @@ extern void s3c2443_map_io(void); extern void s3c2443_init_uarts(struct s3c2410_uartcfg *cfg, int no); extern void s3c2443_init_clocks(int xtal); extern int s3c2443_baseclk_add(void); -extern void s3c2443_restart(char mode, const char *cmd); +extern void s3c2443_restart(enum reboot_mode mode, const char *cmd); extern void s3c2443_init_irq(void); #else #define s3c2443_init_clocks NULL diff --git a/arch/arm/mach-s3c24xx/s3c2410.c b/arch/arm/mach-s3c24xx/s3c2410.c index ff384ac..34676d1 100644 --- a/arch/arm/mach-s3c24xx/s3c2410.c +++ b/arch/arm/mach-s3c24xx/s3c2410.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -196,9 +197,9 @@ int __init s3c2410a_init(void) return s3c2410_init(); } -void s3c2410_restart(char mode, const char *cmd) +void s3c2410_restart(enum reboot_mode mode, const char *cmd) { - if (mode == 's') { + if (mode == REBOOT_SOFT) { soft_restart(0); 
} diff --git a/arch/arm/mach-s3c24xx/s3c2412.c b/arch/arm/mach-s3c24xx/s3c2412.c index 0f864d4..0251650c 100644 --- a/arch/arm/mach-s3c24xx/s3c2412.c +++ b/arch/arm/mach-s3c24xx/s3c2412.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -129,9 +130,9 @@ static void s3c2412_idle(void) cpu_do_idle(); } -void s3c2412_restart(char mode, const char *cmd) +void s3c2412_restart(enum reboot_mode mode, const char *cmd) { - if (mode == 's') + if (mode == REBOOT_SOFT) soft_restart(0); /* errata "Watch-dog/Software Reset Problem" specifies that diff --git a/arch/arm/mach-s3c24xx/s3c2416.c b/arch/arm/mach-s3c24xx/s3c2416.c index b9c5d38..9ef3ccf 100644 --- a/arch/arm/mach-s3c24xx/s3c2416.c +++ b/arch/arm/mach-s3c24xx/s3c2416.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -79,9 +80,9 @@ static struct device s3c2416_dev = { .bus = &s3c2416_subsys, }; -void s3c2416_restart(char mode, const char *cmd) +void s3c2416_restart(enum reboot_mode mode, const char *cmd) { - if (mode == 's') + if (mode == REBOOT_SOFT) soft_restart(0); __raw_writel(S3C2443_SWRST_RESET, S3C2443_SWRST); diff --git a/arch/arm/mach-s3c24xx/s3c2443.c b/arch/arm/mach-s3c24xx/s3c2443.c index 8328cd6..b6c7191 100644 --- a/arch/arm/mach-s3c24xx/s3c2443.c +++ b/arch/arm/mach-s3c24xx/s3c2443.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -59,9 +60,9 @@ static struct device s3c2443_dev = { .bus = &s3c2443_subsys, }; -void s3c2443_restart(char mode, const char *cmd) +void s3c2443_restart(enum reboot_mode mode, const char *cmd) { - if (mode == 's') + if (mode == REBOOT_SOFT) soft_restart(0); __raw_writel(S3C2443_SWRST_RESET, S3C2443_SWRST); diff --git a/arch/arm/mach-s3c24xx/s3c244x.c b/arch/arm/mach-s3c24xx/s3c244x.c index d0423e2..911b555 100644 --- a/arch/arm/mach-s3c24xx/s3c244x.c +++ b/arch/arm/mach-s3c24xx/s3c244x.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -198,9 +199,9 @@ struct 
syscore_ops s3c244x_pm_syscore_ops = { .resume = s3c244x_resume, }; -void s3c244x_restart(char mode, const char *cmd) +void s3c244x_restart(enum reboot_mode mode, const char *cmd) { - if (mode == 's') + if (mode == REBOOT_SOFT) soft_restart(0); samsung_wdt_reset(); diff --git a/arch/arm/mach-s3c64xx/common.c b/arch/arm/mach-s3c64xx/common.c index 1aed6f4..3f62e46 100644 --- a/arch/arm/mach-s3c64xx/common.c +++ b/arch/arm/mach-s3c64xx/common.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -381,9 +382,9 @@ static int __init s3c64xx_init_irq_eint(void) } arch_initcall(s3c64xx_init_irq_eint); -void s3c64xx_restart(char mode, const char *cmd) +void s3c64xx_restart(enum reboot_mode mode, const char *cmd) { - if (mode != 's') + if (mode != REBOOT_SOFT) samsung_wdt_reset(); /* if all else fails, or mode was for soft, jump to 0 */ diff --git a/arch/arm/mach-s3c64xx/common.h b/arch/arm/mach-s3c64xx/common.h index 6cfc99b..e8f990b 100644 --- a/arch/arm/mach-s3c64xx/common.h +++ b/arch/arm/mach-s3c64xx/common.h @@ -17,13 +17,15 @@ #ifndef __ARCH_ARM_MACH_S3C64XX_COMMON_H #define __ARCH_ARM_MACH_S3C64XX_COMMON_H +#include + void s3c64xx_init_irq(u32 vic0, u32 vic1); void s3c64xx_init_io(struct map_desc *mach_desc, int size); void s3c64xx_register_clocks(unsigned long xtal, unsigned armclk_limit); void s3c64xx_setup_clocks(void); -void s3c64xx_restart(char mode, const char *cmd); +void s3c64xx_restart(enum reboot_mode mode, const char *cmd); void s3c64xx_init_late(void); #ifdef CONFIG_CPU_S3C6400 diff --git a/arch/arm/mach-s5p64x0/common.c b/arch/arm/mach-s5p64x0/common.c index 76d0053..dfdfdc3 100644 --- a/arch/arm/mach-s5p64x0/common.c +++ b/arch/arm/mach-s5p64x0/common.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -439,9 +440,9 @@ static int __init s5p64x0_init_irq_eint(void) } arch_initcall(s5p64x0_init_irq_eint); -void s5p64x0_restart(char mode, const char *cmd) +void s5p64x0_restart(enum reboot_mode 
mode, const char *cmd) { - if (mode != 's') + if (mode != REBOOT_SOFT) samsung_wdt_reset(); soft_restart(0); diff --git a/arch/arm/mach-s5p64x0/common.h b/arch/arm/mach-s5p64x0/common.h index f8a60fd..f3a9b43 100644 --- a/arch/arm/mach-s5p64x0/common.h +++ b/arch/arm/mach-s5p64x0/common.h @@ -12,6 +12,8 @@ #ifndef __ARCH_ARM_MACH_S5P64X0_COMMON_H #define __ARCH_ARM_MACH_S5P64X0_COMMON_H +#include + void s5p6440_init_irq(void); void s5p6450_init_irq(void); void s5p64x0_init_io(struct map_desc *mach_desc, int size); @@ -22,7 +24,7 @@ void s5p6440_setup_clocks(void); void s5p6450_register_clocks(void); void s5p6450_setup_clocks(void); -void s5p64x0_restart(char mode, const char *cmd); +void s5p64x0_restart(enum reboot_mode mode, const char *cmd); #ifdef CONFIG_CPU_S5P6440 diff --git a/arch/arm/mach-s5pc100/common.c b/arch/arm/mach-s5pc100/common.c index 5110315..4bdfecf 100644 --- a/arch/arm/mach-s5pc100/common.c +++ b/arch/arm/mach-s5pc100/common.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -217,9 +218,9 @@ void __init s5pc100_init_uarts(struct s3c2410_uartcfg *cfg, int no) s3c24xx_init_uartdevs("s3c6400-uart", s5p_uart_resources, cfg, no); } -void s5pc100_restart(char mode, const char *cmd) +void s5pc100_restart(enum reboot_mode mode, const char *cmd) { - if (mode != 's') + if (mode != REBOOT_SOFT) samsung_wdt_reset(); soft_restart(0); diff --git a/arch/arm/mach-s5pc100/common.h b/arch/arm/mach-s5pc100/common.h index c41f912..08d782d 100644 --- a/arch/arm/mach-s5pc100/common.h +++ b/arch/arm/mach-s5pc100/common.h @@ -12,13 +12,15 @@ #ifndef __ARCH_ARM_MACH_S5PC100_COMMON_H #define __ARCH_ARM_MACH_S5PC100_COMMON_H +#include + void s5pc100_init_io(struct map_desc *mach_desc, int size); void s5pc100_init_irq(void); void s5pc100_register_clocks(void); void s5pc100_setup_clocks(void); -void s5pc100_restart(char mode, const char *cmd); +void s5pc100_restart(enum reboot_mode mode, const char *cmd); extern int s5pc100_init(void); extern void 
s5pc100_map_io(void); diff --git a/arch/arm/mach-s5pv210/common.c b/arch/arm/mach-s5pv210/common.c index 9dfe93e..023f1a7 100644 --- a/arch/arm/mach-s5pv210/common.c +++ b/arch/arm/mach-s5pv210/common.c @@ -143,7 +143,7 @@ static struct map_desc s5pv210_iodesc[] __initdata = { } }; -void s5pv210_restart(char mode, const char *cmd) +void s5pv210_restart(enum reboot_mode mode, const char *cmd) { __raw_writel(0x1, S5P_SWRESET); } diff --git a/arch/arm/mach-s5pv210/common.h b/arch/arm/mach-s5pv210/common.h index 0a1cc0ae..fe1beb5 100644 --- a/arch/arm/mach-s5pv210/common.h +++ b/arch/arm/mach-s5pv210/common.h @@ -12,13 +12,15 @@ #ifndef __ARCH_ARM_MACH_S5PV210_COMMON_H #define __ARCH_ARM_MACH_S5PV210_COMMON_H +#include + void s5pv210_init_io(struct map_desc *mach_desc, int size); void s5pv210_init_irq(void); void s5pv210_register_clocks(void); void s5pv210_setup_clocks(void); -void s5pv210_restart(char mode, const char *cmd); +void s5pv210_restart(enum reboot_mode mode, const char *cmd); extern int s5pv210_init(void); extern void s5pv210_map_io(void); diff --git a/arch/arm/mach-sa1100/generic.c b/arch/arm/mach-sa1100/generic.c index 9db3e98..f25b611 100644 --- a/arch/arm/mach-sa1100/generic.c +++ b/arch/arm/mach-sa1100/generic.c @@ -19,6 +19,7 @@ #include #include #include +#include #include