From 0cc482ee41d609811bd7b91abdbcd1c5dbdfb783 Mon Sep 17 00:00:00 2001 From: Guozhonghua Date: Tue, 11 Oct 2016 13:51:01 -0700 Subject: ocfs2: fix memory leak in dlm_migrate_request_handler() In the dlm_migrate_request_handler(), when `ret' is -EEXIST, the mle should be freed, otherwise the memory will be leaked. Link: http://lkml.kernel.org/r/71604351584F6A4EBAE558C676F37CA4A3D3522A@H3CMLB12-EX.srv.huawei-3com.com Signed-off-by: Guozhonghua Reviewed-by: Mark Fasheh Cc: Eric Ren Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 6ea06f8..3f828a1 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -3188,6 +3188,9 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, migrate->new_master, migrate->master); + if (ret < 0) + kmem_cache_free(dlm_mle_cache, mle); + spin_unlock(&dlm->master_lock); unlock: spin_unlock(&dlm->spinlock); -- cgit v0.10.2 From 22dd6d356628bccb1a83e12212ec2934f4444e2c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 11 Oct 2016 13:51:05 -0700 Subject: block: invalidate the page cache when issuing BLKZEROOUT Patch series "fallocate for block devices", v11. This is a patchset to fix page cache coherency with BLKZEROOUT and implement fallocate for block devices. The first patch is a fix to the existing BLKZEROOUT ioctl to invalidate the page cache if the zeroing command to the underlying device succeeds. Without this patch we still have the pagecache coherence bug that's been in the kernel forever. The second patch changes the internal block device functions to reject attempts to discard or zeroout that are not aligned to the logical block size. Previously, we only checked that the start/len parameters were 512-byte aligned, which caused kernel BUG_ONs for unaligned IOs to 4k-LBA devices. The third patch creates an fallocate handler for block devices, wires up the FALLOC_FL_PUNCH_HOLE flag to zeroing-discard, and connects FALLOC_FL_ZERO_RANGE to write-same so that we can have a consistent fallocate interface between files and block devices. It also allows the combination of PUNCH_HOLE and NO_HIDE_STALE to invoke non-zeroing discard. Test cases for the new block device fallocate are now in xfstests as generic/349-351. This patch (of 3): Invalidate the page cache (as a regular O_DIRECT write would do) to avoid returning stale cache contents at a later time. Link: http://lkml.kernel.org/r/147518378313.22791.16649519283678515021.stgit@birch.djwong.org Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Bart Van Assche Reviewed-by: Hannes Reinecke Cc: Theodore Ts'o Cc: Mike Snitzer Cc: Brian Foster Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/block/ioctl.c b/block/ioctl.c index ed2397f..755119c 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -225,7 +225,8 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, unsigned long arg) { uint64_t range[2]; - uint64_t start, len; + struct address_space *mapping; + uint64_t start, end, len; if (!(mode & FMODE_WRITE)) return -EBADF; @@ -235,18 +236,23 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, start = range[0]; len = range[1]; + end = start + len - 1; if (start & 511) return -EINVAL; if (len & 511) return -EINVAL; - start >>= 9; - len >>= 9; - - if (start + len > (i_size_read(bdev->bd_inode) >> 9)) + if (end >= (uint64_t)i_size_read(bdev->bd_inode)) + return -EINVAL; + if (end < start) return -EINVAL; - return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL, false); + /* Invalidate the page cache, including dirty pages */ + mapping = bdev->bd_inode->i_mapping; + truncate_inode_pages_range(mapping, start, end); + + return blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL, + false); } static int put_ushort(unsigned long arg, unsigned short val) -- cgit v0.10.2 From 28b2be203e5a09de566d6f7e21183f861e36f07e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 11 Oct 2016 13:51:08 -0700 Subject: block: require write_same and discard requests align to logical block size Make sure that the offset and length arguments that we're using to construct WRITE SAME and DISCARD requests are actually aligned to the logical block size. Failure to do this causes other errors in other parts of the block layer or the SCSI layer because disks don't support partial logical block writes. Link: http://lkml.kernel.org/r/147518379026.22791.4437508871355153928.stgit@birch.djwong.org Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Cc: Theodore Ts'o Cc: Mike Snitzer # tweaked header Cc: Brian Foster Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/block/blk-lib.c b/block/blk-lib.c index 083e56f..46fe924 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -31,6 +31,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, unsigned int granularity; enum req_op op; int alignment; + sector_t bs_mask; if (!q) return -ENXIO; @@ -50,6 +51,10 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, op = REQ_OP_DISCARD; } + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + /* Zero-sector (unknown) and one-sector granularities are the same. */ granularity = max(q->limits.discard_granularity >> 9, 1U); alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; @@ -150,10 +155,15 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, unsigned int max_write_same_sectors; struct bio *bio = NULL; int ret = 0; + sector_t bs_mask; if (!q) return -ENXIO; + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + /* Ensure that max_write_same_sectors doesn't overflow bi_size */ max_write_same_sectors = UINT_MAX >> 9; @@ -202,6 +212,11 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, int ret; struct bio *bio = NULL; unsigned int sz; + sector_t bs_mask; + + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; while (nr_sects != 0) { bio = next_bio(bio, min(nr_sects, (sector_t)BIO_MAX_PAGES), -- cgit v0.10.2 From 25f4c41415e513f0e9fb1f3fce2ce98fcba8d263 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 11 Oct 2016 13:51:11 -0700 Subject: block: implement (some of) fallocate for block devices After much discussion, it seems that the fallocate feature flag FALLOC_FL_ZERO_RANGE maps nicely to SCSI WRITE SAME; and the feature FALLOC_FL_PUNCH_HOLE maps nicely to the devices that have been whitelisted for zeroing SCSI UNMAP. Punch still requires that FALLOC_FL_KEEP_SIZE is set. A length that goes past the end of the device will be clamped to the device size if KEEP_SIZE is set; or will return -EINVAL if not. Both start and length must be aligned to the device's logical block size. Since the semantics of fallocate are fairly well established already, wire up the two pieces. The other fallocate variants (collapse range, insert range, and allocate blocks) are not supported. Link: http://lkml.kernel.org/r/147518379992.22791.8849838163218235007.stgit@birch.djwong.org Signed-off-by: Darrick J. Wong Reviewed-by: Hannes Reinecke Reviewed-by: Bart Van Assche Cc: Theodore Ts'o Cc: Martin K. Petersen Cc: Mike Snitzer # tweaked header Cc: Brian Foster Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/block_dev.c b/fs/block_dev.c index 376e4e4..05b5533 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include "internal.h" @@ -1775,6 +1776,81 @@ static const struct address_space_operations def_blk_aops = { .is_dirty_writeback = buffer_check_dirty_writeback, }; +#define BLKDEV_FALLOC_FL_SUPPORTED \ + (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ + FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE) + +static long blkdev_fallocate(struct file *file, int mode, loff_t start, + loff_t len) +{ + struct block_device *bdev = I_BDEV(bdev_file_inode(file)); + struct request_queue *q = bdev_get_queue(bdev); + struct address_space *mapping; + loff_t end = start + len - 1; + loff_t isize; + int error; + + /* Fail if we don't recognize the flags. */ + if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED) + return -EOPNOTSUPP; + + /* Don't go off the end of the device. */ + isize = i_size_read(bdev->bd_inode); + if (start >= isize) + return -EINVAL; + if (end >= isize) { + if (mode & FALLOC_FL_KEEP_SIZE) { + len = isize - start; + end = start + len - 1; + } else + return -EINVAL; + } + + /* + * Don't allow IO that isn't aligned to logical block size. + */ + if ((start | len) & (bdev_logical_block_size(bdev) - 1)) + return -EINVAL; + + /* Invalidate the page cache, including dirty pages. */ + mapping = bdev->bd_inode->i_mapping; + truncate_inode_pages_range(mapping, start, end); + + switch (mode) { + case FALLOC_FL_ZERO_RANGE: + case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: + error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, + GFP_KERNEL, false); + break; + case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: + /* Only punch if the device can do zeroing discard. */ + if (!blk_queue_discard(q) || !q->limits.discard_zeroes_data) + return -EOPNOTSUPP; + error = blkdev_issue_discard(bdev, start >> 9, len >> 9, + GFP_KERNEL, 0); + break; + case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE: + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + error = blkdev_issue_discard(bdev, start >> 9, len >> 9, + GFP_KERNEL, 0); + break; + default: + return -EOPNOTSUPP; + } + if (error) + return error; + + /* + * Invalidate again; if someone wandered in and dirtied a page, + * the caller will be given -EBUSY. The third argument is + * inclusive, so the rounding here is safe. + */ + return invalidate_inode_pages2_range(mapping, + start >> PAGE_SHIFT, + end >> PAGE_SHIFT); +} + const struct file_operations def_blk_fops = { .open = blkdev_open, .release = blkdev_close, @@ -1789,6 +1865,7 @@ const struct file_operations def_blk_fops = { #endif .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, + .fallocate = blkdev_fallocate, }; int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) diff --git a/fs/open.c b/fs/open.c index 8aeb08b..a7719cf 100644 --- a/fs/open.c +++ b/fs/open.c @@ -300,7 +300,8 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) * Let individual file system decide if it supports preallocation * for directories or not. */ - if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && + !S_ISBLK(inode->i_mode)) return -ENODEV; /* Check for wrap through zero too */ -- cgit v0.10.2 From 2d19309cf86883f634a4f8ec55a54bda87db19bf Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 11 Oct 2016 13:51:14 -0700 Subject: fs/select: add vmalloc fallback for select(2) The select(2) syscall performs a kmalloc(size, GFP_KERNEL) where size grows with the number of fds passed. We had a customer report page allocation failures of order-4 for this allocation. This is a costly order, so it might easily fail, as the VM expects such allocation to have a lower-order fallback. Such trivial fallback is vmalloc(), as the memory doesn't have to be physically contiguous and the allocation is temporary for the duration of the syscall only. There were some concerns, whether this would have negative impact on the system by exposing vmalloc() to userspace. Although an excessive use of vmalloc can cause some system wide performance issues - TLB flushes etc. - a large order allocation is not for free either and an excessive reclaim/compaction can have a similar effect. Also note that the size is effectively limited by RLIMIT_NOFILE which defaults to 1024 on the systems I checked. That means the bitmaps will fit well within single page and thus the vmalloc() fallback could be only excercised for processes where root allows a higher limit. Note that the poll(2) syscall seems to use a linked list of order-0 pages, so it doesn't need this kind of fallback. [eric.dumazet@gmail.com: fix failure path logic] [akpm@linux-foundation.org: use proper type for size] Link: http://lkml.kernel.org/r/20160927084536.5923-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Alexander Viro Cc: Eric Dumazet Cc: David Laight Cc: Hillf Danton Cc: Nicholas Piggin Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/select.c b/fs/select.c index 8ed9da5..3d4f85d 100644 --- a/fs/select.c +++ b/fs/select.c @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -554,7 +555,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set_bits fds; void *bits; int ret, max_fds; - unsigned int size; + size_t size, alloc_size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; @@ -581,7 +582,14 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, if (size > sizeof(stack_fds) / 6) { /* Not enough space in on-stack array; must use kmalloc */ ret = -ENOMEM; - bits = kmalloc(6 * size, GFP_KERNEL); + if (size > (SIZE_MAX / 6)) + goto out_nofds; + + alloc_size = 6 * size; + bits = kmalloc(alloc_size, GFP_KERNEL|__GFP_NOWARN); + if (!bits && alloc_size > PAGE_SIZE) + bits = vmalloc(alloc_size); + if (!bits) goto out_nofds; } @@ -618,7 +626,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, out: if (bits != stack_fds) - kfree(bits); + kvfree(bits); out_nofds: return ret; } -- cgit v0.10.2 From 915045fe15a5fc376f263d594aee4fca4fba5323 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Tue, 11 Oct 2016 13:51:18 -0700 Subject: radix-tree: 'slot' can be NULL in radix_tree_next_slot() There are four cases I can see where we could end up with a NULL 'slot' in radix_tree_next_slot(). Yet radix_tree_next_slot() never actually checks whether 'slot' is NULL. It just happens that for the cases where 'slot' is NULL, some other combination of factors prevents us from dereferencing it. It would be very easy for someone to unwittingly change one of these factors without realizing that we are implicitly depending on it to save us from a NULL pointer dereference. Add a comment documenting the things that allow 'slot' to be safely passed as NULL to radix_tree_next_slot(). Here are details on the four cases: 1) radix_tree_iter_retry() via a non-tagged iteration like radix_tree_for_each_slot(). In this case we currently aren't seeing a bug because radix_tree_iter_retry() sets iter->next_index = iter->index; which means that in in the else case in radix_tree_next_slot(), 'count' is zero, so we skip over the while() loop and effectively just return NULL without ever dereferencing 'slot'. 2) radix_tree_iter_retry() via tagged iteration like radix_tree_for_each_tagged(). This case was giving us NULL pointer dereferences in testing, and was fixed with this commit: commit 3cb9185c6730 ("radix-tree: fix radix_tree_iter_retry() for tagged iterators.") This fix doesn't explicitly check for 'slot' being NULL, though, it works around the NULL pointer dereference by instead zeroing iter->tags in radix_tree_iter_retry(), which makes us bail out of the if() case in radix_tree_next_slot() before we dereference 'slot'. 3) radix_tree_iter_next() via via a non-tagged iteration like radix_tree_for_each_slot(). This currently happens in shmem_tag_pins() and shmem_partial_swap_usage(). As with non-tagged iteration, 'count' in the else case of radix_tree_next_slot() is zero, so we skip over the while() loop and effectively just return NULL without ever dereferencing 'slot'. 4) radix_tree_iter_next() via tagged iteration like radix_tree_for_each_tagged(). This happens in shmem_wait_for_pins(). radix_tree_iter_next() zeros out iter->tags, so we end up exiting radix_tree_next_slot() here: if (flags & RADIX_TREE_ITER_TAGGED) { void *canon = slot; iter->tags >>= 1; if (unlikely(!iter->tags)) return NULL; Link: http://lkml.kernel.org/r/20160815194237.25967-2-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Cc: Konstantin Khlebnikov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 52b97db..af3581b 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -461,6 +461,14 @@ static inline struct radix_tree_node *entry_to_node(void *ptr) * * This function updates @iter->index in the case of a successful lookup. * For tagged lookup it also eats @iter->tags. + * + * There are several cases where 'slot' can be passed in as NULL to this + * function. These cases result from the use of radix_tree_iter_next() or + * radix_tree_iter_retry(). In these cases we don't end up dereferencing + * 'slot' because either: + * a) we are doing tagged iteration and iter->tags has been set to 0, or + * b) we are doing non-tagged iteration, and iter->index and iter->next_index + * have been set up so that radix_tree_chunk_size() returns 1 or 0. */ static __always_inline void ** radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) -- cgit v0.10.2 From eec4852543e4e6edbb6cab512fd1edc70c1f7a18 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Tue, 11 Oct 2016 13:51:21 -0700 Subject: radix-tree tests: add iteration test There are four cases I can see where we could end up with a NULL 'slot' in radix_tree_next_slot(). This unit test exercises all four of them, making sure that if in the future we have an unsafe path through radix_tree_next_slot(), we'll catch it. Here are details on the four cases: 1) radix_tree_iter_retry() via a non-tagged iteration like radix_tree_for_each_slot(). In this case we currently aren't seeing a bug because radix_tree_iter_retry() sets iter->next_index = iter->index; which means that in in the else case in radix_tree_next_slot(), 'count' is zero, so we skip over the while() loop and effectively just return NULL without ever dereferencing 'slot'. 2) radix_tree_iter_retry() via tagged iteration like radix_tree_for_each_tagged(). This case was giving us NULL pointer dereferences in testing, and was fixed with this commit: commit 3cb9185c6730 ("radix-tree: fix radix_tree_iter_retry() for tagged iterators.") This fix doesn't explicitly check for 'slot' being NULL, though, it works around the NULL pointer dereference by instead zeroing iter->tags in radix_tree_iter_retry(), which makes us bail out of the if() case in radix_tree_next_slot() before we dereference 'slot'. 3) radix_tree_iter_next() via via a non-tagged iteration like radix_tree_for_each_slot(). This currently happens in shmem_tag_pins() and shmem_partial_swap_usage(). As with non-tagged iteration, 'count' in the else case of radix_tree_next_slot() is zero, so we skip over the while() loop and effectively just return NULL without ever dereferencing 'slot'. 4) radix_tree_iter_next() via tagged iteration like radix_tree_for_each_tagged(). This happens in shmem_wait_for_pins(). radix_tree_iter_next() zeros out iter->tags, so we end up exiting radix_tree_next_slot() here: if (flags & RADIX_TREE_ITER_TAGGED) { void *canon = slot; iter->tags >>= 1; if (unlikely(!iter->tags)) return NULL; Link: http://lkml.kernel.org/r/20160815194237.25967-3-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Cc: Konstantin Khlebnikov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile index 9d0919ed..f2e07f2 100644 --- a/tools/testing/radix-tree/Makefile +++ b/tools/testing/radix-tree/Makefile @@ -3,7 +3,8 @@ CFLAGS += -I. -g -O2 -Wall -D_LGPL_SOURCE LDFLAGS += -lpthread -lurcu TARGETS = main OFILES = main.o radix-tree.o linux.o test.o tag_check.o find_next_bit.o \ - regression1.o regression2.o regression3.o multiorder.o + regression1.o regression2.o regression3.o multiorder.o \ + iteration_check.o targets: $(TARGETS) diff --git a/tools/testing/radix-tree/iteration_check.c b/tools/testing/radix-tree/iteration_check.c new file mode 100644 index 0000000..9adb8e7 --- /dev/null +++ b/tools/testing/radix-tree/iteration_check.c @@ -0,0 +1,180 @@ +/* + * iteration_check.c: test races having to do with radix tree iteration + * Copyright (c) 2016 Intel Corporation + * Author: Ross Zwisler + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include +#include +#include "test.h" + +#define NUM_THREADS 4 +#define TAG 0 +static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER; +static pthread_t threads[NUM_THREADS]; +RADIX_TREE(tree, GFP_KERNEL); +bool test_complete; + +/* relentlessly fill the tree with tagged entries */ +static void *add_entries_fn(void *arg) +{ + int pgoff; + + while (!test_complete) { + for (pgoff = 0; pgoff < 100; pgoff++) { + pthread_mutex_lock(&tree_lock); + if (item_insert(&tree, pgoff) == 0) + item_tag_set(&tree, pgoff, TAG); + pthread_mutex_unlock(&tree_lock); + } + } + + return NULL; +} + +/* + * Iterate over the tagged entries, doing a radix_tree_iter_retry() as we find + * things that have been removed and randomly resetting our iteration to the + * next chunk with radix_tree_iter_next(). Both radix_tree_iter_retry() and + * radix_tree_iter_next() cause radix_tree_next_slot() to be called with a + * NULL 'slot' variable. + */ +static void *tagged_iteration_fn(void *arg) +{ + struct radix_tree_iter iter; + void **slot; + + while (!test_complete) { + rcu_read_lock(); + radix_tree_for_each_tagged(slot, &tree, &iter, 0, TAG) { + void *entry; + int i; + + /* busy wait to let removals happen */ + for (i = 0; i < 1000000; i++) + ; + + entry = radix_tree_deref_slot(slot); + if (unlikely(!entry)) + continue; + + if (radix_tree_deref_retry(entry)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + + if (rand() % 50 == 0) + slot = radix_tree_iter_next(&iter); + } + rcu_read_unlock(); + } + + return NULL; +} + +/* + * Iterate over the entries, doing a radix_tree_iter_retry() as we find things + * that have been removed and randomly resetting our iteration to the next + * chunk with radix_tree_iter_next(). Both radix_tree_iter_retry() and + * radix_tree_iter_next() cause radix_tree_next_slot() to be called with a + * NULL 'slot' variable. + */ +static void *untagged_iteration_fn(void *arg) +{ + struct radix_tree_iter iter; + void **slot; + + while (!test_complete) { + rcu_read_lock(); + radix_tree_for_each_slot(slot, &tree, &iter, 0) { + void *entry; + int i; + + /* busy wait to let removals happen */ + for (i = 0; i < 1000000; i++) + ; + + entry = radix_tree_deref_slot(slot); + if (unlikely(!entry)) + continue; + + if (radix_tree_deref_retry(entry)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + + if (rand() % 50 == 0) + slot = radix_tree_iter_next(&iter); + } + rcu_read_unlock(); + } + + return NULL; +} + +/* + * Randomly remove entries to help induce radix_tree_iter_retry() calls in the + * two iteration functions. + */ +static void *remove_entries_fn(void *arg) +{ + while (!test_complete) { + int pgoff; + + pgoff = rand() % 100; + + pthread_mutex_lock(&tree_lock); + item_delete(&tree, pgoff); + pthread_mutex_unlock(&tree_lock); + } + + return NULL; +} + +/* This is a unit test for a bug found by the syzkaller tester */ +void iteration_test(void) +{ + int i; + + printf("Running iteration tests for 10 seconds\n"); + + srand(time(0)); + test_complete = false; + + if (pthread_create(&threads[0], NULL, tagged_iteration_fn, NULL)) { + perror("pthread_create"); + exit(1); + } + if (pthread_create(&threads[1], NULL, untagged_iteration_fn, NULL)) { + perror("pthread_create"); + exit(1); + } + if (pthread_create(&threads[2], NULL, add_entries_fn, NULL)) { + perror("pthread_create"); + exit(1); + } + if (pthread_create(&threads[3], NULL, remove_entries_fn, NULL)) { + perror("pthread_create"); + exit(1); + } + + sleep(10); + test_complete = true; + + for (i = 0; i < NUM_THREADS; i++) { + if (pthread_join(threads[i], NULL)) { + perror("pthread_join"); + exit(1); + } + } + + item_kill_tree(&tree); +} diff --git a/tools/testing/radix-tree/main.c b/tools/testing/radix-tree/main.c index b7619ff..daa9010 100644 --- a/tools/testing/radix-tree/main.c +++ b/tools/testing/radix-tree/main.c @@ -332,6 +332,7 @@ int main(int argc, char **argv) regression1_test(); regression2_test(); regression3_test(); + iteration_test(); single_thread_tests(long_run); sleep(1); diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h index e851313..217fb24 100644 --- a/tools/testing/radix-tree/test.h +++ b/tools/testing/radix-tree/test.h @@ -27,6 +27,7 @@ void item_kill_tree(struct radix_tree_root *root); void tag_check(void); void multiorder_checks(void); +void iteration_test(void); struct item * item_tag_set(struct radix_tree_root *root, unsigned long index, int tag); -- cgit v0.10.2 From e0176a2f1e131294824d0e50e719cd12290cf06c Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Tue, 11 Oct 2016 13:51:24 -0700 Subject: radix-tree tests: properly initialize mutex The pthread_mutex_t in regression1.c wasn't being initialized properly. Link: http://lkml.kernel.org/r/20160815194237.25967-4-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Cc: Konstantin Khlebnikov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/tools/testing/radix-tree/regression1.c b/tools/testing/radix-tree/regression1.c index 2d03a63..0d6813a 100644 --- a/tools/testing/radix-tree/regression1.c +++ b/tools/testing/radix-tree/regression1.c @@ -43,7 +43,7 @@ #include "regression.h" static RADIX_TREE(mt_tree, GFP_KERNEL); -static pthread_mutex_t mt_lock; +static pthread_mutex_t mt_lock = PTHREAD_MUTEX_INITIALIZER; struct page { pthread_mutex_t lock; -- cgit v0.10.2 From bf90e56e467ed5766722972d483e6711889ed1b0 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 11 Oct 2016 13:51:27 -0700 Subject: lib: harden strncpy_from_user The strncpy_from_user() accessor is effectively a copy_from_user() specialised to copy strings, terminating early at a NUL byte if possible. In other respects it is identical, and can be used to copy an arbitrarily large buffer from userspace into the kernel. Conceptually, it exposes a similar attack surface. As with copy_from_user(), we check the destination range when the kernel is built with KASAN, but unlike copy_from_user() we do not check the destination buffer when using HARDENED_USERCOPY. As strncpy_from_user() calls get_user() in a loop, we must call check_object_size() explicitly. This patch adds this instrumentation to strncpy_from_user(), per the same rationale as with the regular copy_from_user(). In the absence of hardened usercopy this will have no impact as the instrumentation expands to an empty static inline function. Link: http://lkml.kernel.org/r/1472221903-31181-1-git-send-email-mark.rutland@arm.com Signed-off-by: Mark Rutland Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 9c5fe81..7e35fc4 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -111,6 +112,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count) long retval; kasan_check_write(dst, count); + check_object_size(dst, count, false); user_access_begin(); retval = do_strncpy_from_user(dst, src, count, max); user_access_end(); -- cgit v0.10.2 From 1204c77f9b6ab8ba8cc6cfe00342f5e64a740cdf Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 11 Oct 2016 13:51:30 -0700 Subject: include/linux/ctype.h: make isdigit() table lookupless Make isdigit into a simple range checking inline function: return '0' <= c && c <= '9'; This code is 1 branch, not 2 because any reasonable compiler can optimize this code into SUB+CMP, so the code while (isdigit((c = *s++))) ... remains 1 branch per iteration HOWEVER it suddenly doesn't do table lookup priming cacheline nobody cares about. Link: http://lkml.kernel.org/r/20160826190047.GA12536@p183.telecom.by Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/ctype.h b/include/linux/ctype.h index 653589e..f13e4ff 100644 --- a/include/linux/ctype.h +++ b/include/linux/ctype.h @@ -22,7 +22,10 @@ extern const unsigned char _ctype[]; #define isalnum(c) ((__ismask(c)&(_U|_L|_D)) != 0) #define isalpha(c) ((__ismask(c)&(_U|_L)) != 0) #define iscntrl(c) ((__ismask(c)&(_C)) != 0) -#define isdigit(c) ((__ismask(c)&(_D)) != 0) +static inline int isdigit(int c) +{ + return '0' <= c && c <= '9'; +} #define isgraph(c) ((__ismask(c)&(_P|_U|_L|_D)) != 0) #define islower(c) ((__ismask(c)&(_L)) != 0) #define isprint(c) ((__ismask(c)&(_P|_U|_L|_D|_SP)) != 0) -- cgit v0.10.2 From 8cfd56d4790423499d03f09f8584e0c6de494ec7 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 11 Oct 2016 13:51:32 -0700 Subject: lib/kstrtox.c: smaller _parse_integer() Set "overflow" bit upon encountering it instead of postponing to the end of the conversion. Somehow gcc unwedges itself and generates better code: $ ./scripts/bloat-o-meter ../vmlinux-000 ../obj/vmlinux _parse_integer 177 139 -38 Inspired by patch from Zhaoxiu Zeng. Link: http://lkml.kernel.org/r/20160826221920.GA1909@p183.telecom.by Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/kstrtox.c b/lib/kstrtox.c index d8a5cf6..b8e2080 100644 --- a/lib/kstrtox.c +++ b/lib/kstrtox.c @@ -48,11 +48,9 @@ unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long { unsigned long long res; unsigned int rv; - int overflow; res = 0; rv = 0; - overflow = 0; while (*s) { unsigned int val; @@ -71,15 +69,13 @@ unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long */ if (unlikely(res & (~0ull << 60))) { if (res > div_u64(ULLONG_MAX - val, base)) - overflow = 1; + rv |= KSTRTOX_OVERFLOW; } res = res * base + val; rv++; s++; } *p = res; - if (overflow) - rv |= KSTRTOX_OVERFLOW; return rv; } -- cgit v0.10.2 From 2d13e6ca429c0a6fbc82750acbece829facceec5 Mon Sep 17 00:00:00 2001 From: Noam Camus Date: Tue, 11 Oct 2016 13:51:35 -0700 Subject: lib/bitmap.c: enhance bitmap syntax Today there are platforms with many CPUs (up to 4K). Trying to boot only part of the CPUs may result in too long string. For example lets take NPS platform that is part of arch/arc. This platform have SMP system with 256 cores each with 16 HW threads (SMT machine) where HW thread appears as CPU to the kernel. In this example there is total of 4K CPUs. When one tries to boot only part of the HW threads from each core the string representing the map may be long... For example if for sake of performance we decided to boot only first half of HW threads of each core the map will look like: 0-7,16-23,32-39,...,4080-4087 This patch introduce new syntax to accommodate with such use case. I added an optional postfix to a range of CPUs which will choose according to given modulo the desired range of reminders i.e.: :sed_size/group_size For example, above map can be described in new syntax like this: 0-4095:8/16 Note that this patch is backward compatible with current syntax. [akpm@linux-foundation.org: rework documentation] Link: http://lkml.kernel.org/r/1473579629-4283-1-git-send-email-noamca@mellanox.com Signed-off-by: Noam Camus Cc: David Decotigny Cc: Ben Hutchings Cc: David S. Miller Cc: Pan Xinhui Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 705fb91..a1489e1 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -33,6 +33,37 @@ can also be entered as Double-quotes can be used to protect spaces in values, e.g.: param="spaces in here" +cpu lists: +---------- + +Some kernel parameters take a list of CPUs as a value, e.g. isolcpus, +nohz_full, irqaffinity, rcu_nocbs. The format of this list is: + + ,..., + +or + + - + (must be a positive range in ascending order) + +or a mixture + +,...,- + +Note that for the special case of a range one can split the range into equal +sized groups and for each group use some amount from the beginning of that +group: + + -cpu number>:/ + +For example one can add to the command line following parameter: + + isolcpus=1,2,10-20,100-2000:2/25 + +where the final item represents CPUs 100,101,125,126,150,151,... + + + This document may not be entirely up to date and comprehensive. The command "modinfo -p ${modulename}" shows a current list of all parameters of a loadable module. Loadable modules, after being loaded into the running kernel, also @@ -1789,13 +1820,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. See Documentation/filesystems/nfs/nfsroot.txt. irqaffinity= [SMP] Set the default irq affinity mask - Format: - ,..., - or - - - (must be a positive range in ascending order) - or a mixture - ,...,- + The argument is a cpu list, as described above. irqfixup [HW] When an interrupt is not handled search all handlers @@ -1812,13 +1837,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Format: ,,, isolcpus= [KNL,SMP] Isolate CPUs from the general scheduler. - Format: - ,..., - or - - - (must be a positive range in ascending order) - or a mixture - ,...,- + The argument is a cpu list, as described above. This option can be used to specify one or more CPUs to isolate from the general SMP balancing and scheduling @@ -2680,6 +2699,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Default: on nohz_full= [KNL,BOOT] + The argument is a cpu list, as described above. In kernels built with CONFIG_NO_HZ_FULL=y, set the specified list of CPUs whose tick will be stopped whenever possible. The boot CPU will be forced outside @@ -3285,6 +3305,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. See Documentation/blockdev/ramdisk.txt. rcu_nocbs= [KNL] + The argument is a cpu list, as described above. + In kernels built with CONFIG_RCU_NOCB_CPU=y, set the specified list of CPUs to be no-callback CPUs. Invocation of these CPUs' RCU callbacks will diff --git a/lib/bitmap.c b/lib/bitmap.c index eca8808..0b66f0e 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -496,6 +496,11 @@ EXPORT_SYMBOL(bitmap_print_to_pagebuf); * ranges. Consecutively set bits are shown as two hyphen-separated * decimal numbers, the smallest and largest bit numbers set in * the range. + * Optionally each range can be postfixed to denote that only parts of it + * should be set. The range will divided to groups of specific size. + * From each group will be used only defined amount of bits. + * Syntax: range:used_size/group_size + * Example: 0-1023:2/256 ==> 0,1,256,257,512,513,768,769 * * Returns 0 on success, -errno on invalid input strings. * Error values: @@ -507,16 +512,20 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen, int is_user, unsigned long *maskp, int nmaskbits) { - unsigned a, b; + unsigned int a, b, old_a, old_b; + unsigned int group_size, used_size; int c, old_c, totaldigits, ndigits; const char __user __force *ubuf = (const char __user __force *)buf; - int at_start, in_range; + int at_start, in_range, in_partial_range; totaldigits = c = 0; + old_a = old_b = 0; + group_size = used_size = 0; bitmap_zero(maskp, nmaskbits); do { at_start = 1; in_range = 0; + in_partial_range = 0; a = b = 0; ndigits = totaldigits; @@ -547,6 +556,24 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen, if ((totaldigits != ndigits) && isspace(old_c)) return -EINVAL; + if (c == '/') { + used_size = a; + at_start = 1; + in_range = 0; + a = b = 0; + continue; + } + + if (c == ':') { + old_a = a; + old_b = b; + at_start = 1; + in_range = 0; + in_partial_range = 1; + a = b = 0; + continue; + } + if (c == '-') { if (at_start || in_range) return -EINVAL; @@ -567,15 +594,30 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen, } if (ndigits == totaldigits) continue; + if (in_partial_range) { + group_size = a; + a = old_a; + b = old_b; + old_a = old_b = 0; + } /* if no digit is after '-', it's wrong*/ if (at_start && in_range) return -EINVAL; - if (!(a <= b)) + if (!(a <= b) || !(used_size <= group_size)) return -EINVAL; if (b >= nmaskbits) return -ERANGE; while (a <= b) { - set_bit(a, maskp); + if (in_partial_range) { + static int pos_in_group = 1; + + if (pos_in_group <= used_size) + set_bit(a, maskp); + + if (a == b || ++pos_in_group > group_size) + pos_in_group = 1; + } else + set_bit(a, maskp); a++; } } while (buflen && c == ','); -- cgit v0.10.2 From 85b0ee18bbf82cb3e1880c718749149c6ed61058 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Oct 2016 13:51:44 -0700 Subject: checkpatch: see if modified files are marked obsolete in MAINTAINERS Use get_maintainer to check the status of individual files. If "obsolete", suggest leaving the files alone. Link: http://lkml.kernel.org/r/7ceaa510dc9d2df05ec4b456baed7bb1415550b3.1471889575.git.joe@perches.com Signed-off-by: Joe Perches Cc: SF Markus Elfring Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 206a6b3..34b4455 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -704,6 +704,16 @@ sub seed_camelcase_file { } } +sub is_maintained_obsolete { + my ($filename) = @_; + + return 0 if (!(-e "$root/scripts/get_maintainer.pl")); + + my $status = `perl $root/scripts/get_maintainer.pl --status --nom --nol --nogit --nogit-fallback $filename 2>&1`; + + return $status =~ /obsolete/i; +} + my $camelcase_seeded = 0; sub seed_camelcase_includes { return if ($camelcase_seeded); @@ -2289,6 +2299,10 @@ sub process { } if ($found_file) { + if (is_maintained_obsolete($realfile)) { + WARN("OBSOLETE", + "$realfile is marked as 'obsolete' in the MAINTAINERS hierarchy. No unnecessary modifications please.\n"); + } if ($realfile =~ m@^(?:drivers/net/|net/|drivers/staging/)@) { $check = 1; } else { -- cgit v0.10.2 From f90774e1fd2700de4a6e0d62866d34a26c544bd0 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Oct 2016 13:51:47 -0700 Subject: checkpatch: look for symbolic permissions and suggest octal instead S_ uses should be avoided where octal is more intelligible. Linus didst say: : It's *much* easier to parse and understand the octal numbers, while the : symbolic macro names are just random line noise and hard as hell to : understand. You really have to think about it. : : So we should rather go the other way: convert existing bad symbolic : permission bit macro use to just use the octal numbers. : : The symbolic names are good for the *other* bits (ie sticky bit, and the : inode mode _type_ numbers etc), but for the permission bits, the symbolic : names are just insane crap. Nobody sane should ever use them. Not in the : kernel, not in user space. (http://lkml.kernel.org/r/CA+55aFw5v23T-zvDZp-MmD_EYxF8WbafwwB59934FV7g21uMGQ@mail.gmail.com) Link: http://lkml.kernel.org/r/7232ef011d05a92f4caa86a5e9830d87966a2eaf.1470180926.git.joe@perches.com Signed-off-by: Joe Perches Cc: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 34b4455..1c82b01 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -541,6 +541,32 @@ our $mode_perms_world_writable = qr{ 0[0-7][0-7][2367] }x; +our %mode_permission_string_types = ( + "S_IRWXU" => 0700, + "S_IRUSR" => 0400, + "S_IWUSR" => 0200, + "S_IXUSR" => 0100, + "S_IRWXG" => 0070, + "S_IRGRP" => 0040, + "S_IWGRP" => 0020, + "S_IXGRP" => 0010, + "S_IRWXO" => 0007, + "S_IROTH" => 0004, + "S_IWOTH" => 0002, + "S_IXOTH" => 0001, + "S_IRWXUGO" => 0777, + "S_IRUGO" => 0444, + "S_IWUGO" => 0222, + "S_IXUGO" => 0111, +); + +#Create a search pattern for all these strings to speed up a loop below +our $mode_perms_string_search = ""; +foreach my $entry (keys %mode_permission_string_types) { + $mode_perms_string_search .= '|' if ($mode_perms_string_search ne ""); + $mode_perms_string_search .= $entry; +} + our $allowed_asm_includes = qr{(?x: irq| memory| @@ -6003,20 +6029,31 @@ sub process { $arg_pos--; $skip_args = "(?:\\s*$FuncArg\\s*,\\s*){$arg_pos,$arg_pos}"; } - my $test = "\\b$func\\s*\\(${skip_args}([\\d]+)\\s*[,\\)]"; + my $test = "\\b$func\\s*\\(${skip_args}($FuncArg(?:\\|\\s*$FuncArg)*)\\s*[,\\)]"; if ($line =~ /$test/) { my $val = $1; $val = $6 if ($skip_args ne ""); - - if ($val !~ /^0$/ && - (($val =~ /^$Int$/ && $val !~ /^$Octal$/) || - length($val) ne 4)) { + if (($val =~ /^$Int$/ && $val !~ /^$Octal$/) || + ($val =~ /^$Octal$/ && length($val) ne 4)) { ERROR("NON_OCTAL_PERMISSIONS", "Use 4 digit octal (0777) not decimal permissions\n" . $herecurr); - } elsif ($val =~ /^$Octal$/ && (oct($val) & 02)) { + } + if ($val =~ /^$Octal$/ && (oct($val) & 02)) { ERROR("EXPORTED_WORLD_WRITABLE", "Exporting writable files is usually an error. Consider more restrictive permissions.\n" . $herecurr); } + if ($val =~ /\b$mode_perms_string_search\b/) { + my $to = 0; + while ($val =~ /\b($mode_perms_string_search)\b(?:\s*\|\s*)?\s*/g) { + $to |= $mode_permission_string_types{$1}; + } + my $new = sprintf("%04o", $to); + if (WARN("SYMBOLIC_PERMS", + "Symbolic permissions are not preferred. Consider using octal permissions $new.\n" . $herecurr) && + $fix) { + $fixed[$fixlinenr] =~ s/\Q$val\E/$new/; + } + } } } } -- cgit v0.10.2 From 08eb9b8016b18f53fa8900f3d1c9359f9edeb724 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Oct 2016 13:51:50 -0700 Subject: checkpatch: test multiple line block comment alignment Warn when block comments are not aligned on the * /* * block comment, no warning */ /* * block comment, emit warning */ Link: http://lkml.kernel.org/r/edb57bd330adfe024b95ec2a807d4aa7f0c8b112.1472261299.git.joe@perches.com Signed-off-by: Joe Perches Reported-by: Sudip Mukherjee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 1c82b01..894f8c6 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2979,6 +2979,25 @@ sub process { "Block comments use a trailing */ on a separate line\n" . $herecurr); } +# Block comment * alignment + if ($prevline =~ /$;[ \t]*$/ && #ends in comment + (($prevrawline =~ /^\+.*?\/\*/ && #starting /* + $prevrawline !~ /\*\/[ \t]*$/) || #no trailing */ + $prevrawline =~ /^\+[ \t]*\*/) && #starting * + $rawline =~ /^\+[ \t]*\*/) { #rawline * + $prevrawline =~ m@^\+([ \t]*/?)\*@; + my $oldindent = expand_tabs($1); + $rawline =~ m@^\+([ \t]*)\*@; + my $newindent = $1; + my $test_comment = '^\\+' . "$;" x (length($newindent) + 1); + $newindent = expand_tabs($newindent); + if ($line =~ /$test_comment/ && + length($oldindent) ne length($newindent)) { + WARN("BLOCK_COMMENT_STYLE", + "Block comments should align the * on each line\n" . $hereprev); + } + } + # check for missing blank lines after struct/union declarations # with exceptions for various attributes and macros if ($prevline =~ /^[\+ ]};?\s*$/ && -- cgit v0.10.2 From f333195d41e15e7b105ca270c7662204a1a9d0f5 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Oct 2016 13:51:53 -0700 Subject: checkpatch: don't test for prefer ether_addr_ < sigh > Comment these tests out. These are just too enticing to people that don't verify that both source and dest addresses really must be __aligned(2). It helps make Dan Carpenter happy too. Link: http://lkml.kernel.org/r/dc32ec66d24647f4cdf824c8dfbbc59aa7ce7b7d.1472665676.git.joe@perches.com Signed-off-by: Joe Perches Cc: Dan Carpenter Cc: Greg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 894f8c6..37d8b91 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5554,46 +5554,46 @@ sub process { } # Check for memcpy(foo, bar, ETH_ALEN) that could be ether_addr_copy(foo, bar) - if ($^V && $^V ge 5.10.0 && - defined $stat && - $stat =~ /^\+(?:.*?)\bmemcpy\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/) { - if (WARN("PREFER_ETHER_ADDR_COPY", - "Prefer ether_addr_copy() over memcpy() if the Ethernet addresses are __aligned(2)\n" . "$here\n$stat\n") && - $fix) { - $fixed[$fixlinenr] =~ s/\bmemcpy\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/ether_addr_copy($2, $7)/; - } - } +# if ($^V && $^V ge 5.10.0 && +# defined $stat && +# $stat =~ /^\+(?:.*?)\bmemcpy\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/) { +# if (WARN("PREFER_ETHER_ADDR_COPY", +# "Prefer ether_addr_copy() over memcpy() if the Ethernet addresses are __aligned(2)\n" . "$here\n$stat\n") && +# $fix) { +# $fixed[$fixlinenr] =~ s/\bmemcpy\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/ether_addr_copy($2, $7)/; +# } +# } # Check for memcmp(foo, bar, ETH_ALEN) that could be ether_addr_equal*(foo, bar) - if ($^V && $^V ge 5.10.0 && - defined $stat && - $stat =~ /^\+(?:.*?)\bmemcmp\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/) { - WARN("PREFER_ETHER_ADDR_EQUAL", - "Prefer ether_addr_equal() or ether_addr_equal_unaligned() over memcmp()\n" . "$here\n$stat\n") - } +# if ($^V && $^V ge 5.10.0 && +# defined $stat && +# $stat =~ /^\+(?:.*?)\bmemcmp\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/) { +# WARN("PREFER_ETHER_ADDR_EQUAL", +# "Prefer ether_addr_equal() or ether_addr_equal_unaligned() over memcmp()\n" . "$here\n$stat\n") +# } # check for memset(foo, 0x0, ETH_ALEN) that could be eth_zero_addr # check for memset(foo, 0xFF, ETH_ALEN) that could be eth_broadcast_addr - if ($^V && $^V ge 5.10.0 && - defined $stat && - $stat =~ /^\+(?:.*?)\bmemset\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/) { - - my $ms_val = $7; - - if ($ms_val =~ /^(?:0x|)0+$/i) { - if (WARN("PREFER_ETH_ZERO_ADDR", - "Prefer eth_zero_addr over memset()\n" . "$here\n$stat\n") && - $fix) { - $fixed[$fixlinenr] =~ s/\bmemset\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*,\s*ETH_ALEN\s*\)/eth_zero_addr($2)/; - } - } elsif ($ms_val =~ /^(?:0xff|255)$/i) { - if (WARN("PREFER_ETH_BROADCAST_ADDR", - "Prefer eth_broadcast_addr() over memset()\n" . "$here\n$stat\n") && - $fix) { - $fixed[$fixlinenr] =~ s/\bmemset\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*,\s*ETH_ALEN\s*\)/eth_broadcast_addr($2)/; - } - } - } +# if ($^V && $^V ge 5.10.0 && +# defined $stat && +# $stat =~ /^\+(?:.*?)\bmemset\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/) { +# +# my $ms_val = $7; +# +# if ($ms_val =~ /^(?:0x|)0+$/i) { +# if (WARN("PREFER_ETH_ZERO_ADDR", +# "Prefer eth_zero_addr over memset()\n" . "$here\n$stat\n") && +# $fix) { +# $fixed[$fixlinenr] =~ s/\bmemset\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*,\s*ETH_ALEN\s*\)/eth_zero_addr($2)/; +# } +# } elsif ($ms_val =~ /^(?:0xff|255)$/i) { +# if (WARN("PREFER_ETH_BROADCAST_ADDR", +# "Prefer eth_broadcast_addr() over memset()\n" . "$here\n$stat\n") && +# $fix) { +# $fixed[$fixlinenr] =~ s/\bmemset\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*,\s*ETH_ALEN\s*\)/eth_broadcast_addr($2)/; +# } +# } +# } # typecasts on min/max could be min_t/max_t if ($^V && $^V ge 5.10.0 && -- cgit v0.10.2 From bf1fa1dae68e1b58b7b7fd61bde654d37da1de57 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Oct 2016 13:51:56 -0700 Subject: checkpatch: externalize the structs that should be const Make it easier to add new structs that should be const. Link: http://lkml.kernel.org/r/e5a8da43e7c11525bafbda1ca69a8323614dd942.1472664220.git.joe@perches.com Signed-off-by: Joe Perches Cc: Julia Lawall Cc: Kees Cook Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 37d8b91..4ecb66c 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -54,6 +54,7 @@ my $min_conf_desc_length = 4; my $spelling_file = "$D/spelling.txt"; my $codespell = 0; my $codespellfile = "/usr/share/codespell/dictionary.txt"; +my $conststructsfile = "$D/const_structs.checkpatch"; my $color = 1; my $allow_c99_comments = 1; @@ -624,6 +625,29 @@ if ($codespell) { $misspellings = join("|", sort keys %spelling_fix) if keys %spelling_fix; +my $const_structs = ""; +if (open(my $conststructs, '<', $conststructsfile)) { + while (<$conststructs>) { + my $line = $_; + + $line =~ s/\s*\n?$//g; + $line =~ s/^\s*//g; + + next if ($line =~ m/^\s*#/); + next if ($line =~ m/^\s*$/); + if ($line =~ /\s/) { + print("$conststructsfile: '$line' invalid - ignored\n"); + next; + } + + $const_structs .= '|' if ($const_structs ne ""); + $const_structs .= $line; + } + close($conststructsfile); +} else { + warn "No structs that should be const will be found - file '$conststructsfile': $!\n"; +} + sub build_types { my $mods = "(?x: \n" . join("|\n ", (@modifierList, @modifierListFile)) . "\n)"; my $all = "(?x: \n" . join("|\n ", (@typeList, @typeListFile)) . "\n)"; @@ -5912,46 +5936,6 @@ sub process { } # check for various structs that are normally const (ops, kgdb, device_tree) - my $const_structs = qr{ - acpi_dock_ops| - address_space_operations| - backlight_ops| - block_device_operations| - dentry_operations| - dev_pm_ops| - dma_map_ops| - extent_io_ops| - file_lock_operations| - file_operations| - hv_ops| - ide_dma_ops| - intel_dvo_dev_ops| - item_operations| - iwl_ops| - kgdb_arch| - kgdb_io| - kset_uevent_ops| - lock_manager_operations| - microcode_ops| - mtrr_ops| - neigh_ops| - nlmsvc_binding| - of_device_id| - pci_raw_ops| - pipe_buf_operations| - platform_hibernation_ops| - platform_suspend_ops| - proto_ops| - rpc_pipe_ops| - seq_operations| - snd_ac97_build_ops| - soc_pcmcia_socket_ops| - stacktrace_ops| - sysfs_ops| - tty_operations| - uart_ops| - usb_mon_operations| - wd_ops}x; if ($line !~ /\bconst\b/ && $line =~ /\bstruct\s+($const_structs)\b/) { WARN("CONST_STRUCT", diff --git a/scripts/const_structs.checkpatch b/scripts/const_structs.checkpatch new file mode 100644 index 0000000..1b54425 --- /dev/null +++ b/scripts/const_structs.checkpatch @@ -0,0 +1,39 @@ +acpi_dock_ops +address_space_operations +backlight_ops +block_device_operations +dentry_operations +dev_pm_ops +dma_map_ops +extent_io_ops +file_lock_operations +file_operations +hv_ops +ide_dma_ops +intel_dvo_dev_ops +item_operations +iwl_ops +kgdb_arch +kgdb_io +kset_uevent_ops +lock_manager_operations +microcode_ops +mtrr_ops +neigh_ops +nlmsvc_binding +of_device_id +pci_raw_ops +pipe_buf_operations +platform_hibernation_ops +platform_suspend_ops +proto_ops +rpc_pipe_ops +seq_operations +snd_ac97_build_ops +soc_pcmcia_socket_ops +stacktrace_ops +sysfs_ops +tty_operations +uart_ops +usb_mon_operations +wd_ops -- cgit v0.10.2 From 15c03cfeabff447abec9e05b5830922099b89943 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Oct 2016 13:51:59 -0700 Subject: const_structs.checkpatch: add frequently used from Julia Lawall's list Using const is generally a good idea. Julia Lawall has created a list of always const and almost always const structs in the kernel sources. Link: https://lkml.org/lkml/2016/8/28/95 Add the most frequently used (> 50 cases) that are almost always or always const. Link: http://lkml.kernel.org/r/1e16020f8027654db0095bbfbcc11da51025365c.1472664220.git.joe@perches.com Signed-off-by: Joe Perches Acked-by: Kees Cook Cc: Julia Lawall Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/const_structs.checkpatch b/scripts/const_structs.checkpatch index 1b54425..ac5f1267 100644 --- a/scripts/const_structs.checkpatch +++ b/scripts/const_structs.checkpatch @@ -2,38 +2,63 @@ acpi_dock_ops address_space_operations backlight_ops block_device_operations +clk_ops +comedi_lrange +component_ops dentry_operations dev_pm_ops dma_map_ops +driver_info +drm_connector_funcs +drm_encoder_funcs +drm_encoder_helper_funcs +ethtool_ops extent_io_ops file_lock_operations file_operations hv_ops ide_dma_ops +ide_port_ops +inode_operations intel_dvo_dev_ops +irq_domain_ops item_operations +iwl_cfg iwl_ops kgdb_arch kgdb_io kset_uevent_ops lock_manager_operations +machine_desc microcode_ops +mlxsw_reg_info mtrr_ops neigh_ops +net_device_ops nlmsvc_binding +nvkm_device_chip of_device_id pci_raw_ops pipe_buf_operations platform_hibernation_ops platform_suspend_ops proto_ops +regmap_access_table rpc_pipe_ops +rtc_class_ops +sd_desc seq_operations +sirfsoc_padmux snd_ac97_build_ops +snd_soc_component_driver soc_pcmcia_socket_ops stacktrace_ops sysfs_ops tty_operations uart_ops usb_mon_operations +v4l2_ctrl_ops +v4l2_ioctl_ops +vm_operations_struct +wacom_features wd_ops -- cgit v0.10.2 From 0616efa45af493a259d25c1716102173f10bd789 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Oct 2016 13:52:02 -0700 Subject: checkpatch: speed up checking for filenames in sections marked obsolete Adding -f to the get_maintainer.pl invocation means git isn't invoked by get_maintainer.pl for known filenames. This reduces the overall time to run checkpatch. Link: http://lkml.kernel.org/r/22991e3a295aeb399b43af0478b6e5809106ccee.1472684066.git.joe@perches.com Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 4ecb66c..e44dc0c 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -759,7 +759,7 @@ sub is_maintained_obsolete { return 0 if (!(-e "$root/scripts/get_maintainer.pl")); - my $status = `perl $root/scripts/get_maintainer.pl --status --nom --nol --nogit --nogit-fallback $filename 2>&1`; + my $status = `perl $root/scripts/get_maintainer.pl --status --nom --nol --nogit --nogit-fallback -f $filename 2>&1`; return $status =~ /obsolete/i; } -- cgit v0.10.2 From af207524a49c3d70af0a9196072ab3b6d690fcf7 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Oct 2016 13:52:05 -0700 Subject: checkpatch: improve the block comment * alignment test An "uninitialized value" is emitted when a block comment starts on the same line as a statement. Fix this and make the test use a little fewer cpu cycles too. Link: http://lkml.kernel.org/r/3c9993320c2182d37f53ac540878cfef59c3f62d.1473365956.git.joe@perches.com Signed-off-by: Joe Perches Reported-by: Charlemagne Lasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index e44dc0c..3fa2b2e 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3005,18 +3005,23 @@ sub process { # Block comment * alignment if ($prevline =~ /$;[ \t]*$/ && #ends in comment - (($prevrawline =~ /^\+.*?\/\*/ && #starting /* + $line =~ /^\+[ \t]*$;/ && #leading comment + $rawline =~ /^\+[ \t]*\*/ && #leading * + (($prevrawline =~ /^\+.*?\/\*/ && #leading /* $prevrawline !~ /\*\/[ \t]*$/) || #no trailing */ - $prevrawline =~ /^\+[ \t]*\*/) && #starting * - $rawline =~ /^\+[ \t]*\*/) { #rawline * + $prevrawline =~ /^\+[ \t]*\*/)) { #leading * + my $oldindent; $prevrawline =~ m@^\+([ \t]*/?)\*@; - my $oldindent = expand_tabs($1); + if (defined($1)) { + $oldindent = expand_tabs($1); + } else { + $prevrawline =~ m@^\+(.*/?)\*@; + $oldindent = expand_tabs($1); + } $rawline =~ m@^\+([ \t]*)\*@; my $newindent = $1; - my $test_comment = '^\\+' . "$;" x (length($newindent) + 1); $newindent = expand_tabs($newindent); - if ($line =~ /$test_comment/ && - length($oldindent) ne length($newindent)) { + if (length($oldindent) ne length($newindent)) { WARN("BLOCK_COMMENT_STYLE", "Block comments should align the * on each line\n" . $hereprev); } -- cgit v0.10.2 From f59b64bffe163431b4ec61f3b00c1aeb846948f3 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Oct 2016 13:52:08 -0700 Subject: checkpatch: add --strict test for macro argument reuse If a macro argument is used multiple times in the macro definition, the macro argument may have an unexpected side-effect. Add a test (MACRO_ARG_REUSE) for that condition which is only emitted with command-line option --strict. Link: http://lkml.kernel.org/r/b6d67a87cafcafd15499e91780dc63b15dec0aa0.1473744906.git.joe@perches.com Signed-off-by: Joe Perches Cc: Andy Whitcroft Cc: Julia Lawall Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 3fa2b2e..f135f8e 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -4753,7 +4753,17 @@ sub process { $has_flow_statement = 1 if ($ctx =~ /\b(goto|return)\b/); $has_arg_concat = 1 if ($ctx =~ /\#\#/ && $ctx !~ /\#\#\s*(?:__VA_ARGS__|args)\b/); - $dstat =~ s/^.\s*\#\s*define\s+$Ident(?:\([^\)]*\))?\s*//; + $dstat =~ s/^.\s*\#\s*define\s+$Ident(\([^\)]*\))?\s*//; + my $define_args = $1; + my $define_stmt = $dstat; + my @def_args = (); + + if (defined $define_args && $define_args ne "") { + $define_args = substr($define_args, 1, length($define_args) - 2); + $define_args =~ s/\s*//g; + @def_args = split(",", $define_args); + } + $dstat =~ s/$;//g; $dstat =~ s/\\\n.//g; $dstat =~ s/^\s*//s; @@ -4789,6 +4799,15 @@ sub process { ^\[ }x; #print "REST<$rest> dstat<$dstat> ctx<$ctx>\n"; + + $ctx =~ s/\n*$//; + my $herectx = $here . "\n"; + my $stmt_cnt = statement_rawlines($ctx); + + for (my $n = 0; $n < $stmt_cnt; $n++) { + $herectx .= raw_line($linenr, $n) . "\n"; + } + if ($dstat ne '' && $dstat !~ /^(?:$Ident|-?$Constant),$/ && # 10, // foo(), $dstat !~ /^(?:$Ident|-?$Constant);$/ && # foo(); @@ -4804,13 +4823,6 @@ sub process { $dstat !~ /^\(\{/ && # ({... $ctx !~ /^.\s*#\s*define\s+TRACE_(?:SYSTEM|INCLUDE_FILE|INCLUDE_PATH)\b/) { - $ctx =~ s/\n*$//; - my $herectx = $here . "\n"; - my $cnt = statement_rawlines($ctx); - - for (my $n = 0; $n < $cnt; $n++) { - $herectx .= raw_line($linenr, $n) . "\n"; - } if ($dstat =~ /;/) { ERROR("MULTISTATEMENT_MACRO_USE_DO_WHILE", @@ -4819,6 +4831,21 @@ sub process { ERROR("COMPLEX_MACRO", "Macros with complex values should be enclosed in parentheses\n" . "$herectx"); } + + } +# check if any macro arguments are reused (ignore '...' and 'type') + foreach my $arg (@def_args) { + next if ($arg =~ /\.\.\./); + next if ($arg =~ /^type$/); + my $tmp = $define_stmt; + $tmp =~ s/\b(typeof|__typeof__|__builtin\w+|typecheck\s*\(\s*$Type\s*,|\#+)\s*\(*\s*$arg\s*\)*\b//g; + $tmp =~ s/\#\s*$arg\b//g; + $tmp =~ s/\b$arg\s*\#\#//g; + my $use_cnt = $tmp =~ s/\b$arg\b//g; + if ($use_cnt > 1) { + CHK("MACRO_ARG_REUSE", + "Macro argument reuse '$arg' - possible side-effects?\n" . "$herectx"); + } } # check for macros with flow control, but without ## concatenation -- cgit v0.10.2 From 9192d41a3f0b98844860f1de5b82f729bbe07c61 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Oct 2016 13:52:11 -0700 Subject: checkpatch: add --strict test for precedence challenged macro arguments Add a test for macro arguents that have a non-comma leading or trailing operator where the argument isn't parenthesized to avoid possible precedence issues. Link: http://lkml.kernel.org/r/47715508972f8d786f435e583ff881dbeee3a114.1473745855.git.joe@perches.com Signed-off-by: Joe Perches Cc: Andy Whitcroft Cc: Julia Lawall Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index f135f8e..ea1a7ad 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -4836,7 +4836,7 @@ sub process { # check if any macro arguments are reused (ignore '...' and 'type') foreach my $arg (@def_args) { next if ($arg =~ /\.\.\./); - next if ($arg =~ /^type$/); + next if ($arg =~ /^type$/i); my $tmp = $define_stmt; $tmp =~ s/\b(typeof|__typeof__|__builtin\w+|typecheck\s*\(\s*$Type\s*,|\#+)\s*\(*\s*$arg\s*\)*\b//g; $tmp =~ s/\#\s*$arg\b//g; @@ -4845,6 +4845,13 @@ sub process { if ($use_cnt > 1) { CHK("MACRO_ARG_REUSE", "Macro argument reuse '$arg' - possible side-effects?\n" . "$herectx"); + } +# check if any macro arguments may have other precedence issues + if ($define_stmt =~ m/($Operators)?\s*\b$arg\b\s*($Operators)?/m && + ((defined($1) && $1 ne ',') || + (defined($2) && $2 ne ','))) { + CHK("MACRO_ARG_PRECEDENCE", + "Macro argument '$arg' may be better as '($arg)' to avoid precedence issues\n" . "$herectx"); } } -- cgit v0.10.2 From 5207649b7b1d2b1e1f8c97df91debfaa8de37669 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Oct 2016 13:52:14 -0700 Subject: checkpatch: improve MACRO_ARG_PRECEDENCE test It is possible for a multiple line macro definition to have a false positive report when an argument is used on a line after a continuation \. This line might have a leading '+' as the initial character that could be confused by checkpatch as an operator. Avoid the leading character on multiple line macro definitions. Link: http://lkml.kernel.org/r/60229d13399f9b6509db5a32e30d4c16951a60cd.1473836073.git.joe@perches.com Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index ea1a7ad..0ef3d83 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -4833,13 +4833,31 @@ sub process { } } + + # Make $define_stmt single line, comment-free, etc + my @stmt_array = split('\n', $define_stmt); + my $first = 1; + $define_stmt = ""; + foreach my $l (@stmt_array) { + $l =~ s/\\$//; + if ($first) { + $define_stmt = $l; + $first = 0; + } elsif ($l =~ /^[\+ ]/) { + $define_stmt .= substr($l, 1); + } + } + $define_stmt =~ s/$;//g; + $define_stmt =~ s/\s+/ /g; + $define_stmt = trim($define_stmt); + # check if any macro arguments are reused (ignore '...' and 'type') foreach my $arg (@def_args) { next if ($arg =~ /\.\.\./); next if ($arg =~ /^type$/i); my $tmp = $define_stmt; $tmp =~ s/\b(typeof|__typeof__|__builtin\w+|typecheck\s*\(\s*$Type\s*,|\#+)\s*\(*\s*$arg\s*\)*\b//g; - $tmp =~ s/\#\s*$arg\b//g; + $tmp =~ s/\#+\s*$arg\b//g; $tmp =~ s/\b$arg\s*\#\#//g; my $use_cnt = $tmp =~ s/\b$arg\b//g; if ($use_cnt > 1) { -- cgit v0.10.2 From ca0d8929e75ab1f860f61208d46955f280a1b05e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Oct 2016 13:52:16 -0700 Subject: checkpatch: add warning for unnamed function definition arguments Function definitions without identifiers like int foo(int) are not preferred. Emit a warning when they occur. Link: http://lkml.kernel.org/r/94fe6378504745991b650f48fc92bb4648f25706.1474925354.git.joe@perches.com Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 0ef3d83..3373c65 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5794,6 +5794,19 @@ sub process { "externs should be avoided in .c files\n" . $herecurr); } + if ($realfile =~ /\.[ch]$/ && defined $stat && + $stat =~ /^.\s*(?:extern\s+)?$Type\s*$Ident\s*\(\s*([^{]+)\s*\)\s*;/s && + $1 ne "void") { + my $args = trim($1); + while ($args =~ m/\s*($Type\s*(?:$Ident|\(\s*\*\s*$Ident?\s*\)\s*$balanced_parens)?)/g) { + my $arg = trim($1); + if ($arg =~ /^$Type$/ && $arg !~ /enum\s+$Ident$/) { + WARN("FUNCTION_ARGUMENTS", + "function definition argument '$arg' should also have an identifier name\n" . $herecurr); + } + } + } + # checks for new __setup's if ($rawline =~ /\b__setup\("([^"]*)"/) { my $name = $1; -- cgit v0.10.2 From 459cf0ae5d6600592a539bc861c84e14ed1a5bcb Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Oct 2016 13:52:19 -0700 Subject: checkpatch: improve the octal permissions tests The function calls with octal permissions commonly span multiple lines. The current test is line oriented and fails to find some matches. Make the test use the $stat variable instead of the $line variable to span multiple lines. Also add a few functions to the known functions with permissions list. Move the SYMBOLIC_PERMS test to a separate section to find all the S_ permissions in any form not just those that have specific function names. This can now find and fix permissions uses like: .mode = S_ | S_; Link: http://lkml.kernel.org/r/b51bab60530912aae4ac420119d465c5b206f19f.1475030406.git.joe@perches.com Signed-off-by: Joe Perches Tested-by: Ramiro Oliveira Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 3373c65..a8368d1 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -524,7 +524,11 @@ our @mode_permission_funcs = ( ["module_param_array_named", 5], ["debugfs_create_(?:file|u8|u16|u32|u64|x8|x16|x32|x64|size_t|atomic_t|bool|blob|regset32|u32_array)", 2], ["proc_create(?:_data|)", 2], - ["(?:CLASS|DEVICE|SENSOR)_ATTR", 2], + ["(?:CLASS|DEVICE|SENSOR|SENSOR_DEVICE|IIO_DEVICE)_ATTR", 2], + ["IIO_DEV_ATTR_[A-Z_]+", 1], + ["SENSOR_(?:DEVICE_|)ATTR_2", 2], + ["SENSOR_TEMPLATE(?:_2|)", 3], + ["__ATTR", 2], ); #Create a search pattern for all these functions to speed up a loop below @@ -6092,45 +6096,69 @@ sub process { # Mode permission misuses where it seems decimal should be octal # This uses a shortcut match to avoid unnecessary uses of a slow foreach loop if ($^V && $^V ge 5.10.0 && + defined $stat && $line =~ /$mode_perms_search/) { foreach my $entry (@mode_permission_funcs) { my $func = $entry->[0]; my $arg_pos = $entry->[1]; + my $lc = $stat =~ tr@\n@@; + $lc = $lc + $linenr; + my $stat_real = raw_line($linenr, 0); + for (my $count = $linenr + 1; $count <= $lc; $count++) { + $stat_real = $stat_real . "\n" . raw_line($count, 0); + } + my $skip_args = ""; if ($arg_pos > 1) { $arg_pos--; $skip_args = "(?:\\s*$FuncArg\\s*,\\s*){$arg_pos,$arg_pos}"; } my $test = "\\b$func\\s*\\(${skip_args}($FuncArg(?:\\|\\s*$FuncArg)*)\\s*[,\\)]"; - if ($line =~ /$test/) { + if ($stat =~ /$test/) { my $val = $1; $val = $6 if ($skip_args ne ""); if (($val =~ /^$Int$/ && $val !~ /^$Octal$/) || ($val =~ /^$Octal$/ && length($val) ne 4)) { ERROR("NON_OCTAL_PERMISSIONS", - "Use 4 digit octal (0777) not decimal permissions\n" . $herecurr); + "Use 4 digit octal (0777) not decimal permissions\n" . "$here\n" . $stat_real); } if ($val =~ /^$Octal$/ && (oct($val) & 02)) { ERROR("EXPORTED_WORLD_WRITABLE", - "Exporting writable files is usually an error. Consider more restrictive permissions.\n" . $herecurr); - } - if ($val =~ /\b$mode_perms_string_search\b/) { - my $to = 0; - while ($val =~ /\b($mode_perms_string_search)\b(?:\s*\|\s*)?\s*/g) { - $to |= $mode_permission_string_types{$1}; - } - my $new = sprintf("%04o", $to); - if (WARN("SYMBOLIC_PERMS", - "Symbolic permissions are not preferred. Consider using octal permissions $new.\n" . $herecurr) && - $fix) { - $fixed[$fixlinenr] =~ s/\Q$val\E/$new/; - } + "Exporting writable files is usually an error. Consider more restrictive permissions.\n" . "$here\n" . $stat_real); } } } } +# check for uses of S_ that could be octal for readability + if ($line =~ /\b$mode_perms_string_search\b/) { + my $val = ""; + my $oval = ""; + my $to = 0; + my $curpos = 0; + my $lastpos = 0; + while ($line =~ /\b(($mode_perms_string_search)\b(?:\s*\|\s*)?\s*)/g) { + $curpos = pos($line); + my $match = $2; + my $omatch = $1; + last if ($lastpos > 0 && ($curpos - length($omatch) != $lastpos)); + $lastpos = $curpos; + $to |= $mode_permission_string_types{$match}; + $val .= '\s*\|\s*' if ($val ne ""); + $val .= $match; + $oval .= $omatch; + } + $oval =~ s/^\s*\|\s*//; + $oval =~ s/\s*\|\s*$//; + my $octal = sprintf("%04o", $to); + if (WARN("SYMBOLIC_PERMS", + "Symbolic permissions '$oval' are not preferred. Consider using octal permissions '$octal'.\n" . $herecurr) && + $fix) { + $fixed[$fixlinenr] =~ s/$val/$octal/; + } + } + # validate content of MODULE_LICENSE against list from include/linux/module.h if ($line =~ /\bMODULE_LICENSE\s*\(\s*($String)\s*\)/) { my $extracted_string = get_quoted_string($line, $rawline); -- cgit v0.10.2 From bfd45be0b83e8f711f3abc892850d6047972d127 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 11 Oct 2016 13:52:22 -0700 Subject: kprobes: include instead of asm-generic headers are generic implementations for architecture specific code and should not be included by common code. Thus use the asm/ version of sections.h to get at the linker sections. Link: http://lkml.kernel.org/r/1473602302-6208-1-git-send-email-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Masami Hiramatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d10ab6b..d630954 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -49,7 +49,7 @@ #include #include -#include +#include #include #include #include -- cgit v0.10.2 From e662145f5c1276d35e8955b3df7a68da306ee498 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Tue, 11 Oct 2016 13:52:25 -0700 Subject: autofs: fix typos in Documentation/filesystems/autofs4.txt plus minor whitespace fixes. Link: http://lkml.kernel.org/r/20160812024734.12352.17122.stgit@pluto.themaw.net Signed-off-by: Tomohiro Kusumi Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/filesystems/autofs4.txt b/Documentation/filesystems/autofs4.txt index 39d02e1..8fac3fe 100644 --- a/Documentation/filesystems/autofs4.txt +++ b/Documentation/filesystems/autofs4.txt @@ -203,9 +203,9 @@ initiated or is being considered, otherwise it returns 0. Mountpoint expiry ----------------- -The VFS has a mechansim for automatically expiring unused mounts, +The VFS has a mechanism for automatically expiring unused mounts, much as it can expire any unused dentry information from the dcache. -This is guided by the MNT_SHRINKABLE flag. This only applies to +This is guided by the MNT_SHRINKABLE flag. This only applies to mounts that were created by `d_automount()` returning a filesystem to be mounted. As autofs doesn't return such a filesystem but leaves the mounting to the automount daemon, it must involve the automount daemon @@ -298,7 +298,7 @@ remove directories and symlinks using normal filesystem operations. autofs knows whether a process requesting some operation is the daemon or not based on its process-group id number (see getpgid(1)). -When an autofs filesystem it mounted the pgid of the mounting +When an autofs filesystem is mounted the pgid of the mounting processes is recorded unless the "pgrp=" option is given, in which case that number is recorded instead. Any request arriving from a process in that process group is considered to come from the daemon. @@ -450,7 +450,7 @@ Commands are: numbers for existing filesystems can be found in `/proc/self/mountinfo`. - **AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD**: same as `close(ioctlfd)`. -- **AUTOFS_DEV_IOCTL_SETPIPEFD_CMD**: if the filesystem is in +- **AUTOFS_DEV_IOCTL_SETPIPEFD_CMD**: if the filesystem is in catatonic mode, this can provide the write end of a new pipe in `arg1` to re-establish communication with a daemon. The process group of the calling process is used to identify the -- cgit v0.10.2 From 4a44c1859ffd9926ba4d47d1b538d119065ec2f5 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Tue, 11 Oct 2016 13:52:28 -0700 Subject: autofs: drop unnecessary extern in autofs_i.h autofs4_kill_sb() doesn't need to be declared as extern, and no other functions in .h are explicitly declared as extern. Link: http://lkml.kernel.org/r/20160812024739.12352.99354.stgit@pluto.themaw.net Signed-off-by: Tomohiro Kusumi Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index a439548..eb87055 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -271,4 +271,4 @@ static inline void autofs4_del_expiring(struct dentry *dentry) } } -extern void autofs4_kill_sb(struct super_block *); +void autofs4_kill_sb(struct super_block *); -- cgit v0.10.2 From 749800ef537848b9569da9bb1463f1de4b96c0a4 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Tue, 11 Oct 2016 13:52:31 -0700 Subject: autofs: test autofs versions first on sb initialization This patch does what the below comment says. It could be and it's considered better to do this first before various functions get called during initialization. /* Couldn't this be tested earlier? */ Link: http://lkml.kernel.org/r/20160812024744.12352.43075.stgit@pluto.themaw.net Signed-off-by: Tomohiro Kusumi Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index ca9cbd6..d76573f 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -274,6 +274,23 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) goto fail_dput; } + /* Test versions first */ + if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || + sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) { + pr_err("kernel does not match daemon version " + "daemon (%d, %d) kernel (%d, %d)\n", + sbi->min_proto, sbi->max_proto, + AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION); + goto fail_dput; + } + + /* Establish highest kernel protocol version */ + if (sbi->max_proto > AUTOFS_MAX_PROTO_VERSION) + sbi->version = AUTOFS_MAX_PROTO_VERSION; + else + sbi->version = sbi->max_proto; + sbi->sub_version = AUTOFS_PROTO_SUBVERSION; + if (pgrp_set) { sbi->oz_pgrp = find_get_pid(pgrp); if (!sbi->oz_pgrp) { @@ -291,23 +308,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) root_inode->i_fop = &autofs4_root_operations; root_inode->i_op = &autofs4_dir_inode_operations; - /* Couldn't this be tested earlier? */ - if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || - sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) { - pr_err("kernel does not match daemon version " - "daemon (%d, %d) kernel (%d, %d)\n", - sbi->min_proto, sbi->max_proto, - AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION); - goto fail_dput; - } - - /* Establish highest kernel protocol version */ - if (sbi->max_proto > AUTOFS_MAX_PROTO_VERSION) - sbi->version = AUTOFS_MAX_PROTO_VERSION; - else - sbi->version = sbi->max_proto; - sbi->sub_version = AUTOFS_PROTO_SUBVERSION; - pr_debug("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp)); pipe = fget(pipefd); -- cgit v0.10.2 From 1973a122697168c0585ffce9d4bd46745ddaa6f6 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 11 Oct 2016 13:52:33 -0700 Subject: autofs: fix autofs4_fill_super() error exit handling Somewhere along the line the error handling gotos have become incorrect. Link: http://lkml.kernel.org/r/20160812024749.12352.15100.stgit@pluto.themaw.net Signed-off-by: Ian Kent Cc: Tomohiro Kusumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index d76573f..3f486c1 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -313,7 +313,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) if (!pipe) { pr_err("could not open pipe file descriptor\n"); - goto fail_dput; + goto fail_put_pid; } ret = autofs_prepare_pipe(pipe); if (ret < 0) @@ -334,14 +334,14 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) fail_fput: pr_err("pipe file descriptor does not contain proper ops\n"); fput(pipe); - /* fall through */ +fail_put_pid: + put_pid(sbi->oz_pgrp); fail_dput: dput(root); goto fail_free; fail_ino: kfree(ino); fail_free: - put_pid(sbi->oz_pgrp); kfree(sbi); s->s_fs_info = NULL; return ret; -- cgit v0.10.2 From 97537b35b662855e19f2812d3d716bde6f237cfa Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Tue, 11 Oct 2016 13:52:36 -0700 Subject: autofs: add WARN_ON(1) for non dir/link inode case It's invalid if the given mode is neither dir nor link, so warn on else case. Link: http://lkml.kernel.org/r/20160812024754.12352.8536.stgit@pluto.themaw.net Signed-off-by: Tomohiro Kusumi Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 3f486c1..a61521e 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -368,7 +368,8 @@ struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode) inode->i_fop = &autofs4_dir_operations; } else if (S_ISLNK(mode)) { inode->i_op = &autofs4_symlink_inode_operations; - } + } else + WARN_ON(1); return inode; } -- cgit v0.10.2 From 1574fa7beb9f71a60b5b8f2532ca69d3153ded79 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 11 Oct 2016 13:52:39 -0700 Subject: autofs: remove ino free in autofs4_dir_symlink() The inode allocation failure case in autofs4_dir_symlink() frees the autofs dentry info of the dentry without setting ->d_fsdata to NULL. That could lead to a double free so just get rid of the free and leave it to ->d_release(). Link: http://lkml.kernel.org/r/20160812024759.12352.10653.stgit@pluto.themaw.net Signed-off-by: Ian Kent Cc: Tomohiro Kusumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 623510e..2eebeae 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -577,8 +577,6 @@ static int autofs4_dir_symlink(struct inode *dir, inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555); if (!inode) { kfree(cp); - if (!dentry->d_fsdata) - kfree(ino); return -ENOMEM; } inode->i_private = cp; -- cgit v0.10.2 From ca552599bfb9888ba9371eb4a9659a02b86529e6 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Tue, 11 Oct 2016 13:52:42 -0700 Subject: autofs: use autofs4_free_ino() to kfree dentry data kfree dentry data allocated by autofs4_new_ino() with autofs4_free_ino() instead of raw kfree. (since we have the interface to free autofs_info*) This patch was modified to remove the need to set the dentry info field to NULL dew to a change in the previous patch. Link: http://lkml.kernel.org/r/20160812024805.12352.43650.stgit@pluto.themaw.net Signed-off-by: Tomohiro Kusumi Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index a61521e..438b5bf 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -340,7 +340,7 @@ fail_dput: dput(root); goto fail_free; fail_ino: - kfree(ino); + autofs4_free_ino(ino); fail_free: kfree(sbi); s->s_fs_info = NULL; -- cgit v0.10.2 From eea618e6d5a819f2a739d441efccc444e57e68c2 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Tue, 11 Oct 2016 13:52:45 -0700 Subject: autofs: remove obsolete sb fields These two were left from commit aa55ddf340c9 ("autofs4: remove unused ioctls") which removed unused ioctls. Link: http://lkml.kernel.org/r/20160812024810.12352.96377.stgit@pluto.themaw.net Signed-off-by: Tomohiro Kusumi Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index eb87055..73e3d38 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -111,8 +111,6 @@ struct autofs_sb_info { int max_proto; unsigned long exp_timeout; unsigned int type; - int reghost_enabled; - int needs_reghost; struct super_block *sb; struct mutex wq_mutex; struct mutex pipe_mutex; -- cgit v0.10.2 From 41a4497a4f786a83292ac40a0a39f75c6c4de722 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Tue, 11 Oct 2016 13:52:48 -0700 Subject: autofs: don't fail to free_dev_ioctl(param) Returning -ENOTTY here fails to free dynamically allocated param. Link: http://lkml.kernel.org/r/20160812024815.12352.69153.stgit@pluto.themaw.net Signed-off-by: Tomohiro Kusumi Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index c7fcc74..d47b35a 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -662,7 +662,8 @@ static int _autofs_dev_ioctl(unsigned int command, fn = lookup_dev_ioctl(cmd); if (!fn) { pr_warn("unknown command 0x%08x\n", command); - return -ENOTTY; + err = -ENOTTY; + goto out; } fp = NULL; -- cgit v0.10.2 From 72063e01eda7e7562702bbf790380104bf704379 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Tue, 11 Oct 2016 13:52:51 -0700 Subject: autofs: remove AUTOFS_DEVID_LEN This macro was never used by neither kernel nor userspace, and also doesn't represent "devid length" in bytes. (unless it was added to mean something else). Link: http://lkml.kernel.org/r/20160812024820.12352.21210.stgit@pluto.themaw.net Signed-off-by: Tomohiro Kusumi Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/auto_dev-ioctl.h b/include/linux/auto_dev-ioctl.h index 7caaf29..bf82e3a 100644 --- a/include/linux/auto_dev-ioctl.h +++ b/include/linux/auto_dev-ioctl.h @@ -18,8 +18,6 @@ #define AUTOFS_DEV_IOCTL_VERSION_MAJOR 1 #define AUTOFS_DEV_IOCTL_VERSION_MINOR 0 -#define AUTOFS_DEVID_LEN 16 - #define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl) /* -- cgit v0.10.2 From d873284103dacbe90ace2f3de20dff02fafcfef0 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Tue, 11 Oct 2016 13:52:53 -0700 Subject: autofs: fix Documentation regarding devid on ioctl The explanation on how ioctl handles devid seems incorrect. Userspace who calls this ioctl has no input regarding devid, and ioctl implementation retrieves devid via superblock. Link: http://lkml.kernel.org/r/20160812024825.12352.13486.stgit@pluto.themaw.net Signed-off-by: Tomohiro Kusumi Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/filesystems/autofs4-mount-control.txt b/Documentation/filesystems/autofs4-mount-control.txt index aff2211..540d9a7 100644 --- a/Documentation/filesystems/autofs4-mount-control.txt +++ b/Documentation/filesystems/autofs4-mount-control.txt @@ -323,9 +323,8 @@ mount on the given path dentry. The call requires an initialized struct autofs_dev_ioctl with the path field set to the mount point in question and the size field adjusted -appropriately as well as the arg1 field set to the device number of the -containing autofs mount. Upon return the struct field arg1 contains the -uid and arg2 the gid. +appropriately. Upon return the struct field arg1 contains the uid and +arg2 the gid. When reconstructing an autofs mount tree with active mounts we need to re-connect to mounts that may have used the original process uid and -- cgit v0.10.2 From bf72eda5f9c593495127a34d3444b2ec5939e837 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Tue, 11 Oct 2016 13:52:56 -0700 Subject: autofs: update struct autofs_dev_ioctl in Documentation Sync with changes made by commit 730c9eeca980 ("autofs4: improve parameter usage") which introduced an union for various ioctl commands instead of having statically named arg1,2. This commit simply replaces arg1,2 with the corresponding fields without changing semantics. Link: http://lkml.kernel.org/r/20160812024831.12352.24667.stgit@pluto.themaw.net Signed-off-by: Tomohiro Kusumi Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/filesystems/autofs4-mount-control.txt b/Documentation/filesystems/autofs4-mount-control.txt index 540d9a7..50a3e01 100644 --- a/Documentation/filesystems/autofs4-mount-control.txt +++ b/Documentation/filesystems/autofs4-mount-control.txt @@ -179,8 +179,19 @@ struct autofs_dev_ioctl { * including this struct */ __s32 ioctlfd; /* automount command fd */ - __u32 arg1; /* Command parameters */ - __u32 arg2; + union { + struct args_protover protover; + struct args_protosubver protosubver; + struct args_openmount openmount; + struct args_ready ready; + struct args_fail fail; + struct args_setpipefd setpipefd; + struct args_timeout timeout; + struct args_requester requester; + struct args_expire expire; + struct args_askumount askumount; + struct args_ismountpoint ismountpoint; + }; char path[0]; }; @@ -192,8 +203,8 @@ optionally be used to check a specific mount corresponding to a given mount point file descriptor, and when requesting the uid and gid of the last successful mount on a directory within the autofs file system. -The fields arg1 and arg2 are used to communicate parameters and results of -calls made as described below. +The union is used to communicate parameters and results of calls made +as described below. The path field is used to pass a path where it is needed and the size field is used account for the increased structure length when translating the @@ -245,9 +256,9 @@ AUTOFS_DEV_IOCTL_PROTOVER_CMD and AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD Get the major and minor version of the autofs4 protocol version understood by loaded module. This call requires an initialized struct autofs_dev_ioctl with the ioctlfd field set to a valid autofs mount point descriptor -and sets the requested version number in structure field arg1. These -commands return 0 on success or one of the negative error codes if -validation fails. +and sets the requested version number in version field of struct args_protover +or sub_version field of struct args_protosubver. These commands return +0 on success or one of the negative error codes if validation fails. AUTOFS_DEV_IOCTL_OPENMOUNT and AUTOFS_DEV_IOCTL_CLOSEMOUNT @@ -256,9 +267,9 @@ AUTOFS_DEV_IOCTL_OPENMOUNT and AUTOFS_DEV_IOCTL_CLOSEMOUNT Obtain and release a file descriptor for an autofs managed mount point path. The open call requires an initialized struct autofs_dev_ioctl with the path field set and the size field adjusted appropriately as well -as the arg1 field set to the device number of the autofs mount. The -device number can be obtained from the mount options shown in -/proc/mounts. The close call requires an initialized struct +as the devid field of struct args_openmount set to the device number of +the autofs mount. The device number can be obtained from the mount options +shown in /proc/mounts. The close call requires an initialized struct autofs_dev_ioct with the ioctlfd field set to the descriptor obtained from the open call. The release of the file descriptor can also be done with close(2) so any open descriptors will also be closed at process exit. @@ -272,10 +283,10 @@ AUTOFS_DEV_IOCTL_READY_CMD and AUTOFS_DEV_IOCTL_FAIL_CMD Return mount and expire result status from user space to the kernel. Both of these calls require an initialized struct autofs_dev_ioctl with the ioctlfd field set to the descriptor obtained from the open -call and the arg1 field set to the wait queue token number, received -by user space in the foregoing mount or expire request. The arg2 field -is set to the status to be returned. For the ready call this is always -0 and for the fail call it is set to the errno of the operation. +call and the token field of struct args_ready or struct args_fail set +to the wait queue token number, received by user space in the foregoing +mount or expire request. The status field of struct args_fail is set to +the errno of the operation. It is set to 0 on success. AUTOFS_DEV_IOCTL_SETPIPEFD_CMD @@ -290,9 +301,10 @@ mount be catatonic (see next call). The call requires an initialized struct autofs_dev_ioctl with the ioctlfd field set to the descriptor obtained from the open call and -the arg1 field set to descriptor of the pipe. On success the call -also sets the process group id used to identify the controlling process -(eg. the owning automount(8) daemon) to the process group of the caller. +the pipefd field of struct args_setpipefd set to descriptor of the pipe. +On success the call also sets the process group id used to identify the +controlling process (eg. the owning automount(8) daemon) to the process +group of the caller. AUTOFS_DEV_IOCTL_CATATONIC_CMD @@ -323,8 +335,8 @@ mount on the given path dentry. The call requires an initialized struct autofs_dev_ioctl with the path field set to the mount point in question and the size field adjusted -appropriately. Upon return the struct field arg1 contains the uid and -arg2 the gid. +appropriately. Upon return the uid field of struct args_requester contains +the uid and gid field the gid. When reconstructing an autofs mount tree with active mounts we need to re-connect to mounts that may have used the original process uid and @@ -342,8 +354,9 @@ this ioctl is called until no further expire candidates are found. The call requires an initialized struct autofs_dev_ioctl with the ioctlfd field set to the descriptor obtained from the open call. In addition an immediate expire, independent of the mount timeout, can be -requested by setting the arg1 field to 1. If no expire candidates can -be found the ioctl returns -1 with errno set to EAGAIN. +requested by setting the how field of struct args_expire to 1. If no +expire candidates can be found the ioctl returns -1 with errno set to +EAGAIN. This call causes the kernel module to check the mount corresponding to the given ioctlfd for mounts that can be expired, issues an expire @@ -356,7 +369,8 @@ Checks if an autofs mount point is in use. The call requires an initialized struct autofs_dev_ioctl with the ioctlfd field set to the descriptor obtained from the open call and -it returns the result in the arg1 field, 1 for busy and 0 otherwise. +it returns the result in the may_umount field of struct args_askumount, +1 for busy and 0 otherwise. AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD @@ -368,12 +382,12 @@ The call requires an initialized struct autofs_dev_ioctl. There are two possible variations. Both use the path field set to the path of the mount point to check and the size field adjusted appropriately. One uses the ioctlfd field to identify a specific mount point to check while the other -variation uses the path and optionally arg1 set to an autofs mount type. -The call returns 1 if this is a mount point and sets arg1 to the device -number of the mount and field arg2 to the relevant super block magic -number (described below) or 0 if it isn't a mountpoint. In both cases -the the device number (as returned by new_encode_dev()) is returned -in field arg1. +variation uses the path and optionally in.type field of struct args_ismountpoint +set to an autofs mount type. The call returns 1 if this is a mount point +and sets out.devid field to the device number of the mount and out.magic +field to the relevant super block magic number (described below) or 0 if +it isn't a mountpoint. In both cases the the device number (as returned +by new_encode_dev()) is returned in out.devid field. If supplied with a file descriptor we're looking for a specific mount, not necessarily at the top of the mounted stack. In this case the path -- cgit v0.10.2 From b6e3795a06c2fe837e7d0319a923e8978276ec4c Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Tue, 11 Oct 2016 13:52:59 -0700 Subject: autofs: fix pr_debug() message This isn't a return value, so change the message to indicate the status is the result of may_umount(). (or locate pr_debug() after put_user() with the same message) Link: http://lkml.kernel.org/r/20160812024836.12352.74628.stgit@pluto.themaw.net Signed-off-by: Tomohiro Kusumi Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 2eebeae..a11f731 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -840,7 +840,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) if (may_umount(mnt)) status = 1; - pr_debug("returning %d\n", status); + pr_debug("may umount %d\n", status); status = put_user(status, p); -- cgit v0.10.2 From aa8419367b62044e92bd7993eeb83d861b59bafa Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 11 Oct 2016 13:53:02 -0700 Subject: autofs: fix dev ioctl number range check The count of miscellaneous device ioctls in fs/autofs4/autofs_i.h is wrong. The number of ioctls is the difference between AUTOFS_DEV_IOCTL_VERSION_CMD and AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD (14) not the difference between AUTOFS_IOC_COUNT and 11 (21). [kusumi.tomohiro@gmail.com: fix typo that made the count macro negative] Link: http://lkml.kernel.org/r/20160831033420.9910.16809.stgit@pluto.themaw.net Link: http://lkml.kernel.org/r/20160812024841.12352.11975.stgit@pluto.themaw.net Signed-off-by: Ian Kent Cc: Tomohiro Kusumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 73e3d38..477989d 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -20,7 +20,8 @@ #define AUTOFS_IOC_COUNT 32 #define AUTOFS_DEV_IOCTL_IOC_FIRST (AUTOFS_DEV_IOCTL_VERSION) -#define AUTOFS_DEV_IOCTL_IOC_COUNT (AUTOFS_IOC_COUNT - 11) +#define AUTOFS_DEV_IOCTL_IOC_COUNT \ + (AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD - AUTOFS_DEV_IOCTL_VERSION_CMD) #include #include diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index d47b35a..13e7517 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -642,7 +642,7 @@ static int _autofs_dev_ioctl(unsigned int command, cmd = _IOC_NR(command); if (_IOC_TYPE(command) != _IOC_TYPE(AUTOFS_DEV_IOCTL_IOC_FIRST) || - cmd - cmd_first >= AUTOFS_DEV_IOCTL_IOC_COUNT) { + cmd - cmd_first > AUTOFS_DEV_IOCTL_IOC_COUNT) { return -ENOTTY; } -- cgit v0.10.2 From d9e1923207f7e099c376d8bd01424423e962aeee Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 11 Oct 2016 13:53:05 -0700 Subject: autofs: add autofs_dev_ioctl_version() for AUTOFS_DEV_IOCTL_VERSION_CMD No functional changes, based on the following justification. 1. Make the code more consistent using the ioctl vector _ioctls[], rather than assigning NULL only for this ioctl command. 2. Remove goto done; for better maintainability in the long run. 3. The existing code is based on the fact that validate_dev_ioctl() sets ioctl version for any command, but AUTOFS_DEV_IOCTL_VERSION_CMD should explicitly set it regardless of the default behavior. Link: http://lkml.kernel.org/r/20160812024846.12352.9885.stgit@pluto.themaw.net Signed-off-by: Tomohiro Kusumi Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 13e7517..7289216 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -172,6 +172,17 @@ static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f) return sbi; } +/* Return autofs dev ioctl version */ +static int autofs_dev_ioctl_version(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + /* This should have already been set. */ + param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; + param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; + return 0; +} + /* Return autofs module protocol version */ static int autofs_dev_ioctl_protover(struct file *fp, struct autofs_sb_info *sbi, @@ -590,7 +601,8 @@ static ioctl_fn lookup_dev_ioctl(unsigned int cmd) int cmd; ioctl_fn fn; } _ioctls[] = { - {cmd_idx(AUTOFS_DEV_IOCTL_VERSION_CMD), NULL}, + {cmd_idx(AUTOFS_DEV_IOCTL_VERSION_CMD), + autofs_dev_ioctl_version}, {cmd_idx(AUTOFS_DEV_IOCTL_PROTOVER_CMD), autofs_dev_ioctl_protover}, {cmd_idx(AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD), @@ -655,10 +667,6 @@ static int _autofs_dev_ioctl(unsigned int command, if (err) goto out; - /* The validate routine above always sets the version */ - if (cmd == AUTOFS_DEV_IOCTL_VERSION_CMD) - goto done; - fn = lookup_dev_ioctl(cmd); if (!fn) { pr_warn("unknown command 0x%08x\n", command); @@ -672,9 +680,11 @@ static int _autofs_dev_ioctl(unsigned int command, /* * For obvious reasons the openmount can't have a file * descriptor yet. We don't take a reference to the - * file during close to allow for immediate release. + * file during close to allow for immediate release, + * and the same for retrieving ioctl version. */ - if (cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD && + if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD && + cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD && cmd != AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD) { fp = fget(param->ioctlfd); if (!fp) { @@ -707,7 +717,6 @@ cont: if (fp) fput(fp); -done: if (err >= 0 && copy_to_user(user, param, AUTOFS_DEV_IOCTL_SIZE)) err = -EFAULT; out: -- cgit v0.10.2 From 390855547c3d20cda0646dc7ac83110208303dd6 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Tue, 11 Oct 2016 13:53:08 -0700 Subject: autofs: fix print format for ioctl warning message All other warnings use "cmd(0x%08x)" and this is the only one with "cmd(%d)". (below comes from my userspace debug program, but not automount daemon) [ 1139.905676] autofs4:pid:1640:check_dev_ioctl_version: ioctl control interface version mismatch: kernel(1.0), user(0.0), cmd(-1072131215) Link: http://lkml.kernel.org/r/20160812024851.12352.75458.stgit@pluto.themaw.net Signed-off-by: Tomohiro Kusumi Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 7289216..e89d6bb 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -75,7 +75,7 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param) if ((param->ver_major != AUTOFS_DEV_IOCTL_VERSION_MAJOR) || (param->ver_minor > AUTOFS_DEV_IOCTL_VERSION_MINOR)) { pr_warn("ioctl control interface version mismatch: " - "kernel(%u.%u), user(%u.%u), cmd(%d)\n", + "kernel(%u.%u), user(%u.%u), cmd(0x%08x)\n", AUTOFS_DEV_IOCTL_VERSION_MAJOR, AUTOFS_DEV_IOCTL_VERSION_MINOR, param->ver_major, param->ver_minor, cmd); -- cgit v0.10.2 From f58b3c91f6786c66483fc18fd8b82a74cbf96d19 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Tue, 11 Oct 2016 13:53:10 -0700 Subject: autofs: move inclusion of linux/limits.h to uapi linux/limits.h should be included by uapi instead of linux/auto_fs.h so as not to cause compile error in userspace. # cat << EOF > ./test1.c > #include > #include > int main(void) { > return 0; > } > EOF # gcc -Wall -g ./test1.c In file included from ./test1.c:2:0: /usr/include/linux/auto_fs.h:54:12: error: 'NAME_MAX' undeclared here (not in a function) char name[NAME_MAX+1]; ^ Link: http://lkml.kernel.org/r/20160812024856.12352.24092.stgit@pluto.themaw.net Signed-off-by: Tomohiro Kusumi Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/auto_fs.h b/include/linux/auto_fs.h index b4066bb..b8f814c 100644 --- a/include/linux/auto_fs.h +++ b/include/linux/auto_fs.h @@ -10,7 +10,6 @@ #define _LINUX_AUTO_FS_H #include -#include #include #include #endif /* _LINUX_AUTO_FS_H */ diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h index 9175a1b..1bfc3ed 100644 --- a/include/uapi/linux/auto_fs.h +++ b/include/uapi/linux/auto_fs.h @@ -12,6 +12,7 @@ #define _UAPI_LINUX_AUTO_FS_H #include +#include #ifndef __KERNEL__ #include #endif /* __KERNEL__ */ -- cgit v0.10.2 From 9b88ee0f3bb4c5b1e721bdcee93601b501d72f0a Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 11 Oct 2016 13:53:13 -0700 Subject: autofs4: move linux/auto_dev-ioctl.h to uapi/linux Since linux/auto_dev-ioctl.h wasn't included in include/linux/Kbuild it wasn't moved to uapi/linux as part of the uapi series. Link: http://lkml.kernel.org/r/20160812024901.12352.10984.stgit@pluto.themaw.net Signed-off-by: Ian Kent Cc: Tomohiro Kusumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/auto_dev-ioctl.h b/include/linux/auto_dev-ioctl.h index bf82e3a..28c1505 100644 --- a/include/linux/auto_dev-ioctl.h +++ b/include/linux/auto_dev-ioctl.h @@ -10,212 +10,5 @@ #ifndef _LINUX_AUTO_DEV_IOCTL_H #define _LINUX_AUTO_DEV_IOCTL_H -#include -#include - -#define AUTOFS_DEVICE_NAME "autofs" - -#define AUTOFS_DEV_IOCTL_VERSION_MAJOR 1 -#define AUTOFS_DEV_IOCTL_VERSION_MINOR 0 - -#define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl) - -/* - * An ioctl interface for autofs mount point control. - */ - -struct args_protover { - __u32 version; -}; - -struct args_protosubver { - __u32 sub_version; -}; - -struct args_openmount { - __u32 devid; -}; - -struct args_ready { - __u32 token; -}; - -struct args_fail { - __u32 token; - __s32 status; -}; - -struct args_setpipefd { - __s32 pipefd; -}; - -struct args_timeout { - __u64 timeout; -}; - -struct args_requester { - __u32 uid; - __u32 gid; -}; - -struct args_expire { - __u32 how; -}; - -struct args_askumount { - __u32 may_umount; -}; - -struct args_ismountpoint { - union { - struct args_in { - __u32 type; - } in; - struct args_out { - __u32 devid; - __u32 magic; - } out; - }; -}; - -/* - * All the ioctls use this structure. - * When sending a path size must account for the total length - * of the chunk of memory otherwise is is the size of the - * structure. - */ - -struct autofs_dev_ioctl { - __u32 ver_major; - __u32 ver_minor; - __u32 size; /* total size of data passed in - * including this struct */ - __s32 ioctlfd; /* automount command fd */ - - /* Command parameters */ - - union { - struct args_protover protover; - struct args_protosubver protosubver; - struct args_openmount openmount; - struct args_ready ready; - struct args_fail fail; - struct args_setpipefd setpipefd; - struct args_timeout timeout; - struct args_requester requester; - struct args_expire expire; - struct args_askumount askumount; - struct args_ismountpoint ismountpoint; - }; - - char path[0]; -}; - -static inline void init_autofs_dev_ioctl(struct autofs_dev_ioctl *in) -{ - memset(in, 0, sizeof(struct autofs_dev_ioctl)); - in->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; - in->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; - in->size = sizeof(struct autofs_dev_ioctl); - in->ioctlfd = -1; -} - -/* - * If you change this make sure you make the corresponding change - * to autofs-dev-ioctl.c:lookup_ioctl() - */ -enum { - /* Get various version info */ - AUTOFS_DEV_IOCTL_VERSION_CMD = 0x71, - AUTOFS_DEV_IOCTL_PROTOVER_CMD, - AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD, - - /* Open mount ioctl fd */ - AUTOFS_DEV_IOCTL_OPENMOUNT_CMD, - - /* Close mount ioctl fd */ - AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD, - - /* Mount/expire status returns */ - AUTOFS_DEV_IOCTL_READY_CMD, - AUTOFS_DEV_IOCTL_FAIL_CMD, - - /* Activate/deactivate autofs mount */ - AUTOFS_DEV_IOCTL_SETPIPEFD_CMD, - AUTOFS_DEV_IOCTL_CATATONIC_CMD, - - /* Expiry timeout */ - AUTOFS_DEV_IOCTL_TIMEOUT_CMD, - - /* Get mount last requesting uid and gid */ - AUTOFS_DEV_IOCTL_REQUESTER_CMD, - - /* Check for eligible expire candidates */ - AUTOFS_DEV_IOCTL_EXPIRE_CMD, - - /* Request busy status */ - AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD, - - /* Check if path is a mountpoint */ - AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD, -}; - -#define AUTOFS_IOCTL 0x93 - -#define AUTOFS_DEV_IOCTL_VERSION \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_VERSION_CMD, struct autofs_dev_ioctl) - -#define AUTOFS_DEV_IOCTL_PROTOVER \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_PROTOVER_CMD, struct autofs_dev_ioctl) - -#define AUTOFS_DEV_IOCTL_PROTOSUBVER \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD, struct autofs_dev_ioctl) - -#define AUTOFS_DEV_IOCTL_OPENMOUNT \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_OPENMOUNT_CMD, struct autofs_dev_ioctl) - -#define AUTOFS_DEV_IOCTL_CLOSEMOUNT \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD, struct autofs_dev_ioctl) - -#define AUTOFS_DEV_IOCTL_READY \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_READY_CMD, struct autofs_dev_ioctl) - -#define AUTOFS_DEV_IOCTL_FAIL \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_FAIL_CMD, struct autofs_dev_ioctl) - -#define AUTOFS_DEV_IOCTL_SETPIPEFD \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_SETPIPEFD_CMD, struct autofs_dev_ioctl) - -#define AUTOFS_DEV_IOCTL_CATATONIC \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_CATATONIC_CMD, struct autofs_dev_ioctl) - -#define AUTOFS_DEV_IOCTL_TIMEOUT \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_TIMEOUT_CMD, struct autofs_dev_ioctl) - -#define AUTOFS_DEV_IOCTL_REQUESTER \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_REQUESTER_CMD, struct autofs_dev_ioctl) - -#define AUTOFS_DEV_IOCTL_EXPIRE \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_EXPIRE_CMD, struct autofs_dev_ioctl) - -#define AUTOFS_DEV_IOCTL_ASKUMOUNT \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD, struct autofs_dev_ioctl) - -#define AUTOFS_DEV_IOCTL_ISMOUNTPOINT \ - _IOWR(AUTOFS_IOCTL, \ - AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD, struct autofs_dev_ioctl) - +#include #endif /* _LINUX_AUTO_DEV_IOCTL_H */ diff --git a/include/uapi/linux/auto_dev-ioctl.h b/include/uapi/linux/auto_dev-ioctl.h new file mode 100644 index 0000000..021ed33 --- /dev/null +++ b/include/uapi/linux/auto_dev-ioctl.h @@ -0,0 +1,221 @@ +/* + * Copyright 2008 Red Hat, Inc. All rights reserved. + * Copyright 2008 Ian Kent + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + */ + +#ifndef _UAPI_LINUX_AUTO_DEV_IOCTL_H +#define _UAPI_LINUX_AUTO_DEV_IOCTL_H + +#include +#include + +#define AUTOFS_DEVICE_NAME "autofs" + +#define AUTOFS_DEV_IOCTL_VERSION_MAJOR 1 +#define AUTOFS_DEV_IOCTL_VERSION_MINOR 0 + +#define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl) + +/* + * An ioctl interface for autofs mount point control. + */ + +struct args_protover { + __u32 version; +}; + +struct args_protosubver { + __u32 sub_version; +}; + +struct args_openmount { + __u32 devid; +}; + +struct args_ready { + __u32 token; +}; + +struct args_fail { + __u32 token; + __s32 status; +}; + +struct args_setpipefd { + __s32 pipefd; +}; + +struct args_timeout { + __u64 timeout; +}; + +struct args_requester { + __u32 uid; + __u32 gid; +}; + +struct args_expire { + __u32 how; +}; + +struct args_askumount { + __u32 may_umount; +}; + +struct args_ismountpoint { + union { + struct args_in { + __u32 type; + } in; + struct args_out { + __u32 devid; + __u32 magic; + } out; + }; +}; + +/* + * All the ioctls use this structure. + * When sending a path size must account for the total length + * of the chunk of memory otherwise is is the size of the + * structure. + */ + +struct autofs_dev_ioctl { + __u32 ver_major; + __u32 ver_minor; + __u32 size; /* total size of data passed in + * including this struct */ + __s32 ioctlfd; /* automount command fd */ + + /* Command parameters */ + + union { + struct args_protover protover; + struct args_protosubver protosubver; + struct args_openmount openmount; + struct args_ready ready; + struct args_fail fail; + struct args_setpipefd setpipefd; + struct args_timeout timeout; + struct args_requester requester; + struct args_expire expire; + struct args_askumount askumount; + struct args_ismountpoint ismountpoint; + }; + + char path[0]; +}; + +static inline void init_autofs_dev_ioctl(struct autofs_dev_ioctl *in) +{ + memset(in, 0, sizeof(struct autofs_dev_ioctl)); + in->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; + in->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; + in->size = sizeof(struct autofs_dev_ioctl); + in->ioctlfd = -1; +} + +/* + * If you change this make sure you make the corresponding change + * to autofs-dev-ioctl.c:lookup_ioctl() + */ +enum { + /* Get various version info */ + AUTOFS_DEV_IOCTL_VERSION_CMD = 0x71, + AUTOFS_DEV_IOCTL_PROTOVER_CMD, + AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD, + + /* Open mount ioctl fd */ + AUTOFS_DEV_IOCTL_OPENMOUNT_CMD, + + /* Close mount ioctl fd */ + AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD, + + /* Mount/expire status returns */ + AUTOFS_DEV_IOCTL_READY_CMD, + AUTOFS_DEV_IOCTL_FAIL_CMD, + + /* Activate/deactivate autofs mount */ + AUTOFS_DEV_IOCTL_SETPIPEFD_CMD, + AUTOFS_DEV_IOCTL_CATATONIC_CMD, + + /* Expiry timeout */ + AUTOFS_DEV_IOCTL_TIMEOUT_CMD, + + /* Get mount last requesting uid and gid */ + AUTOFS_DEV_IOCTL_REQUESTER_CMD, + + /* Check for eligible expire candidates */ + AUTOFS_DEV_IOCTL_EXPIRE_CMD, + + /* Request busy status */ + AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD, + + /* Check if path is a mountpoint */ + AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD, +}; + +#define AUTOFS_IOCTL 0x93 + +#define AUTOFS_DEV_IOCTL_VERSION \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_VERSION_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_PROTOVER \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_PROTOVER_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_PROTOSUBVER \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_OPENMOUNT \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_OPENMOUNT_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_CLOSEMOUNT \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_READY \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_READY_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_FAIL \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_FAIL_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_SETPIPEFD \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_SETPIPEFD_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_CATATONIC \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_CATATONIC_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_TIMEOUT \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_TIMEOUT_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_REQUESTER \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_REQUESTER_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_EXPIRE \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_EXPIRE_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_ASKUMOUNT \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_ISMOUNTPOINT \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD, struct autofs_dev_ioctl) + +#endif /* _UAPI_LINUX_AUTO_DEV_IOCTL_H */ -- cgit v0.10.2 From 962ca7cfbdc4d072350c94b2d52894b39f41be4a Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Tue, 11 Oct 2016 13:53:16 -0700 Subject: autofs: remove possibly misleading /* #define DEBUG */ Having this in autofs_i.h gives illusion that uncommenting this enables pr_debug(), but it doesn't enable all the pr_debug() in autofs because inclusion order matters. XFS has the same DEBUG macro in its core header fs/xfs/xfs.h, however XFS seems to have a rule to include this prior to other XFS headers as well as kernel headers. This is not the case with autofs, and DEBUG could be enabled via Makefile, so autofs should just get rid of this comment to make the code less confusing. It's a comment, so there is literally no functional difference. Link: http://lkml.kernel.org/r/20160831033409.9910.77067.stgit@pluto.themaw.net Signed-off-by: Tomohiro Kusumi Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 477989d..a1fba42 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -34,8 +34,6 @@ #include #include -/* #define DEBUG */ - #ifdef pr_fmt #undef pr_fmt #endif -- cgit v0.10.2 From fcc24534b0d63556357889ac4fe9d8942677d85e Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Tue, 11 Oct 2016 13:53:19 -0700 Subject: autofs: refactor ioctl fn vector in iookup_dev_ioctl() cmd part of this struct is the same as an index of itself within _ioctls[]. In fact this cmd is unused, so we can drop this part. Link: http://lkml.kernel.org/r/20160831033414.9910.66697.stgit@pluto.themaw.net Signed-off-by: Tomohiro Kusumi Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index e89d6bb..fc09eb7 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -597,42 +597,25 @@ out: static ioctl_fn lookup_dev_ioctl(unsigned int cmd) { - static struct { - int cmd; - ioctl_fn fn; - } _ioctls[] = { - {cmd_idx(AUTOFS_DEV_IOCTL_VERSION_CMD), - autofs_dev_ioctl_version}, - {cmd_idx(AUTOFS_DEV_IOCTL_PROTOVER_CMD), - autofs_dev_ioctl_protover}, - {cmd_idx(AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD), - autofs_dev_ioctl_protosubver}, - {cmd_idx(AUTOFS_DEV_IOCTL_OPENMOUNT_CMD), - autofs_dev_ioctl_openmount}, - {cmd_idx(AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD), - autofs_dev_ioctl_closemount}, - {cmd_idx(AUTOFS_DEV_IOCTL_READY_CMD), - autofs_dev_ioctl_ready}, - {cmd_idx(AUTOFS_DEV_IOCTL_FAIL_CMD), - autofs_dev_ioctl_fail}, - {cmd_idx(AUTOFS_DEV_IOCTL_SETPIPEFD_CMD), - autofs_dev_ioctl_setpipefd}, - {cmd_idx(AUTOFS_DEV_IOCTL_CATATONIC_CMD), - autofs_dev_ioctl_catatonic}, - {cmd_idx(AUTOFS_DEV_IOCTL_TIMEOUT_CMD), - autofs_dev_ioctl_timeout}, - {cmd_idx(AUTOFS_DEV_IOCTL_REQUESTER_CMD), - autofs_dev_ioctl_requester}, - {cmd_idx(AUTOFS_DEV_IOCTL_EXPIRE_CMD), - autofs_dev_ioctl_expire}, - {cmd_idx(AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD), - autofs_dev_ioctl_askumount}, - {cmd_idx(AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD), - autofs_dev_ioctl_ismountpoint} + static ioctl_fn _ioctls[] = { + autofs_dev_ioctl_version, + autofs_dev_ioctl_protover, + autofs_dev_ioctl_protosubver, + autofs_dev_ioctl_openmount, + autofs_dev_ioctl_closemount, + autofs_dev_ioctl_ready, + autofs_dev_ioctl_fail, + autofs_dev_ioctl_setpipefd, + autofs_dev_ioctl_catatonic, + autofs_dev_ioctl_timeout, + autofs_dev_ioctl_requester, + autofs_dev_ioctl_expire, + autofs_dev_ioctl_askumount, + autofs_dev_ioctl_ismountpoint, }; unsigned int idx = cmd_idx(cmd); - return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx].fn; + return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx]; } /* ioctl dispatcher */ -- cgit v0.10.2 From f491bd71118beba608d39ac2d5f1530e1160cd2e Mon Sep 17 00:00:00 2001 From: "Michael Kerrisk (man-pages)" Date: Tue, 11 Oct 2016 13:53:22 -0700 Subject: pipe: relocate round_pipe_size() above pipe_set_size() Patch series "pipe: fix limit handling", v2. When changing a pipe's capacity with fcntl(F_SETPIPE_SZ), various limits defined by /proc/sys/fs/pipe-* files are checked to see if unprivileged users are exceeding limits on memory consumption. While documenting and testing the operation of these limits I noticed that, as currently implemented, these checks have a number of problems: (1) When increasing the pipe capacity, the checks against the limits in /proc/sys/fs/pipe-user-pages-{soft,hard} are made against existing consumption, and exclude the memory required for the increased pipe capacity. The new increase in pipe capacity can then push the total memory used by the user for pipes (possibly far) over a limit. This can also trigger the problem described next. (2) The limit checks are performed even when the new pipe capacity is less than the existing pipe capacity. This can lead to problems if a user sets a large pipe capacity, and then the limits are lowered, with the result that the user will no longer be able to decrease the pipe capacity. (3) As currently implemented, accounting and checking against the limits is done as follows: (a) Test whether the user has exceeded the limit. (b) Make new pipe buffer allocation. (c) Account new allocation against the limits. This is racey. Multiple processes may pass point (a) simultaneously, and then allocate pipe buffers that are accounted for only in step (c). The race means that the user's pipe buffer allocation could be pushed over the limit (by an arbitrary amount, depending on how unlucky we were in the race). [Thanks to Vegard Nossum for spotting this point, which I had missed.] This patch series addresses these three problems. This patch (of 8): This is a minor preparatory patch. After subsequent patches, round_pipe_size() will be called from pipe_set_size(), so place round_pipe_size() above pipe_set_size(). Link: http://lkml.kernel.org/r/91a91fdb-a959-ba7f-b551-b62477cc98a1@gmail.com Signed-off-by: Michael Kerrisk Reviewed-by: Vegard Nossum Cc: Willy Tarreau Cc: Cc: Tetsuo Handa Cc: Jens Axboe Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/pipe.c b/fs/pipe.c index 1f559f0..8773eca 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1008,6 +1008,18 @@ const struct file_operations pipefifo_fops = { }; /* + * Currently we rely on the pipe array holding a power-of-2 number + * of pages. + */ +static inline unsigned int round_pipe_size(unsigned int size) +{ + unsigned long nr_pages; + + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; +} + +/* * Allocate a new array of pipe buffers and copy the info over. Returns the * pipe size if successful, or return -ERROR on error. */ @@ -1059,18 +1071,6 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) } /* - * Currently we rely on the pipe array holding a power-of-2 number - * of pages. - */ -static inline unsigned int round_pipe_size(unsigned int size) -{ - unsigned long nr_pages; - - nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; -} - -/* * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax * will return an error. */ -- cgit v0.10.2 From d37d41666408102bf0ac8e48d8efdce7b809e5f6 Mon Sep 17 00:00:00 2001 From: "Michael Kerrisk (man-pages)" Date: Tue, 11 Oct 2016 13:53:25 -0700 Subject: pipe: move limit checking logic into pipe_set_size() This is a preparatory patch for following work. Move the F_SETPIPE_SZ limit-checking logic from pipe_fcntl() into pipe_set_size(). This simplifies the code a little, and allows for reworking required in a later patch that fixes the limit checking in pipe_set_size() Link: http://lkml.kernel.org/r/3701b2c5-2c52-2c3e-226d-29b9deb29b50@gmail.com Signed-off-by: Michael Kerrisk Reviewed-by: Vegard Nossum Cc: Willy Tarreau Cc: Cc: Tetsuo Handa Cc: Jens Axboe Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/pipe.c b/fs/pipe.c index 8773eca..a72b6ff 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1023,9 +1023,24 @@ static inline unsigned int round_pipe_size(unsigned int size) * Allocate a new array of pipe buffers and copy the info over. Returns the * pipe size if successful, or return -ERROR on error. */ -static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) +static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) { struct pipe_buffer *bufs; + unsigned int size, nr_pages; + + size = round_pipe_size(arg); + nr_pages = size >> PAGE_SHIFT; + + if (!nr_pages) + return -EINVAL; + + if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) + return -EPERM; + + if ((too_many_pipe_buffers_hard(pipe->user) || + too_many_pipe_buffers_soft(pipe->user)) && + !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) + return -EPERM; /* * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't @@ -1109,28 +1124,9 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) __pipe_lock(pipe); switch (cmd) { - case F_SETPIPE_SZ: { - unsigned int size, nr_pages; - - size = round_pipe_size(arg); - nr_pages = size >> PAGE_SHIFT; - - ret = -EINVAL; - if (!nr_pages) - goto out; - - if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) { - ret = -EPERM; - goto out; - } else if ((too_many_pipe_buffers_hard(pipe->user) || - too_many_pipe_buffers_soft(pipe->user)) && - !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) { - ret = -EPERM; - goto out; - } - ret = pipe_set_size(pipe, nr_pages); + case F_SETPIPE_SZ: + ret = pipe_set_size(pipe, arg); break; - } case F_GETPIPE_SZ: ret = pipe->buffers * PAGE_SIZE; break; @@ -1139,7 +1135,6 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) break; } -out: __pipe_unlock(pipe); return ret; } -- cgit v0.10.2 From 3734a13b96ebf039b293d8d37a934fd1bd9e03ab Mon Sep 17 00:00:00 2001 From: "Michael Kerrisk (man-pages)" Date: Tue, 11 Oct 2016 13:53:28 -0700 Subject: pipe: refactor argument for account_pipe_buffers() This is a preparatory patch for following work. account_pipe_buffers() performs accounting in the 'user_struct'. There is no need to pass a pointer to a 'pipe_inode_info' struct (which is then dereferenced to obtain a pointer to the 'user' field). Instead, pass a pointer directly to the 'user_struct'. This change is needed in preparation for a subsequent patch that the fixes the limit checking in alloc_pipe_info() (and the resulting code is a little more logical). Link: http://lkml.kernel.org/r/7277bf8c-a6fc-4a7d-659c-f5b145c981ab@gmail.com Signed-off-by: Michael Kerrisk Reviewed-by: Vegard Nossum Cc: Willy Tarreau Cc: Cc: Tetsuo Handa Cc: Jens Axboe Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/pipe.c b/fs/pipe.c index a72b6ff..b9422bc1 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -601,10 +601,10 @@ pipe_fasync(int fd, struct file *filp, int on) return retval; } -static void account_pipe_buffers(struct pipe_inode_info *pipe, +static void account_pipe_buffers(struct user_struct *user, unsigned long old, unsigned long new) { - atomic_long_add(new - old, &pipe->user->pipe_bufs); + atomic_long_add(new - old, &user->pipe_bufs); } static bool too_many_pipe_buffers_soft(struct user_struct *user) @@ -641,7 +641,7 @@ struct pipe_inode_info *alloc_pipe_info(void) pipe->r_counter = pipe->w_counter = 1; pipe->buffers = pipe_bufs; pipe->user = user; - account_pipe_buffers(pipe, 0, pipe_bufs); + account_pipe_buffers(user, 0, pipe_bufs); mutex_init(&pipe->mutex); return pipe; } @@ -656,7 +656,7 @@ void free_pipe_info(struct pipe_inode_info *pipe) { int i; - account_pipe_buffers(pipe, pipe->buffers, 0); + account_pipe_buffers(pipe->user, pipe->buffers, 0); free_uid(pipe->user); for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; @@ -1077,7 +1077,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer)); } - account_pipe_buffers(pipe, pipe->buffers, nr_pages); + account_pipe_buffers(pipe->user, pipe->buffers, nr_pages); pipe->curbuf = 0; kfree(pipe->bufs); pipe->bufs = bufs; -- cgit v0.10.2 From b0b91d18e2e97b741b294af9333824ecc3fadfd8 Mon Sep 17 00:00:00 2001 From: "Michael Kerrisk (man-pages)" Date: Tue, 11 Oct 2016 13:53:31 -0700 Subject: pipe: fix limit checking in pipe_set_size() The limit checking in pipe_set_size() (used by fcntl(F_SETPIPE_SZ)) has the following problems: (1) When increasing the pipe capacity, the checks against the limits in /proc/sys/fs/pipe-user-pages-{soft,hard} are made against existing consumption, and exclude the memory required for the increased pipe capacity. The new increase in pipe capacity can then push the total memory used by the user for pipes (possibly far) over a limit. This can also trigger the problem described next. (2) The limit checks are performed even when the new pipe capacity is less than the existing pipe capacity. This can lead to problems if a user sets a large pipe capacity, and then the limits are lowered, with the result that the user will no longer be able to decrease the pipe capacity. (3) As currently implemented, accounting and checking against the limits is done as follows: (a) Test whether the user has exceeded the limit. (b) Make new pipe buffer allocation. (c) Account new allocation against the limits. This is racey. Multiple processes may pass point (a) simultaneously, and then allocate pipe buffers that are accounted for only in step (c). The race means that the user's pipe buffer allocation could be pushed over the limit (by an arbitrary amount, depending on how unlucky we were in the race). [Thanks to Vegard Nossum for spotting this point, which I had missed.] This patch addresses the above problems as follows: * Perform checks against the limits only when increasing a pipe's capacity; an unprivileged user can always decrease a pipe's capacity. * Alter the checks against limits to include the memory required for the new pipe capacity. * Re-order the accounting step so that it precedes the buffer allocation. If the accounting step determines that a limit has been reached, revert the accounting and cause the operation to fail. The program below can be used to demonstrate problems 1 and 2, and the effect of the fix. The program takes one or more command-line arguments. The first argument specifies the number of pipes that the program should create. The remaining arguments are, alternately, pipe capacities that should be set using fcntl(F_SETPIPE_SZ), and sleep intervals (in seconds) between the fcntl() operations. (The sleep intervals allow the possibility to change the limits between fcntl() operations.) Problem 1 ========= Using the test program on an unpatched kernel, we first set some limits: # echo 0 > /proc/sys/fs/pipe-user-pages-soft # echo 1000000000 > /proc/sys/fs/pipe-max-size # echo 10000 > /proc/sys/fs/pipe-user-pages-hard # 40.96 MB Then show that we can set a pipe with capacity (100MB) that is over the hard limit # sudo -u mtk ./test_F_SETPIPE_SZ 1 100000000 Initial pipe capacity: 65536 Loop 1: set pipe capacity to 100000000 bytes F_SETPIPE_SZ returned 134217728 Now set the capacity to 100MB twice. The second call fails (which is probably surprising to most users, since it seems like a no-op): # sudo -u mtk ./test_F_SETPIPE_SZ 1 100000000 0 100000000 Initial pipe capacity: 65536 Loop 1: set pipe capacity to 100000000 bytes F_SETPIPE_SZ returned 134217728 Loop 2: set pipe capacity to 100000000 bytes Loop 2, pipe 0: F_SETPIPE_SZ failed: fcntl: Operation not permitted With a patched kernel, setting a capacity over the limit fails at the first attempt: # echo 0 > /proc/sys/fs/pipe-user-pages-soft # echo 1000000000 > /proc/sys/fs/pipe-max-size # echo 10000 > /proc/sys/fs/pipe-user-pages-hard # sudo -u mtk ./test_F_SETPIPE_SZ 1 100000000 Initial pipe capacity: 65536 Loop 1: set pipe capacity to 100000000 bytes Loop 1, pipe 0: F_SETPIPE_SZ failed: fcntl: Operation not permitted There is a small chance that the change to fix this problem could break user-space, since there are cases where fcntl(F_SETPIPE_SZ) calls that previously succeeded might fail. However, the chances are small, since (a) the pipe-user-pages-{soft,hard} limits are new (in 4.5), and the default soft/hard limits are high/unlimited. Therefore, it seems warranted to make these limits operate more precisely (and behave more like what users probably expect). Problem 2 ========= Running the test program on an unpatched kernel, we first set some limits: # getconf PAGESIZE 4096 # echo 0 > /proc/sys/fs/pipe-user-pages-soft # echo 1000000000 > /proc/sys/fs/pipe-max-size # echo 10000 > /proc/sys/fs/pipe-user-pages-hard # 40.96 MB Now perform two fcntl(F_SETPIPE_SZ) operations on a single pipe, first setting a pipe capacity (10MB), sleeping for a few seconds, during which time the hard limit is lowered, and then set pipe capacity to a smaller amount (5MB): # sudo -u mtk ./test_F_SETPIPE_SZ 1 10000000 15 5000000 & [1] 748 # Initial pipe capacity: 65536 Loop 1: set pipe capacity to 10000000 bytes F_SETPIPE_SZ returned 16777216 Sleeping 15 seconds # echo 1000 > /proc/sys/fs/pipe-user-pages-hard # 4.096 MB # Loop 2: set pipe capacity to 5000000 bytes Loop 2, pipe 0: F_SETPIPE_SZ failed: fcntl: Operation not permitted In this case, the user should be able to lower the limit. With a kernel that has the patch below, the second fcntl() succeeds: # echo 0 > /proc/sys/fs/pipe-user-pages-soft # echo 1000000000 > /proc/sys/fs/pipe-max-size # echo 10000 > /proc/sys/fs/pipe-user-pages-hard # sudo -u mtk ./test_F_SETPIPE_SZ 1 10000000 15 5000000 & [1] 3215 # Initial pipe capacity: 65536 # Loop 1: set pipe capacity to 10000000 bytes F_SETPIPE_SZ returned 16777216 Sleeping 15 seconds # echo 1000 > /proc/sys/fs/pipe-user-pages-hard # Loop 2: set pipe capacity to 5000000 bytes F_SETPIPE_SZ returned 8388608 8x---8x---8x---8x---8x---8x---8x---8x---8x---8x---8x---8x---8x---8x--- /* test_F_SETPIPE_SZ.c (C) 2016, Michael Kerrisk; licensed under GNU GPL version 2 or later Test operation of fcntl(F_SETPIPE_SZ) for setting pipe capacity and interactions with limits defined by /proc/sys/fs/pipe-* files. */ #define _GNU_SOURCE #include #include #include #include int main(int argc, char *argv[]) { int (*pfd)[2]; int npipes; int pcap, rcap; int j, p, s, stime, loop; if (argc < 2) { fprintf(stderr, "Usage: %s num-pipes " "[pipe-capacity sleep-time]...\n", argv[0]); exit(EXIT_FAILURE); } npipes = atoi(argv[1]); pfd = calloc(npipes, sizeof (int [2])); if (pfd == NULL) { perror("calloc"); exit(EXIT_FAILURE); } for (j = 0; j < npipes; j++) { if (pipe(pfd[j]) == -1) { fprintf(stderr, "Loop %d: pipe() failed: ", j); perror("pipe"); exit(EXIT_FAILURE); } } printf("Initial pipe capacity: %d\n", fcntl(pfd[0][0], F_GETPIPE_SZ)); for (j = 2; j < argc; j += 2 ) { loop = j / 2; pcap = atoi(argv[j]); printf(" Loop %d: set pipe capacity to %d bytes\n", loop, pcap); for (p = 0; p < npipes; p++) { s = fcntl(pfd[p][0], F_SETPIPE_SZ, pcap); if (s == -1) { fprintf(stderr, " Loop %d, pipe %d: F_SETPIPE_SZ " "failed: ", loop, p); perror("fcntl"); exit(EXIT_FAILURE); } if (p == 0) { printf(" F_SETPIPE_SZ returned %d\n", s); rcap = s; } else { if (s != rcap) { fprintf(stderr, " Loop %d, pipe %d: F_SETPIPE_SZ " "unexpected return: %d\n", loop, p, s); exit(EXIT_FAILURE); } } stime = (j + 1 < argc) ? atoi(argv[j + 1]) : 0; if (stime > 0) { printf(" Sleeping %d seconds\n", stime); sleep(stime); } } } exit(EXIT_SUCCESS); } 8x---8x---8x---8x---8x---8x---8x---8x---8x---8x---8x---8x---8x---8x--- Patch history: v2 * Switch order of test in 'if' statement to avoid function call (to capability()) in normal path. [This is a fix to a preexisting wart in the code. Thanks to Willy Tarreau] * Perform (size > pipe_max_size) check before calling account_pipe_buffers(). [Thanks to Vegard Nossum] Quoting Vegard: The potential problem happens if the user passes a very large number which will overflow pipe->user->pipe_bufs. On 32-bit, sizeof(int) == sizeof(long), so if they pass arg = INT_MAX then round_pipe_size() returns INT_MAX. Although it's true that the accounting is done in terms of pages and not bytes, so you'd need on the order of (1 << 13) = 8192 processes hitting the limit at the same time in order to make it overflow, which seems a bit unlikely. (See https://lkml.org/lkml/2016/8/12/215 for another discussion on the limit checking) Link: http://lkml.kernel.org/r/1e464945-536b-2420-798b-e77b9c7e8593@gmail.com Signed-off-by: Michael Kerrisk Reviewed-by: Vegard Nossum Cc: Willy Tarreau Cc: Cc: Tetsuo Handa Cc: Jens Axboe Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/pipe.c b/fs/pipe.c index b9422bc1..dc5f4c0 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1027,6 +1027,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) { struct pipe_buffer *bufs; unsigned int size, nr_pages; + long ret = 0; size = round_pipe_size(arg); nr_pages = size >> PAGE_SHIFT; @@ -1034,13 +1035,26 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) if (!nr_pages) return -EINVAL; - if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) + /* + * If trying to increase the pipe capacity, check that an + * unprivileged user is not trying to exceed various limits + * (soft limit check here, hard limit check just below). + * Decreasing the pipe capacity is always permitted, even + * if the user is currently over a limit. + */ + if (nr_pages > pipe->buffers && + size > pipe_max_size && !capable(CAP_SYS_RESOURCE)) return -EPERM; - if ((too_many_pipe_buffers_hard(pipe->user) || - too_many_pipe_buffers_soft(pipe->user)) && - !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) - return -EPERM; + account_pipe_buffers(pipe->user, pipe->buffers, nr_pages); + + if (nr_pages > pipe->buffers && + (too_many_pipe_buffers_hard(pipe->user) || + too_many_pipe_buffers_soft(pipe->user)) && + !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_revert_acct; + } /* * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't @@ -1048,13 +1062,17 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) * again like we would do for growing. If the pipe currently * contains more buffers than arg, then return busy. */ - if (nr_pages < pipe->nrbufs) - return -EBUSY; + if (nr_pages < pipe->nrbufs) { + ret = -EBUSY; + goto out_revert_acct; + } bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); - if (unlikely(!bufs)) - return -ENOMEM; + if (unlikely(!bufs)) { + ret = -ENOMEM; + goto out_revert_acct; + } /* * The pipe array wraps around, so just start the new one at zero @@ -1077,12 +1095,15 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer)); } - account_pipe_buffers(pipe->user, pipe->buffers, nr_pages); pipe->curbuf = 0; kfree(pipe->bufs); pipe->bufs = bufs; pipe->buffers = nr_pages; return nr_pages * PAGE_SIZE; + +out_revert_acct: + account_pipe_buffers(pipe->user, nr_pages, pipe->buffers); + return ret; } /* -- cgit v0.10.2 From 09b4d1990094dd22c27fb0163534db419458569c Mon Sep 17 00:00:00 2001 From: "Michael Kerrisk (man-pages)" Date: Tue, 11 Oct 2016 13:53:34 -0700 Subject: pipe: simplify logic in alloc_pipe_info() Replace an 'if' block that covers most of the code in this function with a 'goto'. This makes the code a little simpler to read, and also simplifies the next patch (fix limit checking in alloc_pipe_info()) Link: http://lkml.kernel.org/r/aef030c1-0257-98a9-4988-186efa48530c@gmail.com Signed-off-by: Michael Kerrisk Reviewed-by: Vegard Nossum Cc: Willy Tarreau Cc: Cc: Tetsuo Handa Cc: Jens Axboe Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/pipe.c b/fs/pipe.c index dc5f4c0..eaa3c8d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -622,33 +622,34 @@ static bool too_many_pipe_buffers_hard(struct user_struct *user) struct pipe_inode_info *alloc_pipe_info(void) { struct pipe_inode_info *pipe; + unsigned long pipe_bufs = PIPE_DEF_BUFFERS; + struct user_struct *user = get_current_user(); pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT); - if (pipe) { - unsigned long pipe_bufs = PIPE_DEF_BUFFERS; - struct user_struct *user = get_current_user(); - - if (!too_many_pipe_buffers_hard(user)) { - if (too_many_pipe_buffers_soft(user)) - pipe_bufs = 1; - pipe->bufs = kcalloc(pipe_bufs, - sizeof(struct pipe_buffer), - GFP_KERNEL_ACCOUNT); - } + if (pipe == NULL) + goto out_free_uid; + + if (!too_many_pipe_buffers_hard(user)) { + if (too_many_pipe_buffers_soft(user)) + pipe_bufs = 1; + pipe->bufs = kcalloc(pipe_bufs, + sizeof(struct pipe_buffer), + GFP_KERNEL_ACCOUNT); + } - if (pipe->bufs) { - init_waitqueue_head(&pipe->wait); - pipe->r_counter = pipe->w_counter = 1; - pipe->buffers = pipe_bufs; - pipe->user = user; - account_pipe_buffers(user, 0, pipe_bufs); - mutex_init(&pipe->mutex); - return pipe; - } - free_uid(user); - kfree(pipe); + if (pipe->bufs) { + init_waitqueue_head(&pipe->wait); + pipe->r_counter = pipe->w_counter = 1; + pipe->buffers = pipe_bufs; + pipe->user = user; + account_pipe_buffers(user, 0, pipe_bufs); + mutex_init(&pipe->mutex); + return pipe; } + kfree(pipe); +out_free_uid: + free_uid(user); return NULL; } -- cgit v0.10.2 From a005ca0e6813e1d796a7422a7e31d8b8d6555df1 Mon Sep 17 00:00:00 2001 From: "Michael Kerrisk (man-pages)" Date: Tue, 11 Oct 2016 13:53:37 -0700 Subject: pipe: fix limit checking in alloc_pipe_info() The limit checking in alloc_pipe_info() (used by pipe(2) and when opening a FIFO) has the following problems: (1) When checking capacity required for the new pipe, the checks against the limit in /proc/sys/fs/pipe-user-pages-{soft,hard} are made against existing consumption, and exclude the memory required for the new pipe capacity. As a consequence: (1) the memory allocation throttling provided by the soft limit does not kick in quite as early as it should, and (2) the user can overrun the hard limit. (2) As currently implemented, accounting and checking against the limits is done as follows: (a) Test whether the user has exceeded the limit. (b) Make new pipe buffer allocation. (c) Account new allocation against the limits. This is racey. Multiple processes may pass point (a) simultaneously, and then allocate pipe buffers that are accounted for only in step (c). The race means that the user's pipe buffer allocation could be pushed over the limit (by an arbitrary amount, depending on how unlucky we were in the race). [Thanks to Vegard Nossum for spotting this point, which I had missed.] This patch addresses the above problems as follows: * Alter the checks against limits to include the memory required for the new pipe. * Re-order the accounting step so that it precedes the buffer allocation. If the accounting step determines that a limit has been reached, revert the accounting and cause the operation to fail. Link: http://lkml.kernel.org/r/8ff3e9f9-23f6-510c-644f-8e70cd1c0bd9@gmail.com Signed-off-by: Michael Kerrisk Reviewed-by: Vegard Nossum Cc: Willy Tarreau Cc: Cc: Tetsuo Handa Cc: Jens Axboe Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/pipe.c b/fs/pipe.c index eaa3c8d..83a2924 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -629,24 +629,30 @@ struct pipe_inode_info *alloc_pipe_info(void) if (pipe == NULL) goto out_free_uid; - if (!too_many_pipe_buffers_hard(user)) { - if (too_many_pipe_buffers_soft(user)) - pipe_bufs = 1; - pipe->bufs = kcalloc(pipe_bufs, - sizeof(struct pipe_buffer), - GFP_KERNEL_ACCOUNT); + account_pipe_buffers(user, 0, pipe_bufs); + + if (too_many_pipe_buffers_soft(user)) { + account_pipe_buffers(user, pipe_bufs, 1); + pipe_bufs = 1; } + if (too_many_pipe_buffers_hard(user)) + goto out_revert_acct; + + pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer), + GFP_KERNEL_ACCOUNT); + if (pipe->bufs) { init_waitqueue_head(&pipe->wait); pipe->r_counter = pipe->w_counter = 1; pipe->buffers = pipe_bufs; pipe->user = user; - account_pipe_buffers(user, 0, pipe_bufs); mutex_init(&pipe->mutex); return pipe; } +out_revert_acct: + account_pipe_buffers(user, pipe_bufs, 0); kfree(pipe); out_free_uid: free_uid(user); -- cgit v0.10.2 From 9c87bcf0a31b338dc8a69a5d251a037565a94e13 Mon Sep 17 00:00:00 2001 From: "Michael Kerrisk (man-pages)" Date: Tue, 11 Oct 2016 13:53:40 -0700 Subject: pipe: make account_pipe_buffers() return a value, and use it This is an optional patch, to provide a small performance improvement. Alter account_pipe_buffers() so that it returns the new value in user->pipe_bufs. This means that we can refactor too_many_pipe_buffers_soft() and too_many_pipe_buffers_hard() to avoid the costs of repeated use of atomic_long_read() to get the value user->pipe_bufs. Link: http://lkml.kernel.org/r/93e5f193-1e5e-3e1f-3a20-eae79b7e1310@gmail.com Signed-off-by: Michael Kerrisk Reviewed-by: Vegard Nossum Cc: Willy Tarreau Cc: Cc: Tetsuo Handa Cc: Jens Axboe Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/pipe.c b/fs/pipe.c index 83a2924..5ff02c4 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -601,22 +601,20 @@ pipe_fasync(int fd, struct file *filp, int on) return retval; } -static void account_pipe_buffers(struct user_struct *user, +static unsigned long account_pipe_buffers(struct user_struct *user, unsigned long old, unsigned long new) { - atomic_long_add(new - old, &user->pipe_bufs); + return atomic_long_add_return(new - old, &user->pipe_bufs); } -static bool too_many_pipe_buffers_soft(struct user_struct *user) +static bool too_many_pipe_buffers_soft(unsigned long user_bufs) { - return pipe_user_pages_soft && - atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_soft; + return pipe_user_pages_soft && user_bufs >= pipe_user_pages_soft; } -static bool too_many_pipe_buffers_hard(struct user_struct *user) +static bool too_many_pipe_buffers_hard(unsigned long user_bufs) { - return pipe_user_pages_hard && - atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_hard; + return pipe_user_pages_hard && user_bufs >= pipe_user_pages_hard; } struct pipe_inode_info *alloc_pipe_info(void) @@ -624,19 +622,20 @@ struct pipe_inode_info *alloc_pipe_info(void) struct pipe_inode_info *pipe; unsigned long pipe_bufs = PIPE_DEF_BUFFERS; struct user_struct *user = get_current_user(); + unsigned long user_bufs; pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT); if (pipe == NULL) goto out_free_uid; - account_pipe_buffers(user, 0, pipe_bufs); + user_bufs = account_pipe_buffers(user, 0, pipe_bufs); - if (too_many_pipe_buffers_soft(user)) { - account_pipe_buffers(user, pipe_bufs, 1); + if (too_many_pipe_buffers_soft(user_bufs)) { + user_bufs = account_pipe_buffers(user, pipe_bufs, 1); pipe_bufs = 1; } - if (too_many_pipe_buffers_hard(user)) + if (too_many_pipe_buffers_hard(user_bufs)) goto out_revert_acct; pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer), @@ -652,7 +651,7 @@ struct pipe_inode_info *alloc_pipe_info(void) } out_revert_acct: - account_pipe_buffers(user, pipe_bufs, 0); + (void) account_pipe_buffers(user, pipe_bufs, 0); kfree(pipe); out_free_uid: free_uid(user); @@ -663,7 +662,7 @@ void free_pipe_info(struct pipe_inode_info *pipe) { int i; - account_pipe_buffers(pipe->user, pipe->buffers, 0); + (void) account_pipe_buffers(pipe->user, pipe->buffers, 0); free_uid(pipe->user); for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; @@ -1034,6 +1033,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) { struct pipe_buffer *bufs; unsigned int size, nr_pages; + unsigned long user_bufs; long ret = 0; size = round_pipe_size(arg); @@ -1053,11 +1053,11 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) size > pipe_max_size && !capable(CAP_SYS_RESOURCE)) return -EPERM; - account_pipe_buffers(pipe->user, pipe->buffers, nr_pages); + user_bufs = account_pipe_buffers(pipe->user, pipe->buffers, nr_pages); if (nr_pages > pipe->buffers && - (too_many_pipe_buffers_hard(pipe->user) || - too_many_pipe_buffers_soft(pipe->user)) && + (too_many_pipe_buffers_hard(user_bufs) || + too_many_pipe_buffers_soft(user_bufs)) && !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) { ret = -EPERM; goto out_revert_acct; @@ -1109,7 +1109,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) return nr_pages * PAGE_SIZE; out_revert_acct: - account_pipe_buffers(pipe->user, nr_pages, pipe->buffers); + (void) account_pipe_buffers(pipe->user, nr_pages, pipe->buffers); return ret; } -- cgit v0.10.2 From 086e774a57fba4695f14383c0818994c0b31da7c Mon Sep 17 00:00:00 2001 From: "Michael Kerrisk (man-pages)" Date: Tue, 11 Oct 2016 13:53:43 -0700 Subject: pipe: cap initial pipe capacity according to pipe-max-size limit This is a patch that provides behavior that is more consistent, and probably less surprising to users. I consider the change optional, and welcome opinions about whether it should be applied. By default, pipes are created with a capacity of 64 kiB. However, /proc/sys/fs/pipe-max-size may be set smaller than this value. In this scenario, an unprivileged user could thus create a pipe whose initial capacity exceeds the limit. Therefore, it seems logical to cap the initial pipe capacity according to the value of pipe-max-size. The test program shown earlier in this patch series can be used to demonstrate the effect of the change brought about with this patch: # cat /proc/sys/fs/pipe-max-size 1048576 # sudo -u mtk ./test_F_SETPIPE_SZ 1 Initial pipe capacity: 65536 # echo 10000 > /proc/sys/fs/pipe-max-size # cat /proc/sys/fs/pipe-max-size 16384 # sudo -u mtk ./test_F_SETPIPE_SZ 1 Initial pipe capacity: 16384 # ./test_F_SETPIPE_SZ 1 Initial pipe capacity: 65536 The last two executions of 'test_F_SETPIPE_SZ' show that pipe-max-size caps the initial allocation for a new pipe for unprivileged users, but not for privileged users. Link: http://lkml.kernel.org/r/31dc7064-2a17-9c5b-1df1-4e3012ee992c@gmail.com Signed-off-by: Michael Kerrisk Reviewed-by: Vegard Nossum Cc: Willy Tarreau Cc: Cc: Tetsuo Handa Cc: Jens Axboe Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/pipe.c b/fs/pipe.c index 5ff02c4..8e0d9f2 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -628,6 +628,9 @@ struct pipe_inode_info *alloc_pipe_info(void) if (pipe == NULL) goto out_free_uid; + if (pipe_bufs * PAGE_SIZE > pipe_max_size && !capable(CAP_SYS_RESOURCE)) + pipe_bufs = pipe_max_size >> PAGE_SHIFT; + user_bufs = account_pipe_buffers(user, 0, pipe_bufs); if (too_many_pipe_buffers_soft(user_bufs)) { -- cgit v0.10.2 From 0a5bf409d3eefc1ca64cedf0bc1c0673164cacc1 Mon Sep 17 00:00:00 2001 From: Ales Novak Date: Tue, 11 Oct 2016 13:53:46 -0700 Subject: ptrace: clear TIF_SYSCALL_TRACE on ptrace detach On __ptrace_detach(), called from do_exit()->exit_notify()-> forget_original_parent()->exit_ptrace(), the TIF_SYSCALL_TRACE in thread->flags of the tracee is not cleared up. This results in the tracehook_report_syscall_* being called (though there's no longer a tracer listening to that) upon its further syscalls. Example scenario - attach "strace" to a running process and kill it (the strace) with SIGKILL. You'll see that the syscall trace hooks are still being called. The clearing of this flag should be moved from ptrace_detach() to __ptrace_detach(). Link: http://lkml.kernel.org/r/1472759493-20554-1-git-send-email-alnovak@suse.cz Signed-off-by: Ales Novak Acked-by: Oleg Nesterov Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1d3b766..2a99027 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -73,6 +73,8 @@ void __ptrace_unlink(struct task_struct *child) { BUG_ON(!child->ptrace); + clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + child->parent = child->real_parent; list_del_init(&child->ptrace_entry); @@ -489,7 +491,6 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) /* Architecture-specific hardware disable .. */ ptrace_disable(child); - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); write_lock_irq(&tasklist_lock); /* -- cgit v0.10.2 From 7836a2d9803c6203d2922e7116307c9f0dfe87f9 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 11 Oct 2016 13:53:49 -0700 Subject: rapidio/rio_cm: use memdup_user() instead of duplicating code Fix coccinelle warning about duplicating existing memdup_user function. Link: http://lkml.kernel.org/r/20160811151737.20140-1-alexandre.bounine@idt.com Link: https://lkml.org/lkml/2016/8/11/29 Signed-off-by: Alexandre Bounine Reported-by: kbuild test robot Cc: Matt Porter Cc: Andre van Herk Cc: Barry Wood Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/rapidio/rio_cm.c b/drivers/rapidio/rio_cm.c index cebc296..bad0e0e 100644 --- a/drivers/rapidio/rio_cm.c +++ b/drivers/rapidio/rio_cm.c @@ -1841,24 +1841,19 @@ static int cm_chan_msg_send(void __user *arg) { struct rio_cm_msg msg; void *buf; - int ret = 0; + int ret; if (copy_from_user(&msg, arg, sizeof(msg))) return -EFAULT; if (msg.size > RIO_MAX_MSG_SIZE) return -EINVAL; - buf = kmalloc(msg.size, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - if (copy_from_user(buf, (void __user *)(uintptr_t)msg.msg, msg.size)) { - ret = -EFAULT; - goto out; - } + buf = memdup_user((void __user *)(uintptr_t)msg.msg, msg.size); + if (IS_ERR(buf)) + return PTR_ERR(buf); ret = riocm_ch_send(msg.ch_num, buf, msg.size); -out: + kfree(buf); return ret; } -- cgit v0.10.2 From 99fdafdeacfa99ca9047641b684fa2aaf094a661 Mon Sep 17 00:00:00 2001 From: Jason Cooper Date: Tue, 11 Oct 2016 13:53:52 -0700 Subject: random: simplify API for random address requests To date, all callers of randomize_range() have set the length to 0, and check for a zero return value. For the current callers, the only way to get zero returned is if end <= start. Since they are all adding a constant to the start address, this is unnecessary. We can remove a bunch of needless checks by simplifying the API to do just what everyone wants, return an address between [start, start + range). While we're here, s/get_random_int/get_random_long/. No current call site is adversely affected by get_random_int(), since all current range requests are < UINT_MAX. However, we should match caller expectations to avoid coming up short (ha!) in the future. All current callers to randomize_range() chose to use the start address if randomize_range() failed. Therefore, we simplify things by just returning the start address on error. randomize_range() will be removed once all callers have been converted over to randomize_addr(). Link: http://lkml.kernel.org/r/20160803233913.32511-2-jason@lakedaemon.net Signed-off-by: Jason Cooper Acked-by: Kees Cook Cc: Michael Ellerman Cc: "Roberts, William C" Cc: Yann Droneaud Cc: Russell King Cc: "Theodore Ts'o" Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Catalin Marinas Cc: Will Deacon Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H . Peter Anvin" Cc: Nick Kralevich Cc: Jeffrey Vander Stoep Cc: Daniel Cashman Cc: Chris Metcalf Cc: Guan Xuetao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/char/random.c b/drivers/char/random.c index 3efb3bf..40eb07e 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -2119,6 +2119,39 @@ randomize_range(unsigned long start, unsigned long end, unsigned long len) return PAGE_ALIGN(get_random_int() % range + start); } +/** + * randomize_page - Generate a random, page aligned address + * @start: The smallest acceptable address the caller will take. + * @range: The size of the area, starting at @start, within which the + * random address must fall. + * + * If @start + @range would overflow, @range is capped. + * + * NOTE: Historical use of randomize_range, which this replaces, presumed that + * @start was already page aligned. We now align it regardless. + * + * Return: A page aligned address within [start, start + range). On error, + * @start is returned. + */ +unsigned long +randomize_page(unsigned long start, unsigned long range) +{ + if (!PAGE_ALIGNED(start)) { + range -= PAGE_ALIGN(start) - start; + start = PAGE_ALIGN(start); + } + + if (start > ULONG_MAX - range) + range = ULONG_MAX - start; + + range >>= PAGE_SHIFT; + + if (range == 0) + return start; + + return start + (get_random_long() % range << PAGE_SHIFT); +} + /* Interface for in-kernel drivers of true hardware RNGs. * Those devices may produce endless random bits and will be throttled * when our pool is full. diff --git a/include/linux/random.h b/include/linux/random.h index 3d6e981..2674189 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -35,6 +35,7 @@ extern const struct file_operations random_fops, urandom_fops; unsigned int get_random_int(void); unsigned long get_random_long(void); unsigned long randomize_range(unsigned long start, unsigned long end, unsigned long len); +unsigned long randomize_page(unsigned long start, unsigned long range); u32 prandom_u32(void); void prandom_bytes(void *buf, size_t nbytes); -- cgit v0.10.2 From 9c6f0902a55226b08b9703408f04b258b9afa0bc Mon Sep 17 00:00:00 2001 From: Jason Cooper Date: Tue, 11 Oct 2016 13:53:56 -0700 Subject: x86: use simpler API for random address requests Currently, all callers to randomize_range() set the length to 0 and calculate end by adding a constant to the start address. We can simplify the API to remove a bunch of needless checks and variables. Use the new randomize_addr(start, range) call to set the requested address. Link: http://lkml.kernel.org/r/20160803233913.32511-3-jason@lakedaemon.net Signed-off-by: Jason Cooper Acked-by: Kees Cook Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H . Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 28cea78..0888a87 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -509,8 +509,7 @@ unsigned long arch_align_stack(unsigned long sp) unsigned long arch_randomize_brk(struct mm_struct *mm) { - unsigned long range_end = mm->brk + 0x02000000; - return randomize_range(mm->brk, range_end, 0) ? : mm->brk; + return randomize_page(mm->brk, 0x02000000); } /* diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 10e0272..a55ed63 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -101,7 +101,6 @@ static void find_start_end(unsigned long flags, unsigned long *begin, unsigned long *end) { if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) { - unsigned long new_begin; /* This is usually used needed to map code in small model, so it needs to be in the first 31bit. Limit it to that. This means we need to move the @@ -112,9 +111,7 @@ static void find_start_end(unsigned long flags, unsigned long *begin, *begin = 0x40000000; *end = 0x80000000; if (current->flags & PF_RANDOMIZE) { - new_begin = randomize_range(*begin, *begin + 0x02000000, 0); - if (new_begin) - *begin = new_begin; + *begin = randomize_page(*begin, 0x02000000); } } else { *begin = current->mm->mmap_legacy_base; -- cgit v0.10.2 From c984cbf2e34cd622b5531f776029f7b23ff17e50 Mon Sep 17 00:00:00 2001 From: Jason Cooper Date: Tue, 11 Oct 2016 13:53:59 -0700 Subject: ARM: use simpler API for random address requests Currently, all callers to randomize_range() set the length to 0 and calculate end by adding a constant to the start address. We can simplify the API to remove a bunch of needless checks and variables. Use the new randomize_addr(start, range) call to set the requested address. Link: http://lkml.kernel.org/r/20160803233913.32511-4-jason@lakedaemon.net Signed-off-by: Jason Cooper Acked-by: Kees Cook Cc: "Russell King - ARM Linux" Cc: "Theodore Ts'o" Cc: Catalin Marinas Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 612eb53..91d2d5b 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -318,8 +318,7 @@ unsigned long get_wchan(struct task_struct *p) unsigned long arch_randomize_brk(struct mm_struct *mm) { - unsigned long range_end = mm->brk + 0x02000000; - return randomize_range(mm->brk, range_end, 0) ? : mm->brk; + return randomize_page(mm->brk, 0x02000000); } #ifdef CONFIG_MMU -- cgit v0.10.2 From fa5114c78c596e977af04865b697b7fcb092a0fb Mon Sep 17 00:00:00 2001 From: Jason Cooper Date: Tue, 11 Oct 2016 13:54:02 -0700 Subject: arm64: use simpler API for random address requests Currently, all callers to randomize_range() set the length to 0 and calculate end by adding a constant to the start address. We can simplify the API to remove a bunch of needless checks and variables. Use the new randomize_addr(start, range) call to set the requested address. Link: http://lkml.kernel.org/r/20160803233913.32511-5-jason@lakedaemon.net Signed-off-by: Jason Cooper Acked-by: Will Deacon Acked-by: Kees Cook Cc: "Russell King - ARM Linux" Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index a4f5f76..27b2f13 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -372,12 +372,8 @@ unsigned long arch_align_stack(unsigned long sp) unsigned long arch_randomize_brk(struct mm_struct *mm) { - unsigned long range_end = mm->brk; - if (is_compat_task()) - range_end += 0x02000000; + return randomize_page(mm->brk, 0x02000000); else - range_end += 0x40000000; - - return randomize_range(mm->brk, range_end, 0) ? : mm->brk; + return randomize_page(mm->brk, 0x40000000); } -- cgit v0.10.2 From 09fddbaf90aa9fc0abd80332a191fbfd0158e22b Mon Sep 17 00:00:00 2001 From: Jason Cooper Date: Tue, 11 Oct 2016 13:54:05 -0700 Subject: tile: use simpler API for random address requests Currently, all callers to randomize_range() set the length to 0 and calculate end by adding a constant to the start address. We can simplify the API to remove a bunch of needless checks and variables. Use the new randomize_addr(start, range) call to set the requested address. Link: http://lkml.kernel.org/r/20160803233913.32511-6-jason@lakedaemon.net Signed-off-by: Jason Cooper Acked-by: Kees Cook Cc: "Theodore Ts'o" Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/tile/mm/mmap.c b/arch/tile/mm/mmap.c index 851a94e..ef61c59 100644 --- a/arch/tile/mm/mmap.c +++ b/arch/tile/mm/mmap.c @@ -88,6 +88,5 @@ void arch_pick_mmap_layout(struct mm_struct *mm) unsigned long arch_randomize_brk(struct mm_struct *mm) { - unsigned long range_end = mm->brk + 0x02000000; - return randomize_range(mm->brk, range_end, 0) ? : mm->brk; + return randomize_page(mm->brk, 0x02000000); } -- cgit v0.10.2 From 05c2679e9562f7e9ca11dacc91cf21b6c42d532d Mon Sep 17 00:00:00 2001 From: Jason Cooper Date: Tue, 11 Oct 2016 13:54:08 -0700 Subject: unicore32: use simpler API for random address requests Currently, all callers to randomize_range() set the length to 0 and calculate end by adding a constant to the start address. We can simplify the API to remove a bunch of needless checks and variables. Use the new randomize_addr(start, range) call to set the requested address. Link: http://lkml.kernel.org/r/20160803233913.32511-7-jason@lakedaemon.net Signed-off-by: Jason Cooper Acked-by: Kees Cook Cc: "Theodore Ts'o" Cc: Guan Xuetao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c index 00299c9..d7c6b67 100644 --- a/arch/unicore32/kernel/process.c +++ b/arch/unicore32/kernel/process.c @@ -295,8 +295,7 @@ unsigned long get_wchan(struct task_struct *p) unsigned long arch_randomize_brk(struct mm_struct *mm) { - unsigned long range_end = mm->brk + 0x02000000; - return randomize_range(mm->brk, range_end, 0) ? : mm->brk; + return randomize_page(mm->brk, 0x02000000); } /* -- cgit v0.10.2 From 7425154d3bbf5fcc7554738cab6dfac559ffbdda Mon Sep 17 00:00:00 2001 From: Jason Cooper Date: Tue, 11 Oct 2016 13:54:11 -0700 Subject: random: remove unused randomize_range() All call sites for randomize_range have been updated to use the much simpler and more robust randomize_addr(). Remove the now unnecessary code. Link: http://lkml.kernel.org/r/20160803233913.32511-8-jason@lakedaemon.net Signed-off-by: Jason Cooper Acked-by: Kees Cook Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/char/random.c b/drivers/char/random.c index 40eb07e..d131e15 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -2100,25 +2100,6 @@ unsigned long get_random_long(void) } EXPORT_SYMBOL(get_random_long); -/* - * randomize_range() returns a start address such that - * - * [...... .....] - * start end - * - * a with size "len" starting at the return value is inside in the - * area defined by [start, end], but is otherwise randomized. - */ -unsigned long -randomize_range(unsigned long start, unsigned long end, unsigned long len) -{ - unsigned long range = end - len - start; - - if (end <= start + len) - return 0; - return PAGE_ALIGN(get_random_int() % range + start); -} - /** * randomize_page - Generate a random, page aligned address * @start: The smallest acceptable address the caller will take. diff --git a/include/linux/random.h b/include/linux/random.h index 2674189..f7bb7a3 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -34,7 +34,6 @@ extern const struct file_operations random_fops, urandom_fops; unsigned int get_random_int(void); unsigned long get_random_long(void); -unsigned long randomize_range(unsigned long start, unsigned long end, unsigned long len); unsigned long randomize_page(unsigned long start, unsigned long range); u32 prandom_u32(void); -- cgit v0.10.2 From a9a62c9384417545620aee1b5ad1d9357350c17a Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Tue, 11 Oct 2016 13:54:14 -0700 Subject: dma-mapping: introduce the DMA_ATTR_NO_WARN attribute Introduce the DMA_ATTR_NO_WARN attribute, and document it. Link: http://lkml.kernel.org/r/1470092390-25451-2-git-send-email-mauricfo@linux.vnet.ibm.com Signed-off-by: Mauricio Faria de Oliveira Cc: Keith Busch Cc: Jens Axboe Cc: Benjamin Herrenschmidt Cc: Michael Ellerman Cc: Krzysztof Kozlowski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/DMA-attributes.txt b/Documentation/DMA-attributes.txt index 2d455a5..98bf7ac 100644 --- a/Documentation/DMA-attributes.txt +++ b/Documentation/DMA-attributes.txt @@ -126,3 +126,20 @@ means that we won't try quite as hard to get them. NOTE: At the moment DMA_ATTR_ALLOC_SINGLE_PAGES is only implemented on ARM, though ARM64 patches will likely be posted soon. + +DMA_ATTR_NO_WARN +---------------- + +This tells the DMA-mapping subsystem to suppress allocation failure reports +(similarly to __GFP_NOWARN). + +On some architectures allocation failures are reported with error messages +to the system logs. Although this can help to identify and debug problems, +drivers which handle failures (eg, retry later) have no problems with them, +and can actually flood the system logs with error messages that aren't any +problem at all, depending on the implementation of the retry mechanism. + +So, this provides a way for drivers to avoid those error messages on calls +where allocation failures are not a problem, and shouldn't bother the logs. + +NOTE: At the moment DMA_ATTR_NO_WARN is only implemented on PowerPC. diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 0f90eb5..08528af 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -56,6 +56,11 @@ * that gives better TLB efficiency. */ #define DMA_ATTR_ALLOC_SINGLE_PAGES (1UL << 7) +/* + * DMA_ATTR_NO_WARN: This tells the DMA-mapping subsystem to suppress + * allocation failure reports (similarly to __GFP_NOWARN). + */ +#define DMA_ATTR_NO_WARN (1UL << 8) /* * A dma_addr_t can hold any valid DMA or bus address for the platform. -- cgit v0.10.2 From af8a24988e48f9ed20acf4d5230ac216d5baf723 Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Tue, 11 Oct 2016 13:54:17 -0700 Subject: powerpc: implement the DMA_ATTR_NO_WARN attribute Add support for the DMA_ATTR_NO_WARN attribute on powerpc iommu code. Link: http://lkml.kernel.org/r/1470092390-25451-3-git-send-email-mauricfo@linux.vnet.ibm.com Signed-off-by: Mauricio Faria de Oliveira Acked-by: Michael Ellerman Cc: Keith Busch Cc: Jens Axboe Cc: Benjamin Herrenschmidt Cc: Krzysztof Kozlowski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 37d6e74..5f202a5 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -479,7 +479,8 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl, /* Handle failure */ if (unlikely(entry == DMA_ERROR_CODE)) { - if (printk_ratelimit()) + if (!(attrs & DMA_ATTR_NO_WARN) && + printk_ratelimit()) dev_info(dev, "iommu_alloc failed, tbl %p " "vaddr %lx npages %lu\n", tbl, vaddr, npages); @@ -776,7 +777,8 @@ dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl, mask >> tbl->it_page_shift, align, attrs); if (dma_handle == DMA_ERROR_CODE) { - if (printk_ratelimit()) { + if (!(attrs & DMA_ATTR_NO_WARN) && + printk_ratelimit()) { dev_info(dev, "iommu_alloc failed, tbl %p " "vaddr %p npages %d\n", tbl, vaddr, npages); -- cgit v0.10.2 From 2b6b535d9158b822a45080b3d6d5b2993fd49e5a Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Tue, 11 Oct 2016 13:54:20 -0700 Subject: nvme: use the DMA_ATTR_NO_WARN attribute Use the DMA_ATTR_NO_WARN attribute for the dma_map_sg() call of the nvme driver that returns BLK_MQ_RQ_QUEUE_BUSY (not for BLK_MQ_RQ_QUEUE_ERROR). Link: http://lkml.kernel.org/r/1470092390-25451-4-git-send-email-mauricfo@linux.vnet.ibm.com Signed-off-by: Mauricio Faria de Oliveira Reviewed-by: Gabriel Krisman Bertazi Cc: Keith Busch Cc: Jens Axboe Cc: Benjamin Herrenschmidt Cc: Michael Ellerman Cc: Krzysztof Kozlowski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 68ef187..0fc99f0 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -515,7 +515,8 @@ static int nvme_map_data(struct nvme_dev *dev, struct request *req, goto out; ret = BLK_MQ_RQ_QUEUE_BUSY; - if (!dma_map_sg(dev->dev, iod->sg, iod->nents, dma_dir)) + if (!dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir, + DMA_ATTR_NO_WARN)) goto out; if (!nvme_setup_prps(dev, req, size)) -- cgit v0.10.2 From 0ee59413c967c35a6dd2dbdab605b4cd42025ee5 Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Tue, 11 Oct 2016 13:54:23 -0700 Subject: x86/panic: replace smp_send_stop() with kdump friendly version in panic path Daniel Walker reported problems which happens when crash_kexec_post_notifiers kernel option is enabled (https://lkml.org/lkml/2015/6/24/44). In that case, smp_send_stop() is called before entering kdump routines which assume other CPUs are still online. As the result, for x86, kdump routines fail to save other CPUs' registers and disable virtualization extensions. To fix this problem, call a new kdump friendly function, crash_smp_send_stop(), instead of the smp_send_stop() when crash_kexec_post_notifiers is enabled. crash_smp_send_stop() is a weak function, and it just call smp_send_stop(). Architecture codes should override it so that kdump can work appropriately. This patch only provides x86-specific version. For Xen's PV kernel, just keep the current behavior. NOTES: - Right solution would be to place crash_smp_send_stop() before __crash_kexec() invocation in all cases and remove smp_send_stop(), but we can't do that until all architectures implement own crash_smp_send_stop() - crash_smp_send_stop()-like work is still needed by machine_crash_shutdown() because crash_kexec() can be called without entering panic() Fixes: f06e5153f4ae (kernel/panic.c: add "crash_kexec_post_notifiers" option) Link: http://lkml.kernel.org/r/20160810080948.11028.15344.stgit@sysi4-13.yrl.intra.hitachi.co.jp Signed-off-by: Hidehiro Kawai Reported-by: Daniel Walker Cc: Dave Young Cc: Baoquan He Cc: Vivek Goyal Cc: Eric Biederman Cc: Masami Hiramatsu Cc: Daniel Walker Cc: Xunlei Pang Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Borislav Petkov Cc: David Vrabel Cc: Toshi Kani Cc: Ralf Baechle Cc: David Daney Cc: Aaro Koskinen Cc: "Steven J. Hill" Cc: Corey Minyard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index d2434c1..282630e 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -210,6 +210,7 @@ struct kexec_entry64_regs { typedef void crash_vmclear_fn(void); extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss; +extern void kdump_nmi_shootdown_cpus(void); #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 19980b3..026ea82 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -47,6 +47,7 @@ struct smp_ops { void (*smp_cpus_done)(unsigned max_cpus); void (*stop_other_cpus)(int wait); + void (*crash_stop_other_cpus)(void); void (*smp_send_reschedule)(int cpu); int (*cpu_up)(unsigned cpu, struct task_struct *tidle); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 9616cf7..650830e 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -133,15 +133,31 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) disable_local_APIC(); } -static void kdump_nmi_shootdown_cpus(void) +void kdump_nmi_shootdown_cpus(void) { nmi_shootdown_cpus(kdump_nmi_callback); disable_local_APIC(); } +/* Override the weak function in kernel/panic.c */ +void crash_smp_send_stop(void) +{ + static int cpus_stopped; + + if (cpus_stopped) + return; + + if (smp_ops.crash_stop_other_cpus) + smp_ops.crash_stop_other_cpus(); + else + smp_send_stop(); + + cpus_stopped = 1; +} + #else -static void kdump_nmi_shootdown_cpus(void) +void crash_smp_send_stop(void) { /* There are no cpus to shootdown */ } @@ -160,7 +176,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) /* The kernel is broken so disable interrupts */ local_irq_disable(); - kdump_nmi_shootdown_cpus(); + crash_smp_send_stop(); /* * VMCLEAR VMCSs loaded on this cpu if needed. diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 658777c..68f8cc2 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -32,6 +32,8 @@ #include #include #include +#include + /* * Some notes on x86 processor bugs affecting SMP operation: * @@ -342,6 +344,9 @@ struct smp_ops smp_ops = { .smp_cpus_done = native_smp_cpus_done, .stop_other_cpus = native_stop_other_cpus, +#if defined(CONFIG_KEXEC_CORE) + .crash_stop_other_cpus = kdump_nmi_shootdown_cpus, +#endif .smp_send_reschedule = native_smp_send_reschedule, .cpu_up = native_cpu_up, diff --git a/kernel/panic.c b/kernel/panic.c index ca8cea1..e6480e2 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -71,6 +71,32 @@ void __weak nmi_panic_self_stop(struct pt_regs *regs) panic_smp_self_stop(); } +/* + * Stop other CPUs in panic. Architecture dependent code may override this + * with more suitable version. For example, if the architecture supports + * crash dump, it should save registers of each stopped CPU and disable + * per-CPU features such as virtualization extensions. + */ +void __weak crash_smp_send_stop(void) +{ + static int cpus_stopped; + + /* + * This function can be called twice in panic path, but obviously + * we execute this only once. + */ + if (cpus_stopped) + return; + + /* + * Note smp_send_stop is the usual smp shutdown function, which + * unfortunately means it may not be hardened to work in a panic + * situation. + */ + smp_send_stop(); + cpus_stopped = 1; +} + atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); /* @@ -164,14 +190,21 @@ void panic(const char *fmt, ...) if (!_crash_kexec_post_notifiers) { printk_nmi_flush_on_panic(); __crash_kexec(NULL); - } - /* - * Note smp_send_stop is the usual smp shutdown function, which - * unfortunately means it may not be hardened to work in a panic - * situation. - */ - smp_send_stop(); + /* + * Note smp_send_stop is the usual smp shutdown function, which + * unfortunately means it may not be hardened to work in a + * panic situation. + */ + smp_send_stop(); + } else { + /* + * If we want to do crash dump after notifier calls and + * kmsg_dump, we will need architecture dependent extra + * works in addition to stopping other CPUs. + */ + crash_smp_send_stop(); + } /* * Run any panic handlers, including those that might need to -- cgit v0.10.2 From 54c721b857fd45f3ad3bda695ee4f472518db02a Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Tue, 11 Oct 2016 13:54:26 -0700 Subject: mips/panic: replace smp_send_stop() with kdump friendly version in panic path Daniel Walker reported problems which happens when crash_kexec_post_notifiers kernel option is enabled (https://lkml.org/lkml/2015/6/24/44). In that case, smp_send_stop() is called before entering kdump routines which assume other CPUs are still online. As the result, kdump routines fail to save other CPUs' registers. Additionally for MIPS OCTEON, it misses to stop the watchdog timer. To fix this problem, call a new kdump friendly function, crash_smp_send_stop(), instead of the smp_send_stop() when crash_kexec_post_notifiers is enabled. crash_smp_send_stop() is a weak function, and it just call smp_send_stop(). Architecture codes should override it so that kdump can work appropriately. This patch provides MIPS version. Fixes: f06e5153f4ae (kernel/panic.c: add "crash_kexec_post_notifiers" option) Link: http://lkml.kernel.org/r/20160810080950.11028.28000.stgit@sysi4-13.yrl.intra.hitachi.co.jp Signed-off-by: Hidehiro Kawai Reported-by: Daniel Walker Cc: Dave Young Cc: Baoquan He Cc: Vivek Goyal Cc: Eric Biederman Cc: Masami Hiramatsu Cc: Daniel Walker Cc: Xunlei Pang Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Borislav Petkov Cc: David Vrabel Cc: Toshi Kani Cc: Ralf Baechle Cc: David Daney Cc: Aaro Koskinen Cc: "Steven J. Hill" Cc: Corey Minyard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/mips/cavium-octeon/setup.c b/arch/mips/cavium-octeon/setup.c index cb16fcc..5537f95 100644 --- a/arch/mips/cavium-octeon/setup.c +++ b/arch/mips/cavium-octeon/setup.c @@ -267,6 +267,17 @@ static void octeon_crash_shutdown(struct pt_regs *regs) default_machine_crash_shutdown(regs); } +#ifdef CONFIG_SMP +void octeon_crash_smp_send_stop(void) +{ + int cpu; + + /* disable watchdogs */ + for_each_online_cpu(cpu) + cvmx_write_csr(CVMX_CIU_WDOGX(cpu_logical_map(cpu)), 0); +} +#endif + #endif /* CONFIG_KEXEC */ #ifdef CONFIG_CAVIUM_RESERVE32 @@ -911,6 +922,9 @@ void __init prom_init(void) _machine_kexec_shutdown = octeon_shutdown; _machine_crash_shutdown = octeon_crash_shutdown; _machine_kexec_prepare = octeon_kexec_prepare; +#ifdef CONFIG_SMP + _crash_smp_send_stop = octeon_crash_smp_send_stop; +#endif #endif octeon_user_io_init(); diff --git a/arch/mips/include/asm/kexec.h b/arch/mips/include/asm/kexec.h index ee25ebb..493a3cc 100644 --- a/arch/mips/include/asm/kexec.h +++ b/arch/mips/include/asm/kexec.h @@ -45,6 +45,7 @@ extern const unsigned char kexec_smp_wait[]; extern unsigned long secondary_kexec_args[4]; extern void (*relocated_kexec_smp_wait) (void *); extern atomic_t kexec_ready_to_reboot; +extern void (*_crash_smp_send_stop)(void); #endif #endif diff --git a/arch/mips/kernel/crash.c b/arch/mips/kernel/crash.c index 610f0f3..1723b17 100644 --- a/arch/mips/kernel/crash.c +++ b/arch/mips/kernel/crash.c @@ -47,9 +47,14 @@ static void crash_shutdown_secondary(void *passed_regs) static void crash_kexec_prepare_cpus(void) { + static int cpus_stopped; unsigned int msecs; + unsigned int ncpus; - unsigned int ncpus = num_online_cpus() - 1;/* Excluding the panic cpu */ + if (cpus_stopped) + return; + + ncpus = num_online_cpus() - 1;/* Excluding the panic cpu */ dump_send_ipi(crash_shutdown_secondary); smp_wmb(); @@ -64,6 +69,17 @@ static void crash_kexec_prepare_cpus(void) cpu_relax(); mdelay(1); } + + cpus_stopped = 1; +} + +/* Override the weak function in kernel/panic.c */ +void crash_smp_send_stop(void) +{ + if (_crash_smp_send_stop) + _crash_smp_send_stop(); + + crash_kexec_prepare_cpus(); } #else /* !defined(CONFIG_SMP) */ diff --git a/arch/mips/kernel/machine_kexec.c b/arch/mips/kernel/machine_kexec.c index 50980bf3..5972520 100644 --- a/arch/mips/kernel/machine_kexec.c +++ b/arch/mips/kernel/machine_kexec.c @@ -25,6 +25,7 @@ void (*_machine_crash_shutdown)(struct pt_regs *regs) = NULL; #ifdef CONFIG_SMP void (*relocated_kexec_smp_wait) (void *); atomic_t kexec_ready_to_reboot = ATOMIC_INIT(0); +void (*_crash_smp_send_stop)(void) = NULL; #endif int -- cgit v0.10.2 From ab47deb6bb03ddf1c1e253bfbbdbf5f867975ca5 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 11 Oct 2016 13:54:30 -0700 Subject: pps: kc: fix non-tickless system config dependency CONFIG_NO_HZ currently only sets the default value of dynticks config so if PPS kernel consumer needs periodic timer ticks it should depend on !CONFIG_NO_HZ_COMMON instead of !CONFIG_NO_HZ. Otherwise it is possible to enable it even on tickless system which has CONFIG_NO_HZ not set and CONFIG_NO_HZ_IDLE (or CONFIG_NO_HZ_FULL) set. Link: http://lkml.kernel.org/r/57E2B769.50202@maciej.szmigiero.name Signed-off-by: Maciej S. Szmigiero Acked-by: Rodolfo Giometti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/pps/Kconfig b/drivers/pps/Kconfig index 7512e98..564a51a 100644 --- a/drivers/pps/Kconfig +++ b/drivers/pps/Kconfig @@ -31,7 +31,7 @@ config PPS_DEBUG config NTP_PPS bool "PPS kernel consumer support" - depends on !NO_HZ + depends on !NO_HZ_COMMON help This option adds support for direct in-kernel time synchronization using an external PPS signal. -- cgit v0.10.2 From 26b5679e437ef4f83db66437981c7c0d569973b1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 11 Oct 2016 13:54:33 -0700 Subject: relay: Use irq_work instead of plain timer for deferred wakeup Relay avoids calling wake_up_interruptible() for doing the wakeup of readers/consumers, waiting for the generation of new data, from the context of a process which produced the data. This is apparently done to prevent the possibility of a deadlock in case Scheduler itself is is generating data for the relay, after acquiring rq->lock. The following patch used a timer (to be scheduled at next jiffy), for delegating the wakeup to another context. commit 7c9cb38302e78d24e37f7d8a2ea7eed4ae5f2fa7 Author: Tom Zanussi Date: Wed May 9 02:34:01 2007 -0700 relay: use plain timer instead of delayed work relay doesn't need to use schedule_delayed_work() for waking readers when a simple timer will do. Scheduling a plain timer, at next jiffies boundary, to do the wakeup causes a significant wakeup latency for the Userspace client, which makes relay less suitable for the high-frequency low-payload use cases where the data gets generated at a very high rate, like multiple sub buffers getting filled within a milli second. Moreover the timer is re-scheduled on every newly produced sub buffer so the timer keeps getting pushed out if sub buffers are filled in a very quick succession (less than a jiffy gap between filling of 2 sub buffers). As a result relay runs out of sub buffers to store the new data. By using irq_work it is ensured that wakeup of userspace client, blocked in the poll call, is done at earliest (through self IPI or next timer tick) enabling it to always consume the data in time. Also this makes relay consistent with printk & ring buffers (trace), as they too use irq_work for deferred wake up of readers. [arnd@arndb.de: select CONFIG_IRQ_WORK] Link: http://lkml.kernel.org/r/20160912154035.3222156-1-arnd@arndb.de [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/1472906487-1559-1-git-send-email-akash.goel@intel.com Signed-off-by: Peter Zijlstra Signed-off-by: Akash Goel Cc: Tom Zanussi Cc: Chris Wilson Cc: Tvrtko Ursulin Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/relay.h b/include/linux/relay.h index ecbb34a..68c1448 100644 --- a/include/linux/relay.h +++ b/include/linux/relay.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -38,7 +39,7 @@ struct rchan_buf size_t subbufs_consumed; /* count of sub-buffers consumed */ struct rchan *chan; /* associated channel */ wait_queue_head_t read_wait; /* reader wait queue */ - struct timer_list timer; /* reader wake-up timer */ + struct irq_work wakeup_work; /* reader wakeup */ struct dentry *dentry; /* channel file dentry */ struct kref kref; /* channel buffer refcount */ struct page **page_array; /* array of current buffer pages */ diff --git a/init/Kconfig b/init/Kconfig index d7fc226..34407f1 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1288,6 +1288,7 @@ config SYSFS_DEPRECATED_V2 config RELAY bool "Kernel->user space relay support (formerly relayfs)" + select IRQ_WORK help This option enables support for relay interface support in certain file systems (such as debugfs). diff --git a/kernel/relay.c b/kernel/relay.c index 9988f5c..da79a10 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -328,13 +328,15 @@ static struct rchan_callbacks default_channel_callbacks = { /** * wakeup_readers - wake up readers waiting on a channel - * @data: contains the channel buffer + * @work: contains the channel buffer * - * This is the timer function used to defer reader waking. + * This is the function used to defer reader waking */ -static void wakeup_readers(unsigned long data) +static void wakeup_readers(struct irq_work *work) { - struct rchan_buf *buf = (struct rchan_buf *)data; + struct rchan_buf *buf; + + buf = container_of(work, struct rchan_buf, wakeup_work); wake_up_interruptible(&buf->read_wait); } @@ -352,9 +354,10 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) if (init) { init_waitqueue_head(&buf->read_wait); kref_init(&buf->kref); - setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); - } else - del_timer_sync(&buf->timer); + init_irq_work(&buf->wakeup_work, wakeup_readers); + } else { + irq_work_sync(&buf->wakeup_work); + } buf->subbufs_produced = 0; buf->subbufs_consumed = 0; @@ -487,7 +490,7 @@ free_buf: static void relay_close_buf(struct rchan_buf *buf) { buf->finalized = 1; - del_timer_sync(&buf->timer); + irq_work_sync(&buf->wakeup_work); buf->chan->cb->remove_buf_file(buf->dentry); kref_put(&buf->kref, relay_remove_buf); } @@ -754,14 +757,15 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) buf->early_bytes += buf->chan->subbuf_size - buf->padding[old_subbuf]; smp_mb(); - if (waitqueue_active(&buf->read_wait)) + if (waitqueue_active(&buf->read_wait)) { /* * Calling wake_up_interruptible() from here * will deadlock if we happen to be logging * from the scheduler (trying to re-grab * rq->lock), so defer it. */ - mod_timer(&buf->timer, jiffies + 1); + irq_work_queue(&buf->wakeup_work); + } } old = buf->data; -- cgit v0.10.2 From a2c6a235dbf4318fc7f7981932478e6c47f093ab Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 11 Oct 2016 13:54:36 -0700 Subject: config/android: Remove CONFIG_IPV6_PRIVACY Option is long gone, see commit 5d9efa7ee99e ("ipv6: Remove privacy config option.") Link: http://lkml.kernel.org/r/20160811170340.9859-1-bp@alien8.de Signed-off-by: Borislav Petkov Cc: Rob Herring Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index 9f748ed..fcf2de8 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -41,7 +41,6 @@ CONFIG_IPV6=y CONFIG_IPV6_MIP6=y CONFIG_IPV6_MULTIPLE_TABLES=y CONFIG_IPV6_OPTIMISTIC_DAD=y -CONFIG_IPV6_PRIVACY=y CONFIG_IPV6_ROUTER_PREF=y CONFIG_IPV6_ROUTE_INFO=y CONFIG_IP_ADVANCED_ROUTER=y -- cgit v0.10.2 From f023a3956f273859ed36f624f75a66c272124b16 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 11 Oct 2016 13:54:38 -0700 Subject: config: android: move device mapper options to recommended CONFIG_MD is in recommended, but other dependent options like DM_CRYPT and DM_VERITY options are in base. The result is the options in base don't get enabled when applying both base and recommended fragments. Move all the options to recommended. Link: http://lkml.kernel.org/r/20160908185934.18098-1-robh@kernel.org Signed-off-by: Rob Herring Acked-by: John Stultz Cc: Amit Pundir Cc: Dmitry Shmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index fcf2de8..da8b3e9 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -11,7 +11,6 @@ CONFIG_ANDROID_LOW_MEMORY_KILLER=y CONFIG_ARMV8_DEPRECATED=y CONFIG_ASHMEM=y CONFIG_AUDIT=y -CONFIG_BLK_DEV_DM=y CONFIG_BLK_DEV_INITRD=y CONFIG_CGROUPS=y CONFIG_CGROUP_CPUACCT=y @@ -19,9 +18,6 @@ CONFIG_CGROUP_DEBUG=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_SCHED=y CONFIG_CP15_BARRIER_EMULATION=y -CONFIG_DM_CRYPT=y -CONFIG_DM_VERITY=y -CONFIG_DM_VERITY_FEC=y CONFIG_EMBEDDED=y CONFIG_FB=y CONFIG_HIGH_RES_TIMERS=y diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index e3b953e..297756b 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -6,12 +6,16 @@ # CONFIG_PM_WAKELOCKS_GC is not set # CONFIG_VT is not set CONFIG_BACKLIGHT_LCD_SUPPORT=y +CONFIG_BLK_DEV_DM=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 CONFIG_COMPACTION=y CONFIG_DEBUG_RODATA=y +CONFIG_DM_CRYPT=y CONFIG_DM_UEVENT=y +CONFIG_DM_VERITY=y +CONFIG_DM_VERITY_FEC=y CONFIG_DRAGONRISE_FF=y CONFIG_ENABLE_DEFAULT_TRACERS=y CONFIG_EXT4_FS=y -- cgit v0.10.2 From d90ae51a3e7556c9f50431db43cd8190934ccf94 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 11 Oct 2016 13:54:41 -0700 Subject: config: android: set SELinux as default security mode Android won't boot without SELinux enabled, so make it the default. Link: http://lkml.kernel.org/r/20160908185934.18098-2-robh@kernel.org Signed-off-by: Rob Herring Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index da8b3e9..248070e 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -18,6 +18,7 @@ CONFIG_CGROUP_DEBUG=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_SCHED=y CONFIG_CP15_BARRIER_EMULATION=y +CONFIG_DEFAULT_SECURITY_SELINUX=y CONFIG_EMBEDDED=y CONFIG_FB=y CONFIG_HIGH_RES_TIMERS=y -- cgit v0.10.2 From 2489a1771aae86062c637f0eb7a06129681208b1 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 11 Oct 2016 13:54:44 -0700 Subject: config: android: enable CONFIG_SECCOMP As of Android N, SECCOMP is required. Without it, we will get mediaextractor error: E /system/bin/mediaextractor: libminijail: prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER): Invalid argument Link: http://lkml.kernel.org/r/20160908185934.18098-3-robh@kernel.org Signed-off-by: Rob Herring Acked-by: John Stultz Cc: Amit Pundir Cc: Dmitry Shmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index 248070e..1a8f34f 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -131,6 +131,7 @@ CONFIG_PREEMPT=y CONFIG_QUOTA=y CONFIG_RTC_CLASS=y CONFIG_RT_GROUP_SCHED=y +CONFIG_SECCOMP=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y -- cgit v0.10.2 From 65deb8af76defeae4b114a75242ed15b0bcba173 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Tue, 11 Oct 2016 13:54:47 -0700 Subject: kcov: do not instrument lib/stackdepot.c There's no point in collecting coverage from lib/stackdepot.c, as it is not a function of syscall inputs. Disabling kcov instrumentation for that file will reduce the coverage noise level. Link: http://lkml.kernel.org/r/1474640972-104131-1-git-send-email-glider@google.com Signed-off-by: Alexander Potapenko Acked-by: Dmitry Vyukov Cc: Kostya Serebryany Cc: Andrey Konovalov Cc: syzkaller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/Makefile b/lib/Makefile index f3ca8c0..50144a3 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -180,6 +180,7 @@ obj-$(CONFIG_IRQ_POLL) += irq_poll.o obj-$(CONFIG_STACKDEPOT) += stackdepot.o KASAN_SANITIZE_stackdepot.o := n +KCOV_INSTRUMENT_stackdepot.o := n libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o \ fdt_empty_tree.o -- cgit v0.10.2 From 5864a2fd3088db73d47942370d0f7210a807b9bc Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Tue, 11 Oct 2016 13:54:50 -0700 Subject: ipc/sem.c: fix complex_count vs. simple op race Commit 6d07b68ce16a ("ipc/sem.c: optimize sem_lock()") introduced a race: sem_lock has a fast path that allows parallel simple operations. There are two reasons why a simple operation cannot run in parallel: - a non-simple operations is ongoing (sma->sem_perm.lock held) - a complex operation is sleeping (sma->complex_count != 0) As both facts are stored independently, a thread can bypass the current checks by sleeping in the right positions. See below for more details (or kernel bugzilla 105651). The patch fixes that by creating one variable (complex_mode) that tracks both reasons why parallel operations are not possible. The patch also updates stale documentation regarding the locking. With regards to stable kernels: The patch is required for all kernels that include the commit 6d07b68ce16a ("ipc/sem.c: optimize sem_lock()") (3.10?) The alternative is to revert the patch that introduced the race. The patch is safe for backporting, i.e. it makes no assumptions about memory barriers in spin_unlock_wait(). Background: Here is the race of the current implementation: Thread A: (simple op) - does the first "sma->complex_count == 0" test Thread B: (complex op) - does sem_lock(): This includes an array scan. But the scan can't find Thread A, because Thread A does not own sem->lock yet. - the thread does the operation, increases complex_count, drops sem_lock, sleeps Thread A: - spin_lock(&sem->lock), spin_is_locked(sma->sem_perm.lock) - sleeps before the complex_count test Thread C: (complex op) - does sem_lock (no array scan, complex_count==1) - wakes up Thread B. - decrements complex_count Thread A: - does the complex_count test Bug: Now both thread A and thread C operate on the same array, without any synchronization. Fixes: 6d07b68ce16a ("ipc/sem.c: optimize sem_lock()") Link: http://lkml.kernel.org/r/1469123695-5661-1-git-send-email-manfred@colorfullife.com Reported-by: Cc: "H. Peter Anvin" Cc: Peter Zijlstra Cc: Davidlohr Bueso Cc: Thomas Gleixner Cc: Ingo Molnar Cc: <1vier1@web.de> Cc: [3.10+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/sem.h b/include/linux/sem.h index 976ce3a..d0efd6e 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -21,6 +21,7 @@ struct sem_array { struct list_head list_id; /* undo requests on this array */ int sem_nsems; /* no. of semaphores in array */ int complex_count; /* pending complex operations */ + bool complex_mode; /* no parallel simple ops */ }; #ifdef CONFIG_SYSVIPC diff --git a/ipc/sem.c b/ipc/sem.c index 7c9d4f7..5e318c5 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -162,14 +162,21 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it); /* * Locking: + * a) global sem_lock() for read/write * sem_undo.id_next, * sem_array.complex_count, - * sem_array.pending{_alter,_cont}, - * sem_array.sem_undo: global sem_lock() for read/write - * sem_undo.proc_next: only "current" is allowed to read/write that field. + * sem_array.complex_mode + * sem_array.pending{_alter,_const}, + * sem_array.sem_undo * + * b) global or semaphore sem_lock() for read/write: * sem_array.sem_base[i].pending_{const,alter}: - * global or semaphore sem_lock() for read/write + * sem_array.complex_mode (for read) + * + * c) special: + * sem_undo_list.list_proc: + * * undo_list->lock for write + * * rcu for read */ #define sc_semmsl sem_ctls[0] @@ -260,30 +267,61 @@ static void sem_rcu_free(struct rcu_head *head) } /* - * Wait until all currently ongoing simple ops have completed. + * Enter the mode suitable for non-simple operations: * Caller must own sem_perm.lock. - * New simple ops cannot start, because simple ops first check - * that sem_perm.lock is free. - * that a) sem_perm.lock is free and b) complex_count is 0. */ -static void sem_wait_array(struct sem_array *sma) +static void complexmode_enter(struct sem_array *sma) { int i; struct sem *sem; - if (sma->complex_count) { - /* The thread that increased sma->complex_count waited on - * all sem->lock locks. Thus we don't need to wait again. - */ + if (sma->complex_mode) { + /* We are already in complex_mode. Nothing to do */ return; } + /* We need a full barrier after seting complex_mode: + * The write to complex_mode must be visible + * before we read the first sem->lock spinlock state. + */ + smp_store_mb(sma->complex_mode, true); + for (i = 0; i < sma->sem_nsems; i++) { sem = sma->sem_base + i; spin_unlock_wait(&sem->lock); } + /* + * spin_unlock_wait() is not a memory barriers, it is only a + * control barrier. The code must pair with spin_unlock(&sem->lock), + * thus just the control barrier is insufficient. + * + * smp_rmb() is sufficient, as writes cannot pass the control barrier. + */ + smp_rmb(); +} + +/* + * Try to leave the mode that disallows simple operations: + * Caller must own sem_perm.lock. + */ +static void complexmode_tryleave(struct sem_array *sma) +{ + if (sma->complex_count) { + /* Complex ops are sleeping. + * We must stay in complex mode + */ + return; + } + /* + * Immediately after setting complex_mode to false, + * a simple op can start. Thus: all memory writes + * performed by the current operation must be visible + * before we set complex_mode to false. + */ + smp_store_release(&sma->complex_mode, false); } +#define SEM_GLOBAL_LOCK (-1) /* * If the request contains only one semaphore operation, and there are * no complex transactions pending, lock only the semaphore involved. @@ -300,56 +338,42 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, /* Complex operation - acquire a full lock */ ipc_lock_object(&sma->sem_perm); - /* And wait until all simple ops that are processed - * right now have dropped their locks. - */ - sem_wait_array(sma); - return -1; + /* Prevent parallel simple ops */ + complexmode_enter(sma); + return SEM_GLOBAL_LOCK; } /* * Only one semaphore affected - try to optimize locking. - * The rules are: - * - optimized locking is possible if no complex operation - * is either enqueued or processed right now. - * - The test for enqueued complex ops is simple: - * sma->complex_count != 0 - * - Testing for complex ops that are processed right now is - * a bit more difficult. Complex ops acquire the full lock - * and first wait that the running simple ops have completed. - * (see above) - * Thus: If we own a simple lock and the global lock is free - * and complex_count is now 0, then it will stay 0 and - * thus just locking sem->lock is sufficient. + * Optimized locking is possible if no complex operation + * is either enqueued or processed right now. + * + * Both facts are tracked by complex_mode. */ sem = sma->sem_base + sops->sem_num; - if (sma->complex_count == 0) { + /* + * Initial check for complex_mode. Just an optimization, + * no locking, no memory barrier. + */ + if (!sma->complex_mode) { /* * It appears that no complex operation is around. * Acquire the per-semaphore lock. */ spin_lock(&sem->lock); - /* Then check that the global lock is free */ - if (!spin_is_locked(&sma->sem_perm.lock)) { - /* - * We need a memory barrier with acquire semantics, - * otherwise we can race with another thread that does: - * complex_count++; - * spin_unlock(sem_perm.lock); - */ - smp_acquire__after_ctrl_dep(); + /* + * See 51d7d5205d33 + * ("powerpc: Add smp_mb() to arch_spin_is_locked()"): + * A full barrier is required: the write of sem->lock + * must be visible before the read is executed + */ + smp_mb(); - /* - * Now repeat the test of complex_count: - * It can't change anymore until we drop sem->lock. - * Thus: if is now 0, then it will stay 0. - */ - if (sma->complex_count == 0) { - /* fast path successful! */ - return sops->sem_num; - } + if (!smp_load_acquire(&sma->complex_mode)) { + /* fast path successful! */ + return sops->sem_num; } spin_unlock(&sem->lock); } @@ -369,15 +393,16 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, /* Not a false alarm, thus complete the sequence for a * full lock. */ - sem_wait_array(sma); - return -1; + complexmode_enter(sma); + return SEM_GLOBAL_LOCK; } } static inline void sem_unlock(struct sem_array *sma, int locknum) { - if (locknum == -1) { + if (locknum == SEM_GLOBAL_LOCK) { unmerge_queues(sma); + complexmode_tryleave(sma); ipc_unlock_object(&sma->sem_perm); } else { struct sem *sem = sma->sem_base + locknum; @@ -529,6 +554,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) } sma->complex_count = 0; + sma->complex_mode = true; /* dropped by sem_unlock below */ INIT_LIST_HEAD(&sma->pending_alter); INIT_LIST_HEAD(&sma->pending_const); INIT_LIST_HEAD(&sma->list_id); @@ -2184,10 +2210,10 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it) /* * The proc interface isn't aware of sem_lock(), it calls * ipc_lock_object() directly (in sysvipc_find_ipc). - * In order to stay compatible with sem_lock(), we must wait until - * all simple semop() calls have left their critical regions. + * In order to stay compatible with sem_lock(), we must + * enter / leave complex_mode. */ - sem_wait_array(sma); + complexmode_enter(sma); sem_otime = get_semotime(sma); @@ -2204,6 +2230,8 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it) sem_otime, sma->sem_ctime); + complexmode_tryleave(sma); + return 0; } #endif -- cgit v0.10.2 From ee51636ca54f9d4d01ae49b1740742e9db54d868 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 11 Oct 2016 13:54:53 -0700 Subject: ipc/msg: implement lockless pipelined wakeups This patch moves the wakeup_process() invocation so it is not done under the ipc global lock by making use of a lockless wake_q. With this change, the waiter is woken up once the message has been assigned and it does not need to loop on SMP if the message points to NULL. In the signal case we still need to check the pointer under the lock to verify the state. This change should also avoid the introduction of preempt_disable() in -RT which avoids a busy-loop which pools for the NULL -> !NULL change if the waiter has a higher priority compared to the waker. By making use of wake_qs, the logic of sysv msg queues is greatly simplified (and very well suited as we can batch lockless wakeups), particularly around the lockless receive algorithm. This has been tested with Manred's pmsg-shared tool on a "AMD A10-7800 Radeon R7, 12 Compute Cores 4C+8G": test | before | after | diff -----------------|------------|------------|---------- pmsg-shared 8 60 | 19,347,422 | 30,442,191 | + ~57.34 % pmsg-shared 4 60 | 21,367,197 | 35,743,458 | + ~67.28 % pmsg-shared 2 60 | 22,884,224 | 24,278,200 | + ~6.09 % Link: http://lkml.kernel.org/r/1469748819-19484-2-git-send-email-dave@stgolabs.net Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/ipc/msg.c b/ipc/msg.c index c6521c2..b1fb06a 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -51,13 +51,7 @@ struct msg_receiver { long r_msgtype; long r_maxsize; - /* - * Mark r_msg volatile so that the compiler - * does not try to get smart and optimize - * it. We rely on this for the lockless - * receive algorithm. - */ - struct msg_msg *volatile r_msg; + struct msg_msg *r_msg; }; /* one msg_sender for each sleeping sender */ @@ -183,21 +177,14 @@ static void ss_wakeup(struct list_head *h, int kill) } } -static void expunge_all(struct msg_queue *msq, int res) +static void expunge_all(struct msg_queue *msq, int res, + struct wake_q_head *wake_q) { struct msg_receiver *msr, *t; list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) { - msr->r_msg = NULL; /* initialize expunge ordering */ - wake_up_process(msr->r_tsk); - /* - * Ensure that the wakeup is visible before setting r_msg as - * the receiving end depends on it: either spinning on a nil, - * or dealing with -EAGAIN cases. See lockless receive part 1 - * and 2 in do_msgrcv(). - */ - smp_wmb(); /* barrier (B) */ - msr->r_msg = ERR_PTR(res); + wake_q_add(wake_q, msr->r_tsk); + WRITE_ONCE(msr->r_msg, ERR_PTR(res)); } } @@ -213,11 +200,13 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) { struct msg_msg *msg, *t; struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm); + WAKE_Q(wake_q); - expunge_all(msq, -EIDRM); + expunge_all(msq, -EIDRM, &wake_q); ss_wakeup(&msq->q_senders, 1); msg_rmid(ns, msq); ipc_unlock_object(&msq->q_perm); + wake_up_q(&wake_q); rcu_read_unlock(); list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) { @@ -342,6 +331,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, struct kern_ipc_perm *ipcp; struct msqid64_ds uninitialized_var(msqid64); struct msg_queue *msq; + WAKE_Q(wake_q); int err; if (cmd == IPC_SET) { @@ -389,7 +379,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, /* sleeping receivers might be excluded by * stricter permissions. */ - expunge_all(msq, -EAGAIN); + expunge_all(msq, -EAGAIN, &wake_q); /* sleeping senders might be able to send * due to a larger queue size. */ @@ -402,6 +392,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, out_unlock0: ipc_unlock_object(&msq->q_perm); + wake_up_q(&wake_q); out_unlock1: rcu_read_unlock(); out_up: @@ -566,7 +557,8 @@ static int testmsg(struct msg_msg *msg, long type, int mode) return 0; } -static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg) +static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg, + struct wake_q_head *wake_q) { struct msg_receiver *msr, *t; @@ -577,27 +569,14 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg) list_del(&msr->r_list); if (msr->r_maxsize < msg->m_ts) { - /* initialize pipelined send ordering */ - msr->r_msg = NULL; - wake_up_process(msr->r_tsk); - /* barrier (B) see barrier comment below */ - smp_wmb(); - msr->r_msg = ERR_PTR(-E2BIG); + wake_q_add(wake_q, msr->r_tsk); + WRITE_ONCE(msr->r_msg, ERR_PTR(-E2BIG)); } else { - msr->r_msg = NULL; msq->q_lrpid = task_pid_vnr(msr->r_tsk); msq->q_rtime = get_seconds(); - wake_up_process(msr->r_tsk); - /* - * Ensure that the wakeup is visible before - * setting r_msg, as the receiving can otherwise - * exit - once r_msg is set, the receiver can - * continue. See lockless receive part 1 and 2 - * in do_msgrcv(). Barrier (B). - */ - smp_wmb(); - msr->r_msg = msg; + wake_q_add(wake_q, msr->r_tsk); + WRITE_ONCE(msr->r_msg, msg); return 1; } } @@ -613,6 +592,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, struct msg_msg *msg; int err; struct ipc_namespace *ns; + WAKE_Q(wake_q); ns = current->nsproxy->ipc_ns; @@ -686,7 +666,6 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, err = -EIDRM; goto out_unlock0; } - ss_del(&s); if (signal_pending(current)) { @@ -698,7 +677,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, msq->q_lspid = task_tgid_vnr(current); msq->q_stime = get_seconds(); - if (!pipelined_send(msq, msg)) { + if (!pipelined_send(msq, msg, &wake_q)) { /* no one is waiting for this message, enqueue it */ list_add_tail(&msg->m_list, &msq->q_messages); msq->q_cbytes += msgsz; @@ -712,6 +691,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, out_unlock0: ipc_unlock_object(&msq->q_perm); + wake_up_q(&wake_q); out_unlock1: rcu_read_unlock(); if (msg != NULL) @@ -919,71 +899,38 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl rcu_read_unlock(); schedule(); - /* Lockless receive, part 1: - * Disable preemption. We don't hold a reference to the queue - * and getting a reference would defeat the idea of a lockless - * operation, thus the code relies on rcu to guarantee the - * existence of msq: + /* + * Lockless receive, part 1: + * We don't hold a reference to the queue and getting a + * reference would defeat the idea of a lockless operation, + * thus the code relies on rcu to guarantee the existence of + * msq: * Prior to destruction, expunge_all(-EIRDM) changes r_msg. * Thus if r_msg is -EAGAIN, then the queue not yet destroyed. - * rcu_read_lock() prevents preemption between reading r_msg - * and acquiring the q_perm.lock in ipc_lock_object(). */ rcu_read_lock(); - /* Lockless receive, part 2: - * Wait until pipelined_send or expunge_all are outside of - * wake_up_process(). There is a race with exit(), see - * ipc/mqueue.c for the details. The correct serialization - * ensures that a receiver cannot continue without the wakeup - * being visibible _before_ setting r_msg: - * - * CPU 0 CPU 1 - * - * smp_rmb(); (A) <-- pair -. - * r_msg> | msr->r_msg = NULL; - * | wake_up_process(); - * `------> smp_wmb(); (B) - * msr->r_msg = msg; + /* + * Lockless receive, part 2: + * The work in pipelined_send() and expunge_all(): + * - Set pointer to message + * - Queue the receiver task for later wakeup + * - Wake up the process after the lock is dropped. * - * Where (A) orders the message value read and where (B) orders - * the write to the r_msg -- done in both pipelined_send and - * expunge_all. - */ - for (;;) { - /* - * Pairs with writer barrier in pipelined_send - * or expunge_all. - */ - smp_rmb(); /* barrier (A) */ - msg = (struct msg_msg *)msr_d.r_msg; - if (msg) - break; - - /* - * The cpu_relax() call is a compiler barrier - * which forces everything in this loop to be - * re-loaded. - */ - cpu_relax(); - } - - /* Lockless receive, part 3: - * If there is a message or an error then accept it without - * locking. + * Should the process wake up before this wakeup (due to a + * signal) it will either see the message and continue ... */ + msg = READ_ONCE(msr_d.r_msg); if (msg != ERR_PTR(-EAGAIN)) goto out_unlock1; - /* Lockless receive, part 3: - * Acquire the queue spinlock. - */ + /* + * ... or see -EAGAIN, acquire the lock to check the message + * again. + */ ipc_lock_object(&msq->q_perm); - /* Lockless receive, part 4: - * Repeat test after acquiring the spinlock. - */ - msg = (struct msg_msg *)msr_d.r_msg; + msg = msr_d.r_msg; if (msg != ERR_PTR(-EAGAIN)) goto out_unlock0; -- cgit v0.10.2 From e3658538bf3727383b4e563fbab83c04d615508a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 11 Oct 2016 13:54:56 -0700 Subject: ipc/msg: batch queue sender wakeups Currently the use of wake_qs in sysv msg queues are only for the receiver tasks that are blocked on the queue. But blocked sender tasks (due to queue size constraints) still are awoken with the ipc object lock held, which can be a problem particularly for small sized queues and far from gracious for -rt (just like it was for the receiver side). The paths that actually wakeup a sender are obviously related to when we are either getting rid of the queue or after (some) space is freed-up after a receiver takes the msg (msgrcv). Furthermore, with the exception of msgrcv, we can always piggy-back on expunge_all that has its own tasks lined-up for waking. Finally, upon unlinking the message, it should be no problem delaying the wakeups a bit until after we've released the lock. Link: http://lkml.kernel.org/r/1469748819-19484-3-git-send-email-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Cc: Manfred Spraul Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/ipc/msg.c b/ipc/msg.c index b1fb06a..d320024 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -166,14 +166,15 @@ static inline void ss_del(struct msg_sender *mss) list_del(&mss->list); } -static void ss_wakeup(struct list_head *h, int kill) +static void ss_wakeup(struct list_head *h, + struct wake_q_head *wake_q, int kill) { struct msg_sender *mss, *t; list_for_each_entry_safe(mss, t, h, list) { if (kill) mss->list.next = NULL; - wake_up_process(mss->tsk); + wake_q_add(wake_q, mss->tsk); } } @@ -203,7 +204,7 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) WAKE_Q(wake_q); expunge_all(msq, -EIDRM, &wake_q); - ss_wakeup(&msq->q_senders, 1); + ss_wakeup(&msq->q_senders, &wake_q, 1); msg_rmid(ns, msq); ipc_unlock_object(&msq->q_perm); wake_up_q(&wake_q); @@ -331,7 +332,6 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, struct kern_ipc_perm *ipcp; struct msqid64_ds uninitialized_var(msqid64); struct msg_queue *msq; - WAKE_Q(wake_q); int err; if (cmd == IPC_SET) { @@ -362,6 +362,9 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, freeque(ns, ipcp); goto out_up; case IPC_SET: + { + WAKE_Q(wake_q); + if (msqid64.msg_qbytes > ns->msg_ctlmnb && !capable(CAP_SYS_RESOURCE)) { err = -EPERM; @@ -376,15 +379,21 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, msq->q_qbytes = msqid64.msg_qbytes; msq->q_ctime = get_seconds(); - /* sleeping receivers might be excluded by + /* + * Sleeping receivers might be excluded by * stricter permissions. */ expunge_all(msq, -EAGAIN, &wake_q); - /* sleeping senders might be able to send + /* + * Sleeping senders might be able to send * due to a larger queue size. */ - ss_wakeup(&msq->q_senders, 0); - break; + ss_wakeup(&msq->q_senders, &wake_q, 0); + ipc_unlock_object(&msq->q_perm); + wake_up_q(&wake_q); + + goto out_unlock1; + } default: err = -EINVAL; goto out_unlock1; @@ -392,7 +401,6 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, out_unlock0: ipc_unlock_object(&msq->q_perm); - wake_up_q(&wake_q); out_unlock1: rcu_read_unlock(); out_up: @@ -809,6 +817,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl struct msg_queue *msq; struct ipc_namespace *ns; struct msg_msg *msg, *copy = NULL; + WAKE_Q(wake_q); ns = current->nsproxy->ipc_ns; @@ -873,7 +882,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl msq->q_cbytes -= msg->m_ts; atomic_sub(msg->m_ts, &ns->msg_bytes); atomic_dec(&ns->msg_hdrs); - ss_wakeup(&msq->q_senders, 0); + ss_wakeup(&msq->q_senders, &wake_q, 0); goto out_unlock0; } @@ -945,6 +954,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl out_unlock0: ipc_unlock_object(&msq->q_perm); + wake_up_q(&wake_q); out_unlock1: rcu_read_unlock(); if (IS_ERR(msg)) { -- cgit v0.10.2 From d0d6a2a95e80e63827ea1ca184754a990438c072 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 11 Oct 2016 13:54:59 -0700 Subject: ipc/msg: make ss_wakeup() kill arg boolean ... 'tis annoying. Link: http://lkml.kernel.org/r/1469748819-19484-4-git-send-email-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Cc: Manfred Spraul Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/ipc/msg.c b/ipc/msg.c index d320024..3c44bbc 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -167,7 +167,7 @@ static inline void ss_del(struct msg_sender *mss) } static void ss_wakeup(struct list_head *h, - struct wake_q_head *wake_q, int kill) + struct wake_q_head *wake_q, bool kill) { struct msg_sender *mss, *t; @@ -204,7 +204,7 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) WAKE_Q(wake_q); expunge_all(msq, -EIDRM, &wake_q); - ss_wakeup(&msq->q_senders, &wake_q, 1); + ss_wakeup(&msq->q_senders, &wake_q, true); msg_rmid(ns, msq); ipc_unlock_object(&msq->q_perm); wake_up_q(&wake_q); @@ -388,7 +388,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, * Sleeping senders might be able to send * due to a larger queue size. */ - ss_wakeup(&msq->q_senders, &wake_q, 0); + ss_wakeup(&msq->q_senders, &wake_q, false); ipc_unlock_object(&msq->q_perm); wake_up_q(&wake_q); @@ -882,7 +882,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl msq->q_cbytes -= msg->m_ts; atomic_sub(msg->m_ts, &ns->msg_bytes); atomic_dec(&ns->msg_hdrs); - ss_wakeup(&msq->q_senders, &wake_q, 0); + ss_wakeup(&msq->q_senders, &wake_q, false); goto out_unlock0; } -- cgit v0.10.2 From ed27f9122c541a1720898739ac55f824f820b7ff Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 11 Oct 2016 13:55:02 -0700 Subject: ipc/msg: avoid waking sender upon full queue Blocked tasks queued in q_senders waiting for their message to fit in the queue are blindly awoken every time we think there's a remote chance this might happen. This could cause numerous (and expensive -- thundering herd-ish) bogus wakeups if the queue is still really full. Adding to the scheduling cost/overhead, there's also the fact that we need to take the ipc object lock and requeue ourselves in the q_senders list. By keeping track of the blocked sender's message size, we can know previously if the wakeup ought to occur or not. Otherwise, to maintain the current wakeup order we just move it to the tail. This is exactly what occurs right now if the sender needs to go back to sleep. The case of EIDRM is left completely untouched, as we need to wakeup all the tasks, and shouldn't be playing games in the first place. This patch was seen to save on the 'msgctl10' ltp testcase ~15% in context switches (avg out of ten runs). Although these tests are really about functionality (as opposed to performance), is does show the direct benefits of the optimization. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/1469748819-19484-6-git-send-email-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Cc: Manfred Spraul Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/ipc/msg.c b/ipc/msg.c index 3c44bbc..e12307d 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -58,6 +58,7 @@ struct msg_receiver { struct msg_sender { struct list_head list; struct task_struct *tsk; + size_t msgsz; }; #define SEARCH_ANY 1 @@ -153,27 +154,60 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params) return msq->q_perm.id; } -static inline void ss_add(struct msg_queue *msq, struct msg_sender *mss) +static inline bool msg_fits_inqueue(struct msg_queue *msq, size_t msgsz) +{ + return msgsz + msq->q_cbytes <= msq->q_qbytes && + 1 + msq->q_qnum <= msq->q_qbytes; +} + +static inline void ss_add(struct msg_queue *msq, + struct msg_sender *mss, size_t msgsz) { mss->tsk = current; + mss->msgsz = msgsz; __set_current_state(TASK_INTERRUPTIBLE); list_add_tail(&mss->list, &msq->q_senders); } static inline void ss_del(struct msg_sender *mss) { - if (mss->list.next != NULL) + if (mss->list.next) list_del(&mss->list); } -static void ss_wakeup(struct list_head *h, +static void ss_wakeup(struct msg_queue *msq, struct wake_q_head *wake_q, bool kill) { struct msg_sender *mss, *t; + struct task_struct *stop_tsk = NULL; + struct list_head *h = &msq->q_senders; list_for_each_entry_safe(mss, t, h, list) { if (kill) mss->list.next = NULL; + + /* + * Stop at the first task we don't wakeup, + * we've already iterated the original + * sender queue. + */ + else if (stop_tsk == mss->tsk) + break; + /* + * We are not in an EIDRM scenario here, therefore + * verify that we really need to wakeup the task. + * To maintain current semantics and wakeup order, + * move the sender to the tail on behalf of the + * blocked task. + */ + else if (!msg_fits_inqueue(msq, mss->msgsz)) { + if (!stop_tsk) + stop_tsk = mss->tsk; + + list_move_tail(&mss->list, &msq->q_senders); + continue; + } + wake_q_add(wake_q, mss->tsk); } } @@ -204,7 +238,7 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) WAKE_Q(wake_q); expunge_all(msq, -EIDRM, &wake_q); - ss_wakeup(&msq->q_senders, &wake_q, true); + ss_wakeup(msq, &wake_q, true); msg_rmid(ns, msq); ipc_unlock_object(&msq->q_perm); wake_up_q(&wake_q); @@ -388,7 +422,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, * Sleeping senders might be able to send * due to a larger queue size. */ - ss_wakeup(&msq->q_senders, &wake_q, false); + ss_wakeup(msq, &wake_q, false); ipc_unlock_object(&msq->q_perm); wake_up_q(&wake_q); @@ -642,10 +676,8 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, if (err) goto out_unlock0; - if (msgsz + msq->q_cbytes <= msq->q_qbytes && - 1 + msq->q_qnum <= msq->q_qbytes) { + if (msg_fits_inqueue(msq, msgsz)) break; - } /* queue full, wait: */ if (msgflg & IPC_NOWAIT) { @@ -654,7 +686,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, } /* enqueue the sender and prepare to block */ - ss_add(msq, &s); + ss_add(msq, &s, msgsz); if (!ipc_rcu_getref(msq)) { err = -EIDRM; @@ -682,6 +714,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, } } + msq->q_lspid = task_tgid_vnr(current); msq->q_stime = get_seconds(); @@ -882,7 +915,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl msq->q_cbytes -= msg->m_ts; atomic_sub(msg->m_ts, &ns->msg_bytes); atomic_dec(&ns->msg_hdrs); - ss_wakeup(&msq->q_senders, &wake_q, false); + ss_wakeup(msq, &wake_q, false); goto out_unlock0; } -- cgit v0.10.2 From 2a1613a586de91457fa93c3e468a6e2438fe52a0 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 11 Oct 2016 13:55:05 -0700 Subject: ipc/sem.c: add cond_resched in exit_sme In CONFIG_PREEMPT=n kernel a softlockup was observed while the for loop in exit_sem. Apparently it's possible for the loop to take quite a long time and it doesn't have a scheduling point in it. Since the codes is executing under an rcu read section this may also cause rcu stalls, which in turn block synchronize_rcu operations, which more or less de-stabilises the whole system. Fix this by introducing a cond_resched() at the beginning of the loop. So this patch fixes the following: NMI watchdog: BUG: soft lockup - CPU#10 stuck for 23s! [httpd:18119] CPU: 10 PID: 18119 Comm: httpd Tainted: G O 4.4.20-clouder2 #6 Hardware name: Supermicro X10DRi/X10DRi, BIOS 1.1 04/14/2015 task: ffff88348d695280 ti: ffff881c95550000 task.ti: ffff881c95550000 RIP: 0010:[] [] _raw_spin_lock+0x17/0x30 RSP: 0018:ffff881c95553e40 EFLAGS: 00000246 RAX: 0000000000000000 RBX: ffff883161b1eea8 RCX: 000000000000000d RDX: 0000000000000001 RSI: 000000000000000e RDI: ffff883161b1eea4 RBP: ffff881c95553ea0 R08: ffff881c95553e68 R09: ffff883fef376f88 R10: ffff881fffb58c20 R11: ffffea0072556600 R12: ffff883161b1eea0 R13: ffff88348d695280 R14: ffff883dec427000 R15: ffff8831621672a0 FS: 0000000000000000(0000) GS:ffff881fffb40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f3b3723e020 CR3: 0000000001c0a000 CR4: 00000000001406e0 Call Trace: ? exit_sem+0x7c/0x280 do_exit+0x338/0xb40 do_group_exit+0x43/0xd0 SyS_exit_group+0x14/0x20 entry_SYSCALL_64_fastpath+0x16/0x6e Link: http://lkml.kernel.org/r/1475154992-6363-1-git-send-email-kernel@kyup.com Signed-off-by: Nikolay Borisov Cc: Herton R. Krzesinski Cc: Fabian Frederick Cc: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/ipc/sem.c b/ipc/sem.c index 5e318c5..10b94bc 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -2105,6 +2105,8 @@ void exit_sem(struct task_struct *tsk) struct list_head tasks; int semid, i; + cond_resched(); + rcu_read_lock(); un = list_entry_rcu(ulp->list_proc.next, struct sem_undo, list_proc); -- cgit v0.10.2 From 0549a3c02efb350776bc869685a361045efd3a29 Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Tue, 11 Oct 2016 13:55:08 -0700 Subject: kdump, vmcoreinfo: report memory sections virtual addresses KASLR memory randomization can randomize the base of the physical memory mapping (PAGE_OFFSET), vmalloc (VMALLOC_START) and vmemmap (VMEMMAP_START). Adding these variables on VMCOREINFO so tools can easily identify the base of each memory section. Link: http://lkml.kernel.org/r/1471531632-23003-1-git-send-email-thgarnie@google.com Signed-off-by: Thomas Garnier Acked-by: Baoquan He Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H . Peter Anvin" Cc: Eric Biederman Cc: Xunlei Pang Cc: HATAYAMA Daisuke Cc: Kees Cook Cc: Eugene Surovegin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 5a294e4..8c1f218 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -337,6 +337,9 @@ void arch_crash_save_vmcoreinfo(void) #endif vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); + VMCOREINFO_PAGE_OFFSET(PAGE_OFFSET); + VMCOREINFO_VMALLOC_START(VMALLOC_START); + VMCOREINFO_VMEMMAP_START(VMEMMAP_START); } /* arch-dependent functionality related to kexec file-based syscall */ diff --git a/include/linux/kexec.h b/include/linux/kexec.h index d743777..406c33d 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -259,6 +259,12 @@ phys_addr_t paddr_vmcoreinfo_note(void); vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) #define VMCOREINFO_CONFIG(name) \ vmcoreinfo_append_str("CONFIG_%s=y\n", #name) +#define VMCOREINFO_PAGE_OFFSET(value) \ + vmcoreinfo_append_str("PAGE_OFFSET=%lx\n", (unsigned long)value) +#define VMCOREINFO_VMALLOC_START(value) \ + vmcoreinfo_append_str("VMALLOC_START=%lx\n", (unsigned long)value) +#define VMCOREINFO_VMEMMAP_START(value) \ + vmcoreinfo_append_str("VMEMMAP_START=%lx\n", (unsigned long)value) extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; -- cgit v0.10.2 From 9099daed9c6991a512c1f74b92ec49daf9408cda Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 11 Oct 2016 13:55:11 -0700 Subject: mm: kmemleak: avoid using __va() on addresses that don't have a lowmem mapping Some of the kmemleak_*() callbacks in memblock, bootmem, CMA convert a physical address to a virtual one using __va(). However, such physical addresses may sometimes be located in highmem and using __va() is incorrect, leading to inconsistent object tracking in kmemleak. The following functions have been added to the kmemleak API and they take a physical address as the object pointer. They only perform the corresponding action if the address has a lowmem mapping: kmemleak_alloc_phys kmemleak_free_part_phys kmemleak_not_leak_phys kmemleak_ignore_phys The affected calling places have been updated to use the new kmemleak API. Link: http://lkml.kernel.org/r/1471531432-16503-1-git-send-email-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Reported-by: Vignesh R Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/dev-tools/kmemleak.rst b/Documentation/dev-tools/kmemleak.rst index 1788722..b2391b8 100644 --- a/Documentation/dev-tools/kmemleak.rst +++ b/Documentation/dev-tools/kmemleak.rst @@ -162,6 +162,15 @@ See the include/linux/kmemleak.h header for the functions prototype. - ``kmemleak_alloc_recursive`` - as kmemleak_alloc but checks the recursiveness - ``kmemleak_free_recursive`` - as kmemleak_free but checks the recursiveness +The following functions take a physical address as the object pointer +and only perform the corresponding action if the address has a lowmem +mapping: + +- ``kmemleak_alloc_phys`` +- ``kmemleak_free_part_phys`` +- ``kmemleak_not_leak_phys`` +- ``kmemleak_ignore_phys`` + Dealing with false positives/negatives -------------------------------------- diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 4894c68..1c2a328 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -38,6 +38,11 @@ extern void kmemleak_not_leak(const void *ptr) __ref; extern void kmemleak_ignore(const void *ptr) __ref; extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref; extern void kmemleak_no_scan(const void *ptr) __ref; +extern void kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count, + gfp_t gfp) __ref; +extern void kmemleak_free_part_phys(phys_addr_t phys, size_t size) __ref; +extern void kmemleak_not_leak_phys(phys_addr_t phys) __ref; +extern void kmemleak_ignore_phys(phys_addr_t phys) __ref; static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, int min_count, unsigned long flags, @@ -106,6 +111,19 @@ static inline void kmemleak_erase(void **ptr) static inline void kmemleak_no_scan(const void *ptr) { } +static inline void kmemleak_alloc_phys(phys_addr_t phys, size_t size, + int min_count, gfp_t gfp) +{ +} +static inline void kmemleak_free_part_phys(phys_addr_t phys, size_t size) +{ +} +static inline void kmemleak_not_leak_phys(phys_addr_t phys) +{ +} +static inline void kmemleak_ignore_phys(phys_addr_t phys) +{ +} #endif /* CONFIG_DEBUG_KMEMLEAK */ diff --git a/mm/bootmem.c b/mm/bootmem.c index a869f84..e8a55a3 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -155,7 +155,7 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size) { unsigned long cursor, end; - kmemleak_free_part(__va(physaddr), size); + kmemleak_free_part_phys(physaddr, size); cursor = PFN_UP(physaddr); end = PFN_DOWN(physaddr + size); @@ -399,7 +399,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, { unsigned long start, end; - kmemleak_free_part(__va(physaddr), size); + kmemleak_free_part_phys(physaddr, size); start = PFN_UP(physaddr); end = PFN_DOWN(physaddr + size); @@ -420,7 +420,7 @@ void __init free_bootmem(unsigned long physaddr, unsigned long size) { unsigned long start, end; - kmemleak_free_part(__va(physaddr), size); + kmemleak_free_part_phys(physaddr, size); start = PFN_UP(physaddr); end = PFN_DOWN(physaddr + size); diff --git a/mm/cma.c b/mm/cma.c index bd0e141..384c2cb 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -336,7 +336,7 @@ int __init cma_declare_contiguous(phys_addr_t base, * kmemleak scans/reads tracked objects for pointers to other * objects but this address isn't mapped and accessible */ - kmemleak_ignore(phys_to_virt(addr)); + kmemleak_ignore_phys(addr); base = addr; } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 086292f..a5e453c 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -90,6 +90,8 @@ #include #include #include +#include +#include #include #include #include @@ -1121,6 +1123,51 @@ void __ref kmemleak_no_scan(const void *ptr) } EXPORT_SYMBOL(kmemleak_no_scan); +/** + * kmemleak_alloc_phys - similar to kmemleak_alloc but taking a physical + * address argument + */ +void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count, + gfp_t gfp) +{ + if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn) + kmemleak_alloc(__va(phys), size, min_count, gfp); +} +EXPORT_SYMBOL(kmemleak_alloc_phys); + +/** + * kmemleak_free_part_phys - similar to kmemleak_free_part but taking a + * physical address argument + */ +void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size) +{ + if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn) + kmemleak_free_part(__va(phys), size); +} +EXPORT_SYMBOL(kmemleak_free_part_phys); + +/** + * kmemleak_not_leak_phys - similar to kmemleak_not_leak but taking a physical + * address argument + */ +void __ref kmemleak_not_leak_phys(phys_addr_t phys) +{ + if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn) + kmemleak_not_leak(__va(phys)); +} +EXPORT_SYMBOL(kmemleak_not_leak_phys); + +/** + * kmemleak_ignore_phys - similar to kmemleak_ignore but taking a physical + * address argument + */ +void __ref kmemleak_ignore_phys(phys_addr_t phys) +{ + if (!IS_ENABLED(CONFIG_HIGHMEM) || PHYS_PFN(phys) < max_low_pfn) + kmemleak_ignore(__va(phys)); +} +EXPORT_SYMBOL(kmemleak_ignore_phys); + /* * Update an object's checksum and return true if it was modified. */ diff --git a/mm/memblock.c b/mm/memblock.c index c8dfa43..7608bc3 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -723,7 +723,7 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) (unsigned long long)base + size - 1, (void *)_RET_IP_); - kmemleak_free_part(__va(base), size); + kmemleak_free_part_phys(base, size); return memblock_remove_range(&memblock.reserved, base, size); } @@ -1152,7 +1152,7 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, * The min_count is set to 0 so that memblock allocations are * never reported as leaks. */ - kmemleak_alloc(__va(found), size, 0, 0); + kmemleak_alloc_phys(found, size, 0, 0); return found; } return 0; @@ -1399,7 +1399,7 @@ void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", __func__, (u64)base, (u64)base + size - 1, (void *)_RET_IP_); - kmemleak_free_part(__va(base), size); + kmemleak_free_part_phys(base, size); memblock_remove_range(&memblock.reserved, base, size); } @@ -1419,7 +1419,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", __func__, (u64)base, (u64)base + size - 1, (void *)_RET_IP_); - kmemleak_free_part(__va(base), size); + kmemleak_free_part_phys(base, size); cursor = PFN_UP(base); end = PFN_DOWN(base + size); diff --git a/mm/nobootmem.c b/mm/nobootmem.c index ba609b6..487dad6 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -84,7 +84,7 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) { unsigned long cursor, end; - kmemleak_free_part(__va(addr), size); + kmemleak_free_part_phys(addr, size); cursor = PFN_UP(addr); end = PFN_DOWN(addr + size); -- cgit v0.10.2 From d0c75f33f065bf58833c43ac7b5ccfcb60131510 Mon Sep 17 00:00:00 2001 From: Mathieu Maret Date: Tue, 11 Oct 2016 13:55:14 -0700 Subject: scripts/tags.sh: enable code completion in VIM Vim, with the omnicppcomplete(#1) plugin, can do code completion using information build by ctags. Add flags needed by omnicppcomplete(#2) to have completion on member of structure. 1: https://github.com/vim-scripts/omnicppcomplete 2: https://github.com/vim-scripts/OmniCppComplete/blob/master/doc/omnicppcomplete.txt#L93 Link: http://lkml.kernel.org/r/20160830191546.4469-1-mathieu.maret@gmail.com Signed-off-by: Mathieu Maret Cc: Michal Marek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/tags.sh b/scripts/tags.sh index b3775a9..a2ff338 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -263,7 +263,8 @@ exuberant() -I EXPORT_SYMBOL,EXPORT_SYMBOL_GPL,ACPI_EXPORT_SYMBOL \ -I DEFINE_TRACE,EXPORT_TRACEPOINT_SYMBOL,EXPORT_TRACEPOINT_SYMBOL_GPL \ -I static,const \ - --extra=+f --c-kinds=+px --langmap=c:+.h "${regex[@]}" + --extra=+fq --c-kinds=+px --fields=+iaS --langmap=c:+.h \ + "${regex[@]}" setup_regex exuberant kconfig all_kconfigs | xargs $1 -a \ -- cgit v0.10.2 From e700591ae03896c16974d4e1ab58eb296aaa5f59 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Tue, 11 Oct 2016 13:55:17 -0700 Subject: kthread: rename probe_kthread_data() to kthread_probe_data() Patch series "kthread: Kthread worker API improvements" The intention of this patchset is to make it easier to manipulate and maintain kthreads. Especially, I want to replace all the custom main cycles with a generic one. Also I want to make the kthreads sleep in a consistent state in a common place when there is no work. This patch (of 11): A good practice is to prefix the names of functions by the name of the subsystem. This patch fixes the name of probe_kthread_data(). The other wrong functions names are part of the kthread worker API and will be fixed separately. Link: http://lkml.kernel.org/r/1470754545-17632-2-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Suggested-by: Andrew Morton Acked-by: Tejun Heo Cc: Oleg Nesterov Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Thomas Gleixner Cc: Jiri Kosina Cc: Borislav Petkov Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/kthread.h b/include/linux/kthread.h index e691b6a..c792ee1 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -44,7 +44,7 @@ bool kthread_should_stop(void); bool kthread_should_park(void); bool kthread_freezable_should_stop(bool *was_frozen); void *kthread_data(struct task_struct *k); -void *probe_kthread_data(struct task_struct *k); +void *kthread_probe_data(struct task_struct *k); int kthread_park(struct task_struct *k); void kthread_unpark(struct task_struct *k); void kthread_parkme(void); diff --git a/kernel/kthread.c b/kernel/kthread.c index 4ab4c37..7e77b72 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -138,7 +138,7 @@ void *kthread_data(struct task_struct *task) } /** - * probe_kthread_data - speculative version of kthread_data() + * kthread_probe_data - speculative version of kthread_data() * @task: possible kthread task in question * * @task could be a kthread task. Return the data value specified when it @@ -146,7 +146,7 @@ void *kthread_data(struct task_struct *task) * inaccessible for any reason, %NULL is returned. This function requires * that @task itself is safe to dereference. */ -void *probe_kthread_data(struct task_struct *task) +void *kthread_probe_data(struct task_struct *task) { struct kthread *kthread = to_kthread(task); void *data = NULL; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bd81f03..479d840 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4261,7 +4261,7 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) * This function is called without any synchronization and @task * could be in any state. Be careful with dereferences. */ - worker = probe_kthread_data(task); + worker = kthread_probe_data(task); /* * Carefully copy the associated workqueue's workfn and name. Keep -- cgit v0.10.2 From 3989144f863ac576e6efba298d24b0b02a10d4bb Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Tue, 11 Oct 2016 13:55:20 -0700 Subject: kthread: kthread worker API cleanup A good practice is to prefix the names of functions by the name of the subsystem. The kthread worker API is a mix of classic kthreads and workqueues. Each worker has a dedicated kthread. It runs a generic function that process queued works. It is implemented as part of the kthread subsystem. This patch renames the existing kthread worker API to use the corresponding name from the workqueues API prefixed by kthread_: __init_kthread_worker() -> __kthread_init_worker() init_kthread_worker() -> kthread_init_worker() init_kthread_work() -> kthread_init_work() insert_kthread_work() -> kthread_insert_work() queue_kthread_work() -> kthread_queue_work() flush_kthread_work() -> kthread_flush_work() flush_kthread_worker() -> kthread_flush_worker() Note that the names of DEFINE_KTHREAD_WORK*() macros stay as they are. It is common that the "DEFINE_" prefix has precedence over the subsystem names. Note that INIT() macros and init() functions use different naming scheme. There is no good solution. There are several reasons for this solution: + "init" in the function names stands for the verb "initialize" aka "initialize worker". While "INIT" in the macro names stands for the noun "INITIALIZER" aka "worker initializer". + INIT() macros are used only in DEFINE() macros + init() functions are used close to the other kthread() functions. It looks much better if all the functions use the same scheme. + There will be also kthread_destroy_worker() that will be used close to kthread_cancel_work(). It is related to the init() function. Again it looks better if all functions use the same naming scheme. + there are several precedents for such init() function names, e.g. amd_iommu_init_device(), free_area_init_node(), jump_label_init_type(), regmap_init_mmio_clk(), + It is not an argument but it was inconsistent even before. [arnd@arndb.de: fix linux-next merge conflict] Link: http://lkml.kernel.org/r/20160908135724.1311726-1-arnd@arndb.de Link: http://lkml.kernel.org/r/1470754545-17632-3-git-send-email-pmladek@suse.com Suggested-by: Andrew Morton Signed-off-by: Petr Mladek Cc: Oleg Nesterov Cc: Tejun Heo Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Thomas Gleixner Cc: Jiri Kosina Cc: Borislav Petkov Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/RCU/lockdep-splat.txt b/Documentation/RCU/lockdep-splat.txt index bf90611..238e9f6 100644 --- a/Documentation/RCU/lockdep-splat.txt +++ b/Documentation/RCU/lockdep-splat.txt @@ -57,7 +57,7 @@ Call Trace: [] kernel_thread_helper+0x4/0x10 [] ? finish_task_switch+0x80/0x110 [] ? retint_restore_args+0xe/0xe - [] ? __init_kthread_worker+0x70/0x70 + [] ? __kthread_init_worker+0x70/0x70 [] ? gs_change+0xb/0xb Line 2776 of block/cfq-iosched.c in v3.0-rc5 is as follows: diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 5fb6c62..16a7134 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -212,7 +212,7 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) */ smp_mb(); if (atomic_dec_if_positive(&ps->pending) > 0) - queue_kthread_work(&pit->worker, &pit->expired); + kthread_queue_work(&pit->worker, &pit->expired); } void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) @@ -233,7 +233,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) static void destroy_pit_timer(struct kvm_pit *pit) { hrtimer_cancel(&pit->pit_state.timer); - flush_kthread_work(&pit->expired); + kthread_flush_work(&pit->expired); } static void pit_do_work(struct kthread_work *work) @@ -272,7 +272,7 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) if (atomic_read(&ps->reinject)) atomic_inc(&ps->pending); - queue_kthread_work(&pt->worker, &pt->expired); + kthread_queue_work(&pt->worker, &pt->expired); if (ps->is_periodic) { hrtimer_add_expires_ns(&ps->timer, ps->period); @@ -324,7 +324,7 @@ static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period) /* TODO The new value only affected after the retriggered */ hrtimer_cancel(&ps->timer); - flush_kthread_work(&pit->expired); + kthread_flush_work(&pit->expired); ps->period = interval; ps->is_periodic = is_period; @@ -667,13 +667,13 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pid_nr = pid_vnr(pid); put_pid(pid); - init_kthread_worker(&pit->worker); + kthread_init_worker(&pit->worker); pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker, "kvm-pit/%d", pid_nr); if (IS_ERR(pit->worker_task)) goto fail_kthread; - init_kthread_work(&pit->expired, pit_do_work); + kthread_init_work(&pit->expired, pit_do_work); pit->kvm = kvm; @@ -730,7 +730,7 @@ void kvm_free_pit(struct kvm *kvm) kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev); kvm_pit_set_reinject(pit, false); hrtimer_cancel(&pit->pit_state.timer); - flush_kthread_work(&pit->expired); + kthread_flush_work(&pit->expired); kthread_stop(pit->worker_task); kvm_free_irq_source_id(kvm, pit->irq_source_id); kfree(pit); diff --git a/crypto/crypto_engine.c b/crypto/crypto_engine.c index bfb92ac..6989ba0 100644 --- a/crypto/crypto_engine.c +++ b/crypto/crypto_engine.c @@ -47,7 +47,7 @@ static void crypto_pump_requests(struct crypto_engine *engine, /* If another context is idling then defer */ if (engine->idling) { - queue_kthread_work(&engine->kworker, &engine->pump_requests); + kthread_queue_work(&engine->kworker, &engine->pump_requests); goto out; } @@ -58,7 +58,7 @@ static void crypto_pump_requests(struct crypto_engine *engine, /* Only do teardown in the thread */ if (!in_kthread) { - queue_kthread_work(&engine->kworker, + kthread_queue_work(&engine->kworker, &engine->pump_requests); goto out; } @@ -189,7 +189,7 @@ int crypto_transfer_cipher_request(struct crypto_engine *engine, ret = ablkcipher_enqueue_request(&engine->queue, req); if (!engine->busy && need_pump) - queue_kthread_work(&engine->kworker, &engine->pump_requests); + kthread_queue_work(&engine->kworker, &engine->pump_requests); spin_unlock_irqrestore(&engine->queue_lock, flags); return ret; @@ -231,7 +231,7 @@ int crypto_transfer_hash_request(struct crypto_engine *engine, ret = ahash_enqueue_request(&engine->queue, req); if (!engine->busy && need_pump) - queue_kthread_work(&engine->kworker, &engine->pump_requests); + kthread_queue_work(&engine->kworker, &engine->pump_requests); spin_unlock_irqrestore(&engine->queue_lock, flags); return ret; @@ -284,7 +284,7 @@ void crypto_finalize_cipher_request(struct crypto_engine *engine, req->base.complete(&req->base, err); - queue_kthread_work(&engine->kworker, &engine->pump_requests); + kthread_queue_work(&engine->kworker, &engine->pump_requests); } EXPORT_SYMBOL_GPL(crypto_finalize_cipher_request); @@ -321,7 +321,7 @@ void crypto_finalize_hash_request(struct crypto_engine *engine, req->base.complete(&req->base, err); - queue_kthread_work(&engine->kworker, &engine->pump_requests); + kthread_queue_work(&engine->kworker, &engine->pump_requests); } EXPORT_SYMBOL_GPL(crypto_finalize_hash_request); @@ -345,7 +345,7 @@ int crypto_engine_start(struct crypto_engine *engine) engine->running = true; spin_unlock_irqrestore(&engine->queue_lock, flags); - queue_kthread_work(&engine->kworker, &engine->pump_requests); + kthread_queue_work(&engine->kworker, &engine->pump_requests); return 0; } @@ -422,7 +422,7 @@ struct crypto_engine *crypto_engine_alloc_init(struct device *dev, bool rt) crypto_init_queue(&engine->queue, CRYPTO_ENGINE_MAX_QLEN); spin_lock_init(&engine->queue_lock); - init_kthread_worker(&engine->kworker); + kthread_init_worker(&engine->kworker); engine->kworker_task = kthread_run(kthread_worker_fn, &engine->kworker, "%s", engine->name); @@ -430,7 +430,7 @@ struct crypto_engine *crypto_engine_alloc_init(struct device *dev, bool rt) dev_err(dev, "failed to create crypto request pump task\n"); return NULL; } - init_kthread_work(&engine->pump_requests, crypto_pump_work); + kthread_init_work(&engine->pump_requests, crypto_pump_work); if (engine->rt) { dev_info(dev, "will run requests pump with realtime priority\n"); @@ -455,7 +455,7 @@ int crypto_engine_exit(struct crypto_engine *engine) if (ret) return ret; - flush_kthread_worker(&engine->kworker); + kthread_flush_worker(&engine->kworker); kthread_stop(engine->kworker_task); return 0; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index cbdb3b1..fa1b7a9 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -840,13 +840,13 @@ static void loop_config_discard(struct loop_device *lo) static void loop_unprepare_queue(struct loop_device *lo) { - flush_kthread_worker(&lo->worker); + kthread_flush_worker(&lo->worker); kthread_stop(lo->worker_task); } static int loop_prepare_queue(struct loop_device *lo) { - init_kthread_worker(&lo->worker); + kthread_init_worker(&lo->worker); lo->worker_task = kthread_run(kthread_worker_fn, &lo->worker, "loop%d", lo->lo_number); if (IS_ERR(lo->worker_task)) @@ -1658,7 +1658,7 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx, break; } - queue_kthread_work(&lo->worker, &cmd->work); + kthread_queue_work(&lo->worker, &cmd->work); return BLK_MQ_RQ_QUEUE_OK; } @@ -1696,7 +1696,7 @@ static int loop_init_request(void *data, struct request *rq, struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); cmd->rq = rq; - init_kthread_work(&cmd->work, loop_queue_work); + kthread_init_work(&cmd->work, loop_queue_work); return 0; } diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index f2f229e..6d9904a 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -129,7 +129,7 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) if (likely(worker)) { cq->notify = RVT_CQ_NONE; cq->triggered++; - queue_kthread_work(worker, &cq->comptask); + kthread_queue_work(worker, &cq->comptask); } } @@ -265,7 +265,7 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, cq->ibcq.cqe = entries; cq->notify = RVT_CQ_NONE; spin_lock_init(&cq->lock); - init_kthread_work(&cq->comptask, send_complete); + kthread_init_work(&cq->comptask, send_complete); cq->queue = wc; ret = &cq->ibcq; @@ -295,7 +295,7 @@ int rvt_destroy_cq(struct ib_cq *ibcq) struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); struct rvt_dev_info *rdi = cq->rdi; - flush_kthread_work(&cq->comptask); + kthread_flush_work(&cq->comptask); spin_lock(&rdi->n_cqs_lock); rdi->n_cqs_allocated--; spin_unlock(&rdi->n_cqs_lock); @@ -514,7 +514,7 @@ int rvt_driver_cq_init(struct rvt_dev_info *rdi) rdi->worker = kzalloc(sizeof(*rdi->worker), GFP_KERNEL); if (!rdi->worker) return -ENOMEM; - init_kthread_worker(rdi->worker); + kthread_init_worker(rdi->worker); task = kthread_create_on_node( kthread_worker_fn, rdi->worker, @@ -547,7 +547,7 @@ void rvt_cq_exit(struct rvt_dev_info *rdi) /* blocks future queuing from send_complete() */ rdi->worker = NULL; smp_wmb(); /* See rdi_cq_enter */ - flush_kthread_worker(worker); + kthread_flush_worker(worker); kthread_stop(worker->task); kfree(worker); } diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 5eacce1..dc75bea 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -581,7 +581,7 @@ static void init_tio(struct dm_rq_target_io *tio, struct request *rq, if (!md->init_tio_pdu) memset(&tio->info, 0, sizeof(tio->info)); if (md->kworker_task) - init_kthread_work(&tio->work, map_tio_request); + kthread_init_work(&tio->work, map_tio_request); } static struct dm_rq_target_io *dm_old_prep_tio(struct request *rq, @@ -831,7 +831,7 @@ static void dm_old_request_fn(struct request_queue *q) tio = tio_from_request(rq); /* Establish tio->ti before queuing work (map_tio_request) */ tio->ti = ti; - queue_kthread_work(&md->kworker, &tio->work); + kthread_queue_work(&md->kworker, &tio->work); BUG_ON(!irqs_disabled()); } } @@ -853,7 +853,7 @@ int dm_old_init_request_queue(struct mapped_device *md) blk_queue_prep_rq(md->queue, dm_old_prep_fn); /* Initialize the request-based DM worker thread */ - init_kthread_worker(&md->kworker); + kthread_init_worker(&md->kworker); md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker, "kdmwork-%s", dm_device_name(md)); if (IS_ERR(md->kworker_task)) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index be35258..147af95 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1891,7 +1891,7 @@ static void __dm_destroy(struct mapped_device *md, bool wait) spin_unlock_irq(q->queue_lock); if (dm_request_based(md) && md->kworker_task) - flush_kthread_worker(&md->kworker); + kthread_flush_worker(&md->kworker); /* * Take suspend_lock so that presuspend and postsuspend methods @@ -2147,7 +2147,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, if (dm_request_based(md)) { dm_stop_queue(md->queue); if (md->kworker_task) - flush_kthread_worker(&md->kworker); + kthread_flush_worker(&md->kworker); } flush_workqueue(md->wq); diff --git a/drivers/media/pci/ivtv/ivtv-driver.c b/drivers/media/pci/ivtv/ivtv-driver.c index 374033a..ee48c3e 100644 --- a/drivers/media/pci/ivtv/ivtv-driver.c +++ b/drivers/media/pci/ivtv/ivtv-driver.c @@ -750,7 +750,7 @@ static int ivtv_init_struct1(struct ivtv *itv) spin_lock_init(&itv->lock); spin_lock_init(&itv->dma_reg_lock); - init_kthread_worker(&itv->irq_worker); + kthread_init_worker(&itv->irq_worker); itv->irq_worker_task = kthread_run(kthread_worker_fn, &itv->irq_worker, "%s", itv->v4l2_dev.name); if (IS_ERR(itv->irq_worker_task)) { @@ -760,7 +760,7 @@ static int ivtv_init_struct1(struct ivtv *itv) /* must use the FIFO scheduler as it is realtime sensitive */ sched_setscheduler(itv->irq_worker_task, SCHED_FIFO, ¶m); - init_kthread_work(&itv->irq_work, ivtv_irq_work_handler); + kthread_init_work(&itv->irq_work, ivtv_irq_work_handler); /* Initial settings */ itv->cxhdl.port = CX2341X_PORT_MEMORY; @@ -1441,7 +1441,7 @@ static void ivtv_remove(struct pci_dev *pdev) del_timer_sync(&itv->dma_timer); /* Kill irq worker */ - flush_kthread_worker(&itv->irq_worker); + kthread_flush_worker(&itv->irq_worker); kthread_stop(itv->irq_worker_task); ivtv_streams_cleanup(itv); diff --git a/drivers/media/pci/ivtv/ivtv-irq.c b/drivers/media/pci/ivtv/ivtv-irq.c index 36ca2d6..6efe1f7 100644 --- a/drivers/media/pci/ivtv/ivtv-irq.c +++ b/drivers/media/pci/ivtv/ivtv-irq.c @@ -1062,7 +1062,7 @@ irqreturn_t ivtv_irq_handler(int irq, void *dev_id) } if (test_and_clear_bit(IVTV_F_I_HAVE_WORK, &itv->i_flags)) { - queue_kthread_work(&itv->irq_worker, &itv->irq_work); + kthread_queue_work(&itv->irq_worker, &itv->irq_work); } spin_unlock(&itv->dma_reg_lock); diff --git a/drivers/net/ethernet/microchip/encx24j600.c b/drivers/net/ethernet/microchip/encx24j600.c index 42e3407..b14f030 100644 --- a/drivers/net/ethernet/microchip/encx24j600.c +++ b/drivers/net/ethernet/microchip/encx24j600.c @@ -821,7 +821,7 @@ static void encx24j600_set_multicast_list(struct net_device *dev) } if (oldfilter != priv->rxfilter) - queue_kthread_work(&priv->kworker, &priv->setrx_work); + kthread_queue_work(&priv->kworker, &priv->setrx_work); } static void encx24j600_hw_tx(struct encx24j600_priv *priv) @@ -879,7 +879,7 @@ static netdev_tx_t encx24j600_tx(struct sk_buff *skb, struct net_device *dev) /* Remember the skb for deferred processing */ priv->tx_skb = skb; - queue_kthread_work(&priv->kworker, &priv->tx_work); + kthread_queue_work(&priv->kworker, &priv->tx_work); return NETDEV_TX_OK; } @@ -1037,9 +1037,9 @@ static int encx24j600_spi_probe(struct spi_device *spi) goto out_free; } - init_kthread_worker(&priv->kworker); - init_kthread_work(&priv->tx_work, encx24j600_tx_proc); - init_kthread_work(&priv->setrx_work, encx24j600_setrx_proc); + kthread_init_worker(&priv->kworker); + kthread_init_work(&priv->tx_work, encx24j600_tx_proc); + kthread_init_work(&priv->setrx_work, encx24j600_setrx_proc); priv->kworker_task = kthread_run(kthread_worker_fn, &priv->kworker, "encx24j600"); diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 8146ccd..5787b72 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -1112,7 +1112,7 @@ static void __spi_pump_messages(struct spi_master *master, bool in_kthread) /* If another context is idling the device then defer */ if (master->idling) { - queue_kthread_work(&master->kworker, &master->pump_messages); + kthread_queue_work(&master->kworker, &master->pump_messages); spin_unlock_irqrestore(&master->queue_lock, flags); return; } @@ -1126,7 +1126,7 @@ static void __spi_pump_messages(struct spi_master *master, bool in_kthread) /* Only do teardown in the thread */ if (!in_kthread) { - queue_kthread_work(&master->kworker, + kthread_queue_work(&master->kworker, &master->pump_messages); spin_unlock_irqrestore(&master->queue_lock, flags); return; @@ -1250,7 +1250,7 @@ static int spi_init_queue(struct spi_master *master) master->running = false; master->busy = false; - init_kthread_worker(&master->kworker); + kthread_init_worker(&master->kworker); master->kworker_task = kthread_run(kthread_worker_fn, &master->kworker, "%s", dev_name(&master->dev)); @@ -1258,7 +1258,7 @@ static int spi_init_queue(struct spi_master *master) dev_err(&master->dev, "failed to create message pump task\n"); return PTR_ERR(master->kworker_task); } - init_kthread_work(&master->pump_messages, spi_pump_messages); + kthread_init_work(&master->pump_messages, spi_pump_messages); /* * Master config will indicate if this controller should run the @@ -1331,7 +1331,7 @@ void spi_finalize_current_message(struct spi_master *master) spin_lock_irqsave(&master->queue_lock, flags); master->cur_msg = NULL; master->cur_msg_prepared = false; - queue_kthread_work(&master->kworker, &master->pump_messages); + kthread_queue_work(&master->kworker, &master->pump_messages); spin_unlock_irqrestore(&master->queue_lock, flags); trace_spi_message_done(mesg); @@ -1357,7 +1357,7 @@ static int spi_start_queue(struct spi_master *master) master->cur_msg = NULL; spin_unlock_irqrestore(&master->queue_lock, flags); - queue_kthread_work(&master->kworker, &master->pump_messages); + kthread_queue_work(&master->kworker, &master->pump_messages); return 0; } @@ -1404,7 +1404,7 @@ static int spi_destroy_queue(struct spi_master *master) ret = spi_stop_queue(master); /* - * flush_kthread_worker will block until all work is done. + * kthread_flush_worker will block until all work is done. * If the reason that stop_queue timed out is that the work will never * finish, then it does no good to call flush/stop thread, so * return anyway. @@ -1414,7 +1414,7 @@ static int spi_destroy_queue(struct spi_master *master) return ret; } - flush_kthread_worker(&master->kworker); + kthread_flush_worker(&master->kworker); kthread_stop(master->kworker_task); return 0; @@ -1438,7 +1438,7 @@ static int __spi_queued_transfer(struct spi_device *spi, list_add_tail(&msg->queue, &master->queue); if (!master->busy && need_pump) - queue_kthread_work(&master->kworker, &master->pump_messages); + kthread_queue_work(&master->kworker, &master->pump_messages); spin_unlock_irqrestore(&master->queue_lock, flags); return 0; diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c index a9d94f7..2675792 100644 --- a/drivers/tty/serial/sc16is7xx.c +++ b/drivers/tty/serial/sc16is7xx.c @@ -708,7 +708,7 @@ static irqreturn_t sc16is7xx_irq(int irq, void *dev_id) { struct sc16is7xx_port *s = (struct sc16is7xx_port *)dev_id; - queue_kthread_work(&s->kworker, &s->irq_work); + kthread_queue_work(&s->kworker, &s->irq_work); return IRQ_HANDLED; } @@ -784,7 +784,7 @@ static void sc16is7xx_ier_clear(struct uart_port *port, u8 bit) one->config.flags |= SC16IS7XX_RECONF_IER; one->config.ier_clear |= bit; - queue_kthread_work(&s->kworker, &one->reg_work); + kthread_queue_work(&s->kworker, &one->reg_work); } static void sc16is7xx_stop_tx(struct uart_port *port) @@ -802,7 +802,7 @@ static void sc16is7xx_start_tx(struct uart_port *port) struct sc16is7xx_port *s = dev_get_drvdata(port->dev); struct sc16is7xx_one *one = to_sc16is7xx_one(port, port); - queue_kthread_work(&s->kworker, &one->tx_work); + kthread_queue_work(&s->kworker, &one->tx_work); } static unsigned int sc16is7xx_tx_empty(struct uart_port *port) @@ -828,7 +828,7 @@ static void sc16is7xx_set_mctrl(struct uart_port *port, unsigned int mctrl) struct sc16is7xx_one *one = to_sc16is7xx_one(port, port); one->config.flags |= SC16IS7XX_RECONF_MD; - queue_kthread_work(&s->kworker, &one->reg_work); + kthread_queue_work(&s->kworker, &one->reg_work); } static void sc16is7xx_break_ctl(struct uart_port *port, int break_state) @@ -957,7 +957,7 @@ static int sc16is7xx_config_rs485(struct uart_port *port, port->rs485 = *rs485; one->config.flags |= SC16IS7XX_RECONF_RS485; - queue_kthread_work(&s->kworker, &one->reg_work); + kthread_queue_work(&s->kworker, &one->reg_work); return 0; } @@ -1030,7 +1030,7 @@ static void sc16is7xx_shutdown(struct uart_port *port) sc16is7xx_power(port, 0); - flush_kthread_worker(&s->kworker); + kthread_flush_worker(&s->kworker); } static const char *sc16is7xx_type(struct uart_port *port) @@ -1176,8 +1176,8 @@ static int sc16is7xx_probe(struct device *dev, s->devtype = devtype; dev_set_drvdata(dev, s); - init_kthread_worker(&s->kworker); - init_kthread_work(&s->irq_work, sc16is7xx_ist); + kthread_init_worker(&s->kworker); + kthread_init_work(&s->irq_work, sc16is7xx_ist); s->kworker_task = kthread_run(kthread_worker_fn, &s->kworker, "sc16is7xx"); if (IS_ERR(s->kworker_task)) { @@ -1234,8 +1234,8 @@ static int sc16is7xx_probe(struct device *dev, SC16IS7XX_EFCR_RXDISABLE_BIT | SC16IS7XX_EFCR_TXDISABLE_BIT); /* Initialize kthread work structs */ - init_kthread_work(&s->p[i].tx_work, sc16is7xx_tx_proc); - init_kthread_work(&s->p[i].reg_work, sc16is7xx_reg_proc); + kthread_init_work(&s->p[i].tx_work, sc16is7xx_tx_proc); + kthread_init_work(&s->p[i].reg_work, sc16is7xx_reg_proc); /* Register port */ uart_add_one_port(&sc16is7xx_uart, &s->p[i].port); @@ -1301,7 +1301,7 @@ static int sc16is7xx_remove(struct device *dev) sc16is7xx_power(&s->p[i].port, 0); } - flush_kthread_worker(&s->kworker); + kthread_flush_worker(&s->kworker); kthread_stop(s->kworker_task); if (!IS_ERR(s->clk)) diff --git a/include/linux/kthread.h b/include/linux/kthread.h index c792ee1..e2b095b 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -57,7 +57,7 @@ extern int tsk_fork_get_node(struct task_struct *tsk); * Simple work processor based on kthread. * * This provides easier way to make use of kthreads. A kthread_work - * can be queued and flushed using queue/flush_kthread_work() + * can be queued and flushed using queue/kthread_flush_work() * respectively. Queued kthread_works are processed by a kthread * running kthread_worker_fn(). */ @@ -99,23 +99,23 @@ struct kthread_work { */ #ifdef CONFIG_LOCKDEP # define KTHREAD_WORKER_INIT_ONSTACK(worker) \ - ({ init_kthread_worker(&worker); worker; }) + ({ kthread_init_worker(&worker); worker; }) # define DEFINE_KTHREAD_WORKER_ONSTACK(worker) \ struct kthread_worker worker = KTHREAD_WORKER_INIT_ONSTACK(worker) #else # define DEFINE_KTHREAD_WORKER_ONSTACK(worker) DEFINE_KTHREAD_WORKER(worker) #endif -extern void __init_kthread_worker(struct kthread_worker *worker, +extern void __kthread_init_worker(struct kthread_worker *worker, const char *name, struct lock_class_key *key); -#define init_kthread_worker(worker) \ +#define kthread_init_worker(worker) \ do { \ static struct lock_class_key __key; \ - __init_kthread_worker((worker), "("#worker")->lock", &__key); \ + __kthread_init_worker((worker), "("#worker")->lock", &__key); \ } while (0) -#define init_kthread_work(work, fn) \ +#define kthread_init_work(work, fn) \ do { \ memset((work), 0, sizeof(struct kthread_work)); \ INIT_LIST_HEAD(&(work)->node); \ @@ -124,9 +124,9 @@ extern void __init_kthread_worker(struct kthread_worker *worker, int kthread_worker_fn(void *worker_ptr); -bool queue_kthread_work(struct kthread_worker *worker, +bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work); -void flush_kthread_work(struct kthread_work *work); -void flush_kthread_worker(struct kthread_worker *worker); +void kthread_flush_work(struct kthread_work *work); +void kthread_flush_worker(struct kthread_worker *worker); #endif /* _LINUX_KTHREAD_H */ diff --git a/kernel/kthread.c b/kernel/kthread.c index 7e77b72..c52a05a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -540,7 +540,7 @@ int kthreadd(void *unused) return 0; } -void __init_kthread_worker(struct kthread_worker *worker, +void __kthread_init_worker(struct kthread_worker *worker, const char *name, struct lock_class_key *key) { @@ -549,7 +549,7 @@ void __init_kthread_worker(struct kthread_worker *worker, INIT_LIST_HEAD(&worker->work_list); worker->task = NULL; } -EXPORT_SYMBOL_GPL(__init_kthread_worker); +EXPORT_SYMBOL_GPL(__kthread_init_worker); /** * kthread_worker_fn - kthread function to process kthread_worker @@ -606,7 +606,7 @@ repeat: EXPORT_SYMBOL_GPL(kthread_worker_fn); /* insert @work before @pos in @worker */ -static void insert_kthread_work(struct kthread_worker *worker, +static void kthread_insert_work(struct kthread_worker *worker, struct kthread_work *work, struct list_head *pos) { @@ -619,7 +619,7 @@ static void insert_kthread_work(struct kthread_worker *worker, } /** - * queue_kthread_work - queue a kthread_work + * kthread_queue_work - queue a kthread_work * @worker: target kthread_worker * @work: kthread_work to queue * @@ -627,7 +627,7 @@ static void insert_kthread_work(struct kthread_worker *worker, * must have been created with kthread_worker_create(). Returns %true * if @work was successfully queued, %false if it was already pending. */ -bool queue_kthread_work(struct kthread_worker *worker, +bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work) { bool ret = false; @@ -635,13 +635,13 @@ bool queue_kthread_work(struct kthread_worker *worker, spin_lock_irqsave(&worker->lock, flags); if (list_empty(&work->node)) { - insert_kthread_work(worker, work, &worker->work_list); + kthread_insert_work(worker, work, &worker->work_list); ret = true; } spin_unlock_irqrestore(&worker->lock, flags); return ret; } -EXPORT_SYMBOL_GPL(queue_kthread_work); +EXPORT_SYMBOL_GPL(kthread_queue_work); struct kthread_flush_work { struct kthread_work work; @@ -656,12 +656,12 @@ static void kthread_flush_work_fn(struct kthread_work *work) } /** - * flush_kthread_work - flush a kthread_work + * kthread_flush_work - flush a kthread_work * @work: work to flush * * If @work is queued or executing, wait for it to finish execution. */ -void flush_kthread_work(struct kthread_work *work) +void kthread_flush_work(struct kthread_work *work) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), @@ -682,9 +682,10 @@ retry: } if (!list_empty(&work->node)) - insert_kthread_work(worker, &fwork.work, work->node.next); + kthread_insert_work(worker, &fwork.work, work->node.next); else if (worker->current_work == work) - insert_kthread_work(worker, &fwork.work, worker->work_list.next); + kthread_insert_work(worker, &fwork.work, + worker->work_list.next); else noop = true; @@ -693,23 +694,23 @@ retry: if (!noop) wait_for_completion(&fwork.done); } -EXPORT_SYMBOL_GPL(flush_kthread_work); +EXPORT_SYMBOL_GPL(kthread_flush_work); /** - * flush_kthread_worker - flush all current works on a kthread_worker + * kthread_flush_worker - flush all current works on a kthread_worker * @worker: worker to flush * * Wait until all currently executing or pending works on @worker are * finished. */ -void flush_kthread_worker(struct kthread_worker *worker) +void kthread_flush_worker(struct kthread_worker *worker) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; - queue_kthread_work(worker, &fwork.work); + kthread_queue_work(worker, &fwork.work); wait_for_completion(&fwork.done); } -EXPORT_SYMBOL_GPL(flush_kthread_worker); +EXPORT_SYMBOL_GPL(kthread_flush_worker); diff --git a/sound/soc/intel/baytrail/sst-baytrail-ipc.c b/sound/soc/intel/baytrail/sst-baytrail-ipc.c index c8455b4..7ab14ce 100644 --- a/sound/soc/intel/baytrail/sst-baytrail-ipc.c +++ b/sound/soc/intel/baytrail/sst-baytrail-ipc.c @@ -338,7 +338,7 @@ static irqreturn_t sst_byt_irq_thread(int irq, void *context) spin_unlock_irqrestore(&sst->spinlock, flags); /* continue to send any remaining messages... */ - queue_kthread_work(&ipc->kworker, &ipc->kwork); + kthread_queue_work(&ipc->kworker, &ipc->kwork); return IRQ_HANDLED; } diff --git a/sound/soc/intel/common/sst-ipc.c b/sound/soc/intel/common/sst-ipc.c index a12c7bb..6c672ac 100644 --- a/sound/soc/intel/common/sst-ipc.c +++ b/sound/soc/intel/common/sst-ipc.c @@ -111,7 +111,7 @@ static int ipc_tx_message(struct sst_generic_ipc *ipc, u64 header, list_add_tail(&msg->list, &ipc->tx_list); spin_unlock_irqrestore(&ipc->dsp->spinlock, flags); - queue_kthread_work(&ipc->kworker, &ipc->kwork); + kthread_queue_work(&ipc->kworker, &ipc->kwork); if (wait) return tx_wait_done(ipc, msg, rx_data); @@ -281,7 +281,7 @@ int sst_ipc_init(struct sst_generic_ipc *ipc) return -ENOMEM; /* start the IPC message thread */ - init_kthread_worker(&ipc->kworker); + kthread_init_worker(&ipc->kworker); ipc->tx_thread = kthread_run(kthread_worker_fn, &ipc->kworker, "%s", dev_name(ipc->dev)); @@ -292,7 +292,7 @@ int sst_ipc_init(struct sst_generic_ipc *ipc) return ret; } - init_kthread_work(&ipc->kwork, ipc_tx_msgs); + kthread_init_work(&ipc->kwork, ipc_tx_msgs); return 0; } EXPORT_SYMBOL_GPL(sst_ipc_init); diff --git a/sound/soc/intel/haswell/sst-haswell-ipc.c b/sound/soc/intel/haswell/sst-haswell-ipc.c index 9156522..e432a31 100644 --- a/sound/soc/intel/haswell/sst-haswell-ipc.c +++ b/sound/soc/intel/haswell/sst-haswell-ipc.c @@ -818,7 +818,7 @@ static irqreturn_t hsw_irq_thread(int irq, void *context) spin_unlock_irqrestore(&sst->spinlock, flags); /* continue to send any remaining messages... */ - queue_kthread_work(&ipc->kworker, &ipc->kwork); + kthread_queue_work(&ipc->kworker, &ipc->kwork); return IRQ_HANDLED; } diff --git a/sound/soc/intel/skylake/skl-sst-ipc.c b/sound/soc/intel/skylake/skl-sst-ipc.c index 0bd01e6..797cf40 100644 --- a/sound/soc/intel/skylake/skl-sst-ipc.c +++ b/sound/soc/intel/skylake/skl-sst-ipc.c @@ -464,7 +464,7 @@ irqreturn_t skl_dsp_irq_thread_handler(int irq, void *context) skl_ipc_int_enable(dsp); /* continue to send any remaining messages... */ - queue_kthread_work(&ipc->kworker, &ipc->kwork); + kthread_queue_work(&ipc->kworker, &ipc->kwork); return IRQ_HANDLED; } -- cgit v0.10.2 From a65d40961dc70d15dec046c2ee88c556f57940b2 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Tue, 11 Oct 2016 13:55:23 -0700 Subject: kthread/smpboot: do not park in kthread_create_on_cpu() kthread_create_on_cpu() was added by the commit 2a1d446019f9a5983e ("kthread: Implement park/unpark facility"). It is currently used only when enabling new CPU. For this purpose, the newly created kthread has to be parked. The CPU binding is a bit tricky. The kthread is parked when the CPU has not been allowed yet. And the CPU is bound when the kthread is unparked. The function would be useful for more per-CPU kthreads, e.g. bnx2fc_thread, fcoethread. For this purpose, the newly created kthread should stay in the uninterruptible state. This patch moves the parking into smpboot. It binds the thread already when created. Then the function might be used universally. Also the behavior is consistent with kthread_create() and kthread_create_on_node(). Link: http://lkml.kernel.org/r/1470754545-17632-4-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Reviewed-by: Thomas Gleixner Cc: Oleg Nesterov Cc: Tejun Heo Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Jiri Kosina Cc: Borislav Petkov Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/kthread.c b/kernel/kthread.c index c52a05a..f5dc0b1 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -390,10 +390,10 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), cpu); if (IS_ERR(p)) return p; + kthread_bind(p, cpu); + /* CPU hotplug need to bind once again when unparking the thread. */ set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); to_kthread(p)->cpu = cpu; - /* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */ - kthread_park(p); return p; } @@ -407,6 +407,10 @@ static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) * which might be about to be cleared. */ if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { + /* + * Newly created kthread was parked when the CPU was offline. + * The binding was lost and we need to set it again. + */ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) __kthread_bind(k, kthread->cpu, TASK_PARKED); wake_up_state(k, TASK_PARKED); diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 13bc43d..4a5c6e7 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -186,6 +186,11 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) kfree(td); return PTR_ERR(tsk); } + /* + * Park the thread so that it could start right on the CPU + * when it is available. + */ + kthread_park(tsk); get_task_struct(tsk); *per_cpu_ptr(ht->store, cpu) = tsk; if (ht->create) { -- cgit v0.10.2 From 255451e45345bcd19e79f25e9afc0b9e14ccb104 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Tue, 11 Oct 2016 13:55:27 -0700 Subject: kthread: allow to call __kthread_create_on_node() with va_list args kthread_create_on_node() implements a bunch of logic to create the kthread. It is already called by kthread_create_on_cpu(). We are going to extend the kthread worker API and will need to call kthread_create_on_node() with va_list args there. This patch does only a refactoring and does not modify the existing behavior. Link: http://lkml.kernel.org/r/1470754545-17632-5-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Acked-by: Tejun Heo Cc: Oleg Nesterov Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Thomas Gleixner Cc: Jiri Kosina Cc: Borislav Petkov Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/kthread.c b/kernel/kthread.c index f5dc0b1..36fd751 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -244,33 +244,10 @@ static void create_kthread(struct kthread_create_info *create) } } -/** - * kthread_create_on_node - create a kthread. - * @threadfn: the function to run until signal_pending(current). - * @data: data ptr for @threadfn. - * @node: task and thread structures for the thread are allocated on this node - * @namefmt: printf-style name for the thread. - * - * Description: This helper function creates and names a kernel - * thread. The thread will be stopped: use wake_up_process() to start - * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and - * is affine to all CPUs. - * - * If thread is going to be bound on a particular cpu, give its node - * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. - * When woken, the thread will run @threadfn() with @data as its - * argument. @threadfn() can either call do_exit() directly if it is a - * standalone thread for which no one will call kthread_stop(), or - * return when 'kthread_should_stop()' is true (which means - * kthread_stop() has been called). The return value should be zero - * or a negative error number; it will be passed to kthread_stop(). - * - * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). - */ -struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), - void *data, int node, - const char namefmt[], - ...) +static struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), + void *data, int node, + const char namefmt[], + va_list args) { DECLARE_COMPLETION_ONSTACK(done); struct task_struct *task; @@ -311,11 +288,8 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), task = create->result; if (!IS_ERR(task)) { static const struct sched_param param = { .sched_priority = 0 }; - va_list args; - va_start(args, namefmt); vsnprintf(task->comm, sizeof(task->comm), namefmt, args); - va_end(args); /* * root may have changed our (kthreadd's) priority or CPU mask. * The kernel thread should not inherit these properties. @@ -326,6 +300,44 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), kfree(create); return task; } + +/** + * kthread_create_on_node - create a kthread. + * @threadfn: the function to run until signal_pending(current). + * @data: data ptr for @threadfn. + * @node: task and thread structures for the thread are allocated on this node + * @namefmt: printf-style name for the thread. + * + * Description: This helper function creates and names a kernel + * thread. The thread will be stopped: use wake_up_process() to start + * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and + * is affine to all CPUs. + * + * If thread is going to be bound on a particular cpu, give its node + * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. + * When woken, the thread will run @threadfn() with @data as its + * argument. @threadfn() can either call do_exit() directly if it is a + * standalone thread for which no one will call kthread_stop(), or + * return when 'kthread_should_stop()' is true (which means + * kthread_stop() has been called). The return value should be zero + * or a negative error number; it will be passed to kthread_stop(). + * + * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). + */ +struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), + void *data, int node, + const char namefmt[], + ...) +{ + struct task_struct *task; + va_list args; + + va_start(args, namefmt); + task = __kthread_create_on_node(threadfn, data, node, namefmt, args); + va_end(args); + + return task; +} EXPORT_SYMBOL(kthread_create_on_node); static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state) -- cgit v0.10.2 From fbae2d44aa1df72d0154be77eb4d71e1e34c0f8f Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Tue, 11 Oct 2016 13:55:30 -0700 Subject: kthread: add kthread_create_worker*() Kthread workers are currently created using the classic kthread API, namely kthread_run(). kthread_worker_fn() is passed as the @threadfn parameter. This patch defines kthread_create_worker() and kthread_create_worker_on_cpu() functions that hide implementation details. They enforce using kthread_worker_fn() for the main thread. But I doubt that there are any plans to create any alternative. In fact, I think that we do not want any alternative main thread because it would be hard to support consistency with the rest of the kthread worker API. The naming and function of kthread_create_worker() is inspired by the workqueues API like the rest of the kthread worker API. The kthread_create_worker_on_cpu() variant is motivated by the original kthread_create_on_cpu(). Note that we need to bind per-CPU kthread workers already when they are created. It makes the life easier. kthread_bind() could not be used later for an already running worker. This patch does _not_ convert existing kthread workers. The kthread worker API need more improvements first, e.g. a function to destroy the worker. IMPORTANT: kthread_create_worker_on_cpu() allows to use any format of the worker name, in compare with kthread_create_on_cpu(). The good thing is that it is more generic. The bad thing is that most users will need to pass the cpu number in two parameters, e.g. kthread_create_worker_on_cpu(cpu, "helper/%d", cpu). To be honest, the main motivation was to avoid the need for an empty va_list. The only legal way was to create a helper function that would be called with an empty list. Other attempts caused compilation warnings or even errors on different architectures. There were also other alternatives, for example, using #define or splitting __kthread_create_worker(). The used solution looked like the least ugly. Link: http://lkml.kernel.org/r/1470754545-17632-6-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Acked-by: Tejun Heo Cc: Oleg Nesterov Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Thomas Gleixner Cc: Jiri Kosina Cc: Borislav Petkov Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/kthread.h b/include/linux/kthread.h index e2b095b..daeb2be 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -124,6 +124,13 @@ extern void __kthread_init_worker(struct kthread_worker *worker, int kthread_worker_fn(void *worker_ptr); +__printf(1, 2) +struct kthread_worker * +kthread_create_worker(const char namefmt[], ...); + +struct kthread_worker * +kthread_create_worker_on_cpu(int cpu, const char namefmt[], ...); + bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work); void kthread_flush_work(struct kthread_work *work); diff --git a/kernel/kthread.c b/kernel/kthread.c index 36fd751..12695e6 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -571,23 +571,24 @@ EXPORT_SYMBOL_GPL(__kthread_init_worker); * kthread_worker_fn - kthread function to process kthread_worker * @worker_ptr: pointer to initialized kthread_worker * - * This function can be used as @threadfn to kthread_create() or - * kthread_run() with @worker_ptr argument pointing to an initialized - * kthread_worker. The started kthread will process work_list until - * the it is stopped with kthread_stop(). A kthread can also call - * this function directly after extra initialization. + * This function implements the main cycle of kthread worker. It processes + * work_list until it is stopped with kthread_stop(). It sleeps when the queue + * is empty. * - * Different kthreads can be used for the same kthread_worker as long - * as there's only one kthread attached to it at any given time. A - * kthread_worker without an attached kthread simply collects queued - * kthread_works. + * The works are not allowed to keep any locks, disable preemption or interrupts + * when they finish. There is defined a safe point for freezing when one work + * finishes and before a new one is started. */ int kthread_worker_fn(void *worker_ptr) { struct kthread_worker *worker = worker_ptr; struct kthread_work *work; - WARN_ON(worker->task); + /* + * FIXME: Update the check and remove the assignment when all kthread + * worker users are created using kthread_create_worker*() functions. + */ + WARN_ON(worker->task && worker->task != current); worker->task = current; repeat: set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ @@ -621,6 +622,98 @@ repeat: } EXPORT_SYMBOL_GPL(kthread_worker_fn); +static struct kthread_worker * +__kthread_create_worker(int cpu, const char namefmt[], va_list args) +{ + struct kthread_worker *worker; + struct task_struct *task; + + worker = kzalloc(sizeof(*worker), GFP_KERNEL); + if (!worker) + return ERR_PTR(-ENOMEM); + + kthread_init_worker(worker); + + if (cpu >= 0) { + char name[TASK_COMM_LEN]; + + /* + * kthread_create_worker_on_cpu() allows to pass a generic + * namefmt in compare with kthread_create_on_cpu. We need + * to format it here. + */ + vsnprintf(name, sizeof(name), namefmt, args); + task = kthread_create_on_cpu(kthread_worker_fn, worker, + cpu, name); + } else { + task = __kthread_create_on_node(kthread_worker_fn, worker, + -1, namefmt, args); + } + + if (IS_ERR(task)) + goto fail_task; + + worker->task = task; + wake_up_process(task); + return worker; + +fail_task: + kfree(worker); + return ERR_CAST(task); +} + +/** + * kthread_create_worker - create a kthread worker + * @namefmt: printf-style name for the kthread worker (task). + * + * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) + * when the needed structures could not get allocated, and ERR_PTR(-EINTR) + * when the worker was SIGKILLed. + */ +struct kthread_worker * +kthread_create_worker(const char namefmt[], ...) +{ + struct kthread_worker *worker; + va_list args; + + va_start(args, namefmt); + worker = __kthread_create_worker(-1, namefmt, args); + va_end(args); + + return worker; +} +EXPORT_SYMBOL(kthread_create_worker); + +/** + * kthread_create_worker_on_cpu - create a kthread worker and bind it + * it to a given CPU and the associated NUMA node. + * @cpu: CPU number + * @namefmt: printf-style name for the kthread worker (task). + * + * Use a valid CPU number if you want to bind the kthread worker + * to the given CPU and the associated NUMA node. + * + * A good practice is to add the cpu number also into the worker name. + * For example, use kthread_create_worker_on_cpu(cpu, "helper/%d", cpu). + * + * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) + * when the needed structures could not get allocated, and ERR_PTR(-EINTR) + * when the worker was SIGKILLed. + */ +struct kthread_worker * +kthread_create_worker_on_cpu(int cpu, const char namefmt[], ...) +{ + struct kthread_worker *worker; + va_list args; + + va_start(args, namefmt); + worker = __kthread_create_worker(cpu, namefmt, args); + va_end(args); + + return worker; +} +EXPORT_SYMBOL(kthread_create_worker_on_cpu); + /* insert @work before @pos in @worker */ static void kthread_insert_work(struct kthread_worker *worker, struct kthread_work *work, -- cgit v0.10.2 From 35033fe9cbbf18415dfeb7e27f0d4228dfc7458a Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Tue, 11 Oct 2016 13:55:33 -0700 Subject: kthread: add kthread_destroy_worker() The current kthread worker users call flush() and stop() explicitly. This function does the same plus it frees the kthread_worker struct in one call. It is supposed to be used together with kthread_create_worker*() that allocates struct kthread_worker. Link: http://lkml.kernel.org/r/1470754545-17632-7-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Cc: Oleg Nesterov Cc: Tejun Heo Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Thomas Gleixner Cc: Jiri Kosina Cc: Borislav Petkov Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/kthread.h b/include/linux/kthread.h index daeb2be..afc8939 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -136,4 +136,6 @@ bool kthread_queue_work(struct kthread_worker *worker, void kthread_flush_work(struct kthread_work *work); void kthread_flush_worker(struct kthread_worker *worker); +void kthread_destroy_worker(struct kthread_worker *worker); + #endif /* _LINUX_KTHREAD_H */ diff --git a/kernel/kthread.c b/kernel/kthread.c index 12695e6..f874300 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -823,3 +823,26 @@ void kthread_flush_worker(struct kthread_worker *worker) wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_worker); + +/** + * kthread_destroy_worker - destroy a kthread worker + * @worker: worker to be destroyed + * + * Flush and destroy @worker. The simple flush is enough because the kthread + * worker API is used only in trivial scenarios. There are no multi-step state + * machines needed. + */ +void kthread_destroy_worker(struct kthread_worker *worker) +{ + struct task_struct *task; + + task = worker->task; + if (WARN_ON(!task)) + return; + + kthread_flush_worker(worker); + kthread_stop(task); + WARN_ON(!list_empty(&worker->work_list)); + kfree(worker); +} +EXPORT_SYMBOL(kthread_destroy_worker); -- cgit v0.10.2 From 8197b3d43b250b8214b9913878e9227eef8bd85f Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Tue, 11 Oct 2016 13:55:36 -0700 Subject: kthread: detect when a kthread work is used by more workers Nothing currently prevents a work from queuing for a kthread worker when it is already running on another one. This means that the work might run in parallel on more than one worker. Also some operations are not reliable, e.g. flush. This problem will be even more visible after we add kthread_cancel_work() function. It will only have "work" as the parameter and will use worker->lock to synchronize with others. Well, normally this is not a problem because the API users are sane. But bugs might happen and users also might be crazy. This patch adds a warning when we try to insert the work for another worker. It does not fully prevent the misuse because it would make the code much more complicated without a big benefit. It adds the same warning also into kthread_flush_work() instead of the repeated attempts to get the right lock. A side effect is that one needs to explicitly reinitialize the work if it must be queued into another worker. This is needed, for example, when the worker is stopped and started again. It is a bit inconvenient. But it looks like a good compromise between the stability and complexity. I have double checked all existing users of the kthread worker API and they all seems to initialize the work after the worker gets started. Just for completeness, the patch adds a check that the work is not already in a queue. The patch also puts all the checks into a separate function. It will be reused when implementing delayed works. Link: http://lkml.kernel.org/r/1470754545-17632-8-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Cc: Oleg Nesterov Cc: Tejun Heo Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Thomas Gleixner Cc: Jiri Kosina Cc: Borislav Petkov Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/kthread.c b/kernel/kthread.c index f874300..524dfc8 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -578,6 +578,9 @@ EXPORT_SYMBOL_GPL(__kthread_init_worker); * The works are not allowed to keep any locks, disable preemption or interrupts * when they finish. There is defined a safe point for freezing when one work * finishes and before a new one is started. + * + * Also the works must not be handled by more than one worker at the same time, + * see also kthread_queue_work(). */ int kthread_worker_fn(void *worker_ptr) { @@ -714,12 +717,21 @@ kthread_create_worker_on_cpu(int cpu, const char namefmt[], ...) } EXPORT_SYMBOL(kthread_create_worker_on_cpu); +static void kthread_insert_work_sanity_check(struct kthread_worker *worker, + struct kthread_work *work) +{ + lockdep_assert_held(&worker->lock); + WARN_ON_ONCE(!list_empty(&work->node)); + /* Do not use a work with >1 worker, see kthread_queue_work() */ + WARN_ON_ONCE(work->worker && work->worker != worker); +} + /* insert @work before @pos in @worker */ static void kthread_insert_work(struct kthread_worker *worker, - struct kthread_work *work, - struct list_head *pos) + struct kthread_work *work, + struct list_head *pos) { - lockdep_assert_held(&worker->lock); + kthread_insert_work_sanity_check(worker, work); list_add_tail(&work->node, pos); work->worker = worker; @@ -735,6 +747,9 @@ static void kthread_insert_work(struct kthread_worker *worker, * Queue @work to work processor @task for async execution. @task * must have been created with kthread_worker_create(). Returns %true * if @work was successfully queued, %false if it was already pending. + * + * Reinitialize the work if it needs to be used by another worker. + * For example, when the worker was stopped and started again. */ bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work) @@ -779,16 +794,13 @@ void kthread_flush_work(struct kthread_work *work) struct kthread_worker *worker; bool noop = false; -retry: worker = work->worker; if (!worker) return; spin_lock_irq(&worker->lock); - if (work->worker != worker) { - spin_unlock_irq(&worker->lock); - goto retry; - } + /* Work must not be used with >1 worker, see kthread_queue_work(). */ + WARN_ON_ONCE(work->worker != worker); if (!list_empty(&work->node)) kthread_insert_work(worker, &fwork.work, work->node.next); -- cgit v0.10.2 From 22597dc3d97b1ead2aca201397415a1a84bf2b26 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Tue, 11 Oct 2016 13:55:40 -0700 Subject: kthread: initial support for delayed kthread work We are going to use kthread_worker more widely and delayed works will be pretty useful. The implementation is inspired by workqueues. It uses a timer to queue the work after the requested delay. If the delay is zero, the work is queued immediately. In compare with workqueues, each work is associated with a single worker (kthread). Therefore the implementation could be much easier. In particular, we use the worker->lock to synchronize all the operations with the work. We do not need any atomic operation with a flags variable. In fact, we do not need any state variable at all. Instead, we add a list of delayed works into the worker. Then the pending work is listed either in the list of queued or delayed works. And the existing check of pending works is the same even for the delayed ones. A work must not be assigned to another worker unless reinitialized. Therefore the timer handler might expect that dwork->work->worker is valid and it could simply take the lock. We just add some sanity checks to help with debugging a potential misuse. Link: http://lkml.kernel.org/r/1470754545-17632-9-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Acked-by: Tejun Heo Cc: Oleg Nesterov Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Thomas Gleixner Cc: Jiri Kosina Cc: Borislav Petkov Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/kthread.h b/include/linux/kthread.h index afc8939..4acde1a 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -63,10 +63,12 @@ extern int tsk_fork_get_node(struct task_struct *tsk); */ struct kthread_work; typedef void (*kthread_work_func_t)(struct kthread_work *work); +void kthread_delayed_work_timer_fn(unsigned long __data); struct kthread_worker { spinlock_t lock; struct list_head work_list; + struct list_head delayed_work_list; struct task_struct *task; struct kthread_work *current_work; }; @@ -77,9 +79,15 @@ struct kthread_work { struct kthread_worker *worker; }; +struct kthread_delayed_work { + struct kthread_work work; + struct timer_list timer; +}; + #define KTHREAD_WORKER_INIT(worker) { \ .lock = __SPIN_LOCK_UNLOCKED((worker).lock), \ .work_list = LIST_HEAD_INIT((worker).work_list), \ + .delayed_work_list = LIST_HEAD_INIT((worker).delayed_work_list),\ } #define KTHREAD_WORK_INIT(work, fn) { \ @@ -87,12 +95,23 @@ struct kthread_work { .func = (fn), \ } +#define KTHREAD_DELAYED_WORK_INIT(dwork, fn) { \ + .work = KTHREAD_WORK_INIT((dwork).work, (fn)), \ + .timer = __TIMER_INITIALIZER(kthread_delayed_work_timer_fn, \ + 0, (unsigned long)&(dwork), \ + TIMER_IRQSAFE), \ + } + #define DEFINE_KTHREAD_WORKER(worker) \ struct kthread_worker worker = KTHREAD_WORKER_INIT(worker) #define DEFINE_KTHREAD_WORK(work, fn) \ struct kthread_work work = KTHREAD_WORK_INIT(work, fn) +#define DEFINE_KTHREAD_DELAYED_WORK(dwork, fn) \ + struct kthread_delayed_work dwork = \ + KTHREAD_DELAYED_WORK_INIT(dwork, fn) + /* * kthread_worker.lock needs its own lockdep class key when defined on * stack with lockdep enabled. Use the following macros in such cases. @@ -122,6 +141,15 @@ extern void __kthread_init_worker(struct kthread_worker *worker, (work)->func = (fn); \ } while (0) +#define kthread_init_delayed_work(dwork, fn) \ + do { \ + kthread_init_work(&(dwork)->work, (fn)); \ + __setup_timer(&(dwork)->timer, \ + kthread_delayed_work_timer_fn, \ + (unsigned long)(dwork), \ + TIMER_IRQSAFE); \ + } while (0) + int kthread_worker_fn(void *worker_ptr); __printf(1, 2) @@ -133,6 +161,11 @@ kthread_create_worker_on_cpu(int cpu, const char namefmt[], ...); bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work); + +bool kthread_queue_delayed_work(struct kthread_worker *worker, + struct kthread_delayed_work *dwork, + unsigned long delay); + void kthread_flush_work(struct kthread_work *work); void kthread_flush_worker(struct kthread_worker *worker); diff --git a/kernel/kthread.c b/kernel/kthread.c index 524dfc8..a5c6d70 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -563,6 +563,7 @@ void __kthread_init_worker(struct kthread_worker *worker, spin_lock_init(&worker->lock); lockdep_set_class_and_name(&worker->lock, key, name); INIT_LIST_HEAD(&worker->work_list); + INIT_LIST_HEAD(&worker->delayed_work_list); worker->task = NULL; } EXPORT_SYMBOL_GPL(__kthread_init_worker); @@ -767,6 +768,107 @@ bool kthread_queue_work(struct kthread_worker *worker, } EXPORT_SYMBOL_GPL(kthread_queue_work); +/** + * kthread_delayed_work_timer_fn - callback that queues the associated kthread + * delayed work when the timer expires. + * @__data: pointer to the data associated with the timer + * + * The format of the function is defined by struct timer_list. + * It should have been called from irqsafe timer with irq already off. + */ +void kthread_delayed_work_timer_fn(unsigned long __data) +{ + struct kthread_delayed_work *dwork = + (struct kthread_delayed_work *)__data; + struct kthread_work *work = &dwork->work; + struct kthread_worker *worker = work->worker; + + /* + * This might happen when a pending work is reinitialized. + * It means that it is used a wrong way. + */ + if (WARN_ON_ONCE(!worker)) + return; + + spin_lock(&worker->lock); + /* Work must not be used with >1 worker, see kthread_queue_work(). */ + WARN_ON_ONCE(work->worker != worker); + + /* Move the work from worker->delayed_work_list. */ + WARN_ON_ONCE(list_empty(&work->node)); + list_del_init(&work->node); + kthread_insert_work(worker, work, &worker->work_list); + + spin_unlock(&worker->lock); +} +EXPORT_SYMBOL(kthread_delayed_work_timer_fn); + +void __kthread_queue_delayed_work(struct kthread_worker *worker, + struct kthread_delayed_work *dwork, + unsigned long delay) +{ + struct timer_list *timer = &dwork->timer; + struct kthread_work *work = &dwork->work; + + WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn || + timer->data != (unsigned long)dwork); + + /* + * If @delay is 0, queue @dwork->work immediately. This is for + * both optimization and correctness. The earliest @timer can + * expire is on the closest next tick and delayed_work users depend + * on that there's no such delay when @delay is 0. + */ + if (!delay) { + kthread_insert_work(worker, work, &worker->work_list); + return; + } + + /* Be paranoid and try to detect possible races already now. */ + kthread_insert_work_sanity_check(worker, work); + + list_add(&work->node, &worker->delayed_work_list); + work->worker = worker; + timer_stats_timer_set_start_info(&dwork->timer); + timer->expires = jiffies + delay; + add_timer(timer); +} + +/** + * kthread_queue_delayed_work - queue the associated kthread work + * after a delay. + * @worker: target kthread_worker + * @dwork: kthread_delayed_work to queue + * @delay: number of jiffies to wait before queuing + * + * If the work has not been pending it starts a timer that will queue + * the work after the given @delay. If @delay is zero, it queues the + * work immediately. + * + * Return: %false if the @work has already been pending. It means that + * either the timer was running or the work was queued. It returns %true + * otherwise. + */ +bool kthread_queue_delayed_work(struct kthread_worker *worker, + struct kthread_delayed_work *dwork, + unsigned long delay) +{ + struct kthread_work *work = &dwork->work; + unsigned long flags; + bool ret = false; + + spin_lock_irqsave(&worker->lock, flags); + + if (list_empty(&work->node)) { + __kthread_queue_delayed_work(worker, dwork, delay); + ret = true; + } + + spin_unlock_irqrestore(&worker->lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(kthread_queue_delayed_work); + struct kthread_flush_work { struct kthread_work work; struct completion done; -- cgit v0.10.2 From 37be45d49dec2a411e29d50c9597cfe8184b5645 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Tue, 11 Oct 2016 13:55:43 -0700 Subject: kthread: allow to cancel kthread work We are going to use kthread workers more widely and sometimes we will need to make sure that the work is neither pending nor running. This patch implements cancel_*_sync() operations as inspired by workqueues. Well, we are synchronized against the other operations via the worker lock, we use del_timer_sync() and a counter to count parallel cancel operations. Therefore the implementation might be easier. First, we check if a worker is assigned. If not, the work has newer been queued after it was initialized. Second, we take the worker lock. It must be the right one. The work must not be assigned to another worker unless it is initialized in between. Third, we try to cancel the timer when it exists. The timer is deleted synchronously to make sure that the timer call back is not running. We need to temporary release the worker->lock to avoid a possible deadlock with the callback. In the meantime, we set work->canceling counter to avoid any queuing. Fourth, we try to remove the work from a worker list. It might be the list of either normal or delayed works. Fifth, if the work is running, we call kthread_flush_work(). It might take an arbitrary time. We need to release the worker-lock again. In the meantime, we again block any queuing by the canceling counter. As already mentioned, the check for a pending kthread work is done under a lock. In compare with workqueues, we do not need to fight for a single PENDING bit to block other operations. Therefore we do not suffer from the thundering storm problem and all parallel canceling jobs might use kthread_flush_work(). Any queuing is blocked until the counter gets zero. Link: http://lkml.kernel.org/r/1470754545-17632-10-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Acked-by: Tejun Heo Cc: Oleg Nesterov Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Thomas Gleixner Cc: Jiri Kosina Cc: Borislav Petkov Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 4acde1a..77435dc 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -77,6 +77,8 @@ struct kthread_work { struct list_head node; kthread_work_func_t func; struct kthread_worker *worker; + /* Number of canceling calls that are running at the moment. */ + int canceling; }; struct kthread_delayed_work { @@ -169,6 +171,9 @@ bool kthread_queue_delayed_work(struct kthread_worker *worker, void kthread_flush_work(struct kthread_work *work); void kthread_flush_worker(struct kthread_worker *worker); +bool kthread_cancel_work_sync(struct kthread_work *work); +bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work); + void kthread_destroy_worker(struct kthread_worker *worker); #endif /* _LINUX_KTHREAD_H */ diff --git a/kernel/kthread.c b/kernel/kthread.c index a5c6d70..20505c6 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -718,6 +718,19 @@ kthread_create_worker_on_cpu(int cpu, const char namefmt[], ...) } EXPORT_SYMBOL(kthread_create_worker_on_cpu); +/* + * Returns true when the work could not be queued at the moment. + * It happens when it is already pending in a worker list + * or when it is being cancelled. + */ +static inline bool queuing_blocked(struct kthread_worker *worker, + struct kthread_work *work) +{ + lockdep_assert_held(&worker->lock); + + return !list_empty(&work->node) || work->canceling; +} + static void kthread_insert_work_sanity_check(struct kthread_worker *worker, struct kthread_work *work) { @@ -759,7 +772,7 @@ bool kthread_queue_work(struct kthread_worker *worker, unsigned long flags; spin_lock_irqsave(&worker->lock, flags); - if (list_empty(&work->node)) { + if (!queuing_blocked(worker, work)) { kthread_insert_work(worker, work, &worker->work_list); ret = true; } @@ -859,7 +872,7 @@ bool kthread_queue_delayed_work(struct kthread_worker *worker, spin_lock_irqsave(&worker->lock, flags); - if (list_empty(&work->node)) { + if (!queuing_blocked(worker, work)) { __kthread_queue_delayed_work(worker, dwork, delay); ret = true; } @@ -919,6 +932,121 @@ void kthread_flush_work(struct kthread_work *work) } EXPORT_SYMBOL_GPL(kthread_flush_work); +/* + * This function removes the work from the worker queue. Also it makes sure + * that it won't get queued later via the delayed work's timer. + * + * The work might still be in use when this function finishes. See the + * current_work proceed by the worker. + * + * Return: %true if @work was pending and successfully canceled, + * %false if @work was not pending + */ +static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork, + unsigned long *flags) +{ + /* Try to cancel the timer if exists. */ + if (is_dwork) { + struct kthread_delayed_work *dwork = + container_of(work, struct kthread_delayed_work, work); + struct kthread_worker *worker = work->worker; + + /* + * del_timer_sync() must be called to make sure that the timer + * callback is not running. The lock must be temporary released + * to avoid a deadlock with the callback. In the meantime, + * any queuing is blocked by setting the canceling counter. + */ + work->canceling++; + spin_unlock_irqrestore(&worker->lock, *flags); + del_timer_sync(&dwork->timer); + spin_lock_irqsave(&worker->lock, *flags); + work->canceling--; + } + + /* + * Try to remove the work from a worker list. It might either + * be from worker->work_list or from worker->delayed_work_list. + */ + if (!list_empty(&work->node)) { + list_del_init(&work->node); + return true; + } + + return false; +} + +static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) +{ + struct kthread_worker *worker = work->worker; + unsigned long flags; + int ret = false; + + if (!worker) + goto out; + + spin_lock_irqsave(&worker->lock, flags); + /* Work must not be used with >1 worker, see kthread_queue_work(). */ + WARN_ON_ONCE(work->worker != worker); + + ret = __kthread_cancel_work(work, is_dwork, &flags); + + if (worker->current_work != work) + goto out_fast; + + /* + * The work is in progress and we need to wait with the lock released. + * In the meantime, block any queuing by setting the canceling counter. + */ + work->canceling++; + spin_unlock_irqrestore(&worker->lock, flags); + kthread_flush_work(work); + spin_lock_irqsave(&worker->lock, flags); + work->canceling--; + +out_fast: + spin_unlock_irqrestore(&worker->lock, flags); +out: + return ret; +} + +/** + * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish + * @work: the kthread work to cancel + * + * Cancel @work and wait for its execution to finish. This function + * can be used even if the work re-queues itself. On return from this + * function, @work is guaranteed to be not pending or executing on any CPU. + * + * kthread_cancel_work_sync(&delayed_work->work) must not be used for + * delayed_work's. Use kthread_cancel_delayed_work_sync() instead. + * + * The caller must ensure that the worker on which @work was last + * queued can't be destroyed before this function returns. + * + * Return: %true if @work was pending, %false otherwise. + */ +bool kthread_cancel_work_sync(struct kthread_work *work) +{ + return __kthread_cancel_work_sync(work, false); +} +EXPORT_SYMBOL_GPL(kthread_cancel_work_sync); + +/** + * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and + * wait for it to finish. + * @dwork: the kthread delayed work to cancel + * + * This is kthread_cancel_work_sync() for delayed works. + * + * Return: %true if @dwork was pending, %false otherwise. + */ +bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork) +{ + return __kthread_cancel_work_sync(&dwork->work, true); +} +EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync); + /** * kthread_flush_worker - flush all current works on a kthread_worker * @worker: worker to flush -- cgit v0.10.2 From 9a6b06c8d9a220860468aadb2f1c726570813bf9 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Tue, 11 Oct 2016 13:55:46 -0700 Subject: kthread: allow to modify delayed kthread work There are situations when we need to modify the delay of a delayed kthread work. For example, when the work depends on an event and the initial delay means a timeout. Then we want to queue the work immediately when the event happens. This patch implements kthread_mod_delayed_work() as inspired workqueues. It cancels the timer, removes the work from any worker list and queues it again with the given timeout. A very special case is when the work is being canceled at the same time. It might happen because of the regular kthread_cancel_delayed_work_sync() or by another kthread_mod_delayed_work(). In this case, we do nothing and let the other operation win. This should not normally happen as the caller is supposed to synchronize these operations a reasonable way. Link: http://lkml.kernel.org/r/1470754545-17632-11-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Acked-by: Tejun Heo Cc: Oleg Nesterov Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Thomas Gleixner Cc: Jiri Kosina Cc: Borislav Petkov Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 77435dc..5c2ec2c 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -168,6 +168,10 @@ bool kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay); +bool kthread_mod_delayed_work(struct kthread_worker *worker, + struct kthread_delayed_work *dwork, + unsigned long delay); + void kthread_flush_work(struct kthread_work *work); void kthread_flush_worker(struct kthread_worker *worker); diff --git a/kernel/kthread.c b/kernel/kthread.c index 20505c6..c1fcc63 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -976,6 +976,59 @@ static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork, return false; } +/** + * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work + * @worker: kthread worker to use + * @dwork: kthread delayed work to queue + * @delay: number of jiffies to wait before queuing + * + * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise, + * modify @dwork's timer so that it expires after @delay. If @delay is zero, + * @work is guaranteed to be queued immediately. + * + * Return: %true if @dwork was pending and its timer was modified, + * %false otherwise. + * + * A special case is when the work is being canceled in parallel. + * It might be caused either by the real kthread_cancel_delayed_work_sync() + * or yet another kthread_mod_delayed_work() call. We let the other command + * win and return %false here. The caller is supposed to synchronize these + * operations a reasonable way. + * + * This function is safe to call from any context including IRQ handler. + * See __kthread_cancel_work() and kthread_delayed_work_timer_fn() + * for details. + */ +bool kthread_mod_delayed_work(struct kthread_worker *worker, + struct kthread_delayed_work *dwork, + unsigned long delay) +{ + struct kthread_work *work = &dwork->work; + unsigned long flags; + int ret = false; + + spin_lock_irqsave(&worker->lock, flags); + + /* Do not bother with canceling when never queued. */ + if (!work->worker) + goto fast_queue; + + /* Work must not be used with >1 worker, see kthread_queue_work() */ + WARN_ON_ONCE(work->worker != worker); + + /* Do not fight with another command that is canceling this work. */ + if (work->canceling) + goto out; + + ret = __kthread_cancel_work(work, true, &flags); +fast_queue: + __kthread_queue_delayed_work(worker, dwork, delay); +out: + spin_unlock_irqrestore(&worker->lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); + static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) { struct kthread_worker *worker = work->worker; -- cgit v0.10.2 From dbf52682cb02863d22b15e3742988c7c6e3f1710 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Tue, 11 Oct 2016 13:55:50 -0700 Subject: kthread: better support freezable kthread workers This patch allows to make kthread worker freezable via a new @flags parameter. It will allow to avoid an init work in some kthreads. It currently does not affect the function of kthread_worker_fn() but it might help to do some optimization or fixes eventually. I currently do not know about any other use for the @flags parameter but I believe that we will want more flags in the future. Finally, I hope that it will not cause confusion with @flags member in struct kthread. Well, I guess that we will want to rework the basic kthreads implementation once all kthreads are converted into kthread workers or workqueues. It is possible that we will merge the two structures. Link: http://lkml.kernel.org/r/1470754545-17632-12-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Acked-by: Tejun Heo Cc: Oleg Nesterov Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Thomas Gleixner Cc: Jiri Kosina Cc: Borislav Petkov Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 5c2ec2c..4f5235c 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -65,7 +65,12 @@ struct kthread_work; typedef void (*kthread_work_func_t)(struct kthread_work *work); void kthread_delayed_work_timer_fn(unsigned long __data); +enum { + KTW_FREEZABLE = 1 << 0, /* freeze during suspend */ +}; + struct kthread_worker { + unsigned int flags; spinlock_t lock; struct list_head work_list; struct list_head delayed_work_list; @@ -154,12 +159,13 @@ extern void __kthread_init_worker(struct kthread_worker *worker, int kthread_worker_fn(void *worker_ptr); -__printf(1, 2) +__printf(2, 3) struct kthread_worker * -kthread_create_worker(const char namefmt[], ...); +kthread_create_worker(unsigned int flags, const char namefmt[], ...); struct kthread_worker * -kthread_create_worker_on_cpu(int cpu, const char namefmt[], ...); +kthread_create_worker_on_cpu(int cpu, unsigned int flags, + const char namefmt[], ...); bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work); diff --git a/kernel/kthread.c b/kernel/kthread.c index c1fcc63..be2cc1f 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -560,11 +560,11 @@ void __kthread_init_worker(struct kthread_worker *worker, const char *name, struct lock_class_key *key) { + memset(worker, 0, sizeof(struct kthread_worker)); spin_lock_init(&worker->lock); lockdep_set_class_and_name(&worker->lock, key, name); INIT_LIST_HEAD(&worker->work_list); INIT_LIST_HEAD(&worker->delayed_work_list); - worker->task = NULL; } EXPORT_SYMBOL_GPL(__kthread_init_worker); @@ -594,6 +594,10 @@ int kthread_worker_fn(void *worker_ptr) */ WARN_ON(worker->task && worker->task != current); worker->task = current; + + if (worker->flags & KTW_FREEZABLE) + set_freezable(); + repeat: set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ @@ -627,7 +631,8 @@ repeat: EXPORT_SYMBOL_GPL(kthread_worker_fn); static struct kthread_worker * -__kthread_create_worker(int cpu, const char namefmt[], va_list args) +__kthread_create_worker(int cpu, unsigned int flags, + const char namefmt[], va_list args) { struct kthread_worker *worker; struct task_struct *task; @@ -657,6 +662,7 @@ __kthread_create_worker(int cpu, const char namefmt[], va_list args) if (IS_ERR(task)) goto fail_task; + worker->flags = flags; worker->task = task; wake_up_process(task); return worker; @@ -668,6 +674,7 @@ fail_task: /** * kthread_create_worker - create a kthread worker + * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the kthread worker (task). * * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) @@ -675,13 +682,13 @@ fail_task: * when the worker was SIGKILLed. */ struct kthread_worker * -kthread_create_worker(const char namefmt[], ...) +kthread_create_worker(unsigned int flags, const char namefmt[], ...) { struct kthread_worker *worker; va_list args; va_start(args, namefmt); - worker = __kthread_create_worker(-1, namefmt, args); + worker = __kthread_create_worker(-1, flags, namefmt, args); va_end(args); return worker; @@ -692,6 +699,7 @@ EXPORT_SYMBOL(kthread_create_worker); * kthread_create_worker_on_cpu - create a kthread worker and bind it * it to a given CPU and the associated NUMA node. * @cpu: CPU number + * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the kthread worker (task). * * Use a valid CPU number if you want to bind the kthread worker @@ -705,13 +713,14 @@ EXPORT_SYMBOL(kthread_create_worker); * when the worker was SIGKILLed. */ struct kthread_worker * -kthread_create_worker_on_cpu(int cpu, const char namefmt[], ...) +kthread_create_worker_on_cpu(int cpu, unsigned int flags, + const char namefmt[], ...) { struct kthread_worker *worker; va_list args; va_start(args, namefmt); - worker = __kthread_create_worker(cpu, namefmt, args); + worker = __kthread_create_worker(cpu, flags, namefmt, args); va_end(args); return worker; -- cgit v0.10.2 From e154ccc831b5b52a9aa3fe881090bdaf1d80f062 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Tue, 11 Oct 2016 13:55:53 -0700 Subject: kthread: add kerneldoc for kthread_create() This macro is referenced in other kerneldoc comments, but lacks one of its own; fix that. Link: http://lkml.kernel.org/r/20160826072313.726a3485@lwn.net Signed-off-by: Jonathan Corbet Reported-by: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 4f5235c..a6e82a6 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -10,6 +10,17 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), int node, const char namefmt[], ...); +/** + * kthread_create - create a kthread on the current node + * @threadfn: the function to run in the thread + * @data: data pointer for @threadfn() + * @namefmt: printf-style format string for the thread name + * @...: arguments for @namefmt. + * + * This macro will create a kthread on the current node, leaving it in + * the stopped state. This is just a helper for kthread_create_on_node(); + * see the documentation there for more details. + */ #define kthread_create(threadfn, data, namefmt, arg...) \ kthread_create_on_node(threadfn, data, NUMA_NO_NODE, namefmt, ##arg) -- cgit v0.10.2 From 48a6d64edadbd40fa5185a890023e9b331d64a48 Mon Sep 17 00:00:00 2001 From: John Siddle Date: Tue, 11 Oct 2016 13:55:56 -0700 Subject: hung_task: allow hung_task_panic when hung_task_warnings is 0 Previously hung_task_panic would not be respected if enabled after hung_task_warnings had already been decremented to 0. Permit the kernel to panic if hung_task_panic is enabled after hung_task_warnings has already been decremented to 0 and another task hangs for hung_task_timeout_secs seconds. Check if hung_task_panic is enabled so we don't return prematurely, and check if hung_task_warnings is non-zero so we don't print the warning unnecessarily. [akpm@linux-foundation.org: fix off-by-one] Link: http://lkml.kernel.org/r/1473450214-4049-1-git-send-email-jsiddle@redhat.com Signed-off-by: John Siddle Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 432c3d7..2b59c82 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -98,26 +98,26 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) trace_sched_process_hang(t); - if (!sysctl_hung_task_warnings) + if (!sysctl_hung_task_warnings && !sysctl_hung_task_panic) return; - if (sysctl_hung_task_warnings > 0) - sysctl_hung_task_warnings--; - /* * Ok, the task did not get scheduled for more than 2 minutes, * complain: */ - pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", - t->comm, t->pid, timeout); - pr_err(" %s %s %.*s\n", - print_tainted(), init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" - " disables this message.\n"); - sched_show_task(t); - debug_show_all_locks(); + if (sysctl_hung_task_warnings) { + sysctl_hung_task_warnings--; + pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", + t->comm, t->pid, timeout); + pr_err(" %s %s %.*s\n", + print_tainted(), init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" + " disables this message.\n"); + sched_show_task(t); + debug_show_all_locks(); + } touch_nmi_watchdog(); -- cgit v0.10.2 From 97139d4a6f26445de47b378cddd5192c0278f863 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 11 Oct 2016 13:55:58 -0700 Subject: treewide: remove redundant #include Kernel source files need not include explicitly because the top Makefile forces to include it with: -include $(srctree)/include/linux/kconfig.h This commit removes explicit includes except the following: * arch/s390/include/asm/facilities_src.h * tools/testing/radix-tree/linux/kernel.h These two are used for host programs. Link: http://lkml.kernel.org/r/1473656164-11929-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/arm/include/asm/trusted_foundations.h b/arch/arm/include/asm/trusted_foundations.h index 624e1d4..0074835 100644 --- a/arch/arm/include/asm/trusted_foundations.h +++ b/arch/arm/include/asm/trusted_foundations.h @@ -26,7 +26,6 @@ #ifndef __ASM_ARM_TRUSTED_FOUNDATIONS_H #define __ASM_ARM_TRUSTED_FOUNDATIONS_H -#include #include #include #include diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index 55101bd..39feb85 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -7,7 +7,6 @@ #ifndef __ASSEMBLY__ #include -#include #include #include #include diff --git a/arch/mips/include/asm/mach-loongson64/loongson.h b/arch/mips/include/asm/mach-loongson64/loongson.h index d1ff774..c68c0cc 100644 --- a/arch/mips/include/asm/mach-loongson64/loongson.h +++ b/arch/mips/include/asm/mach-loongson64/loongson.h @@ -14,7 +14,6 @@ #include #include #include -#include #include /* loongson internal northbridge initialization */ diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c index 36775d2..f8b7bf8 100644 --- a/arch/mips/math-emu/cp1emu.c +++ b/arch/mips/math-emu/cp1emu.c @@ -35,7 +35,6 @@ */ #include #include -#include #include #include diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index 39e7b47..49a2e22 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 8114744..d433b1d 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -38,7 +38,6 @@ #include #include #include -#include #include "../tty/hvc/hvc_console.h" #define is_rproc_enabled IS_ENABLED(CONFIG_REMOTEPROC) diff --git a/drivers/input/rmi4/rmi_bus.c b/drivers/input/rmi4/rmi_bus.c index a735806..e0b5a45 100644 --- a/drivers/input/rmi4/rmi_bus.c +++ b/drivers/input/rmi4/rmi_bus.c @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/drivers/input/rmi4/rmi_driver.c b/drivers/input/rmi4/rmi_driver.c index c83bce8..4a88312 100644 --- a/drivers/input/rmi4/rmi_driver.c +++ b/drivers/input/rmi4/rmi_driver.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/input/rmi4/rmi_f01.c b/drivers/input/rmi4/rmi_f01.c index fac81fc..b5d2dfc 100644 --- a/drivers/input/rmi4/rmi_f01.c +++ b/drivers/input/rmi4/rmi_f01.c @@ -8,7 +8,6 @@ */ #include -#include #include #include #include diff --git a/drivers/input/rmi4/rmi_f11.c b/drivers/input/rmi4/rmi_f11.c index 20c7134..f798f42 100644 --- a/drivers/input/rmi4/rmi_f11.c +++ b/drivers/input/rmi4/rmi_f11.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/irqchip/irq-bcm6345-l1.c b/drivers/irqchip/irq-bcm6345-l1.c index b844c89..daa4ae8 100644 --- a/drivers/irqchip/irq-bcm6345-l1.c +++ b/drivers/irqchip/irq-bcm6345-l1.c @@ -52,7 +52,6 @@ #include #include -#include #include #include #include diff --git a/drivers/irqchip/irq-bcm7038-l1.c b/drivers/irqchip/irq-bcm7038-l1.c index 0fea985..353c549 100644 --- a/drivers/irqchip/irq-bcm7038-l1.c +++ b/drivers/irqchip/irq-bcm7038-l1.c @@ -12,7 +12,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include -#include #include #include #include diff --git a/drivers/irqchip/irq-bcm7120-l2.c b/drivers/irqchip/irq-bcm7120-l2.c index 0ec9263..64c2692 100644 --- a/drivers/irqchip/irq-bcm7120-l2.c +++ b/drivers/irqchip/irq-bcm7120-l2.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/irqchip/irq-brcmstb-l2.c b/drivers/irqchip/irq-brcmstb-l2.c index 1d4a5b4..bddf169 100644 --- a/drivers/irqchip/irq-brcmstb-l2.c +++ b/drivers/irqchip/irq-brcmstb-l2.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/media/dvb-frontends/af9013.h b/drivers/media/dvb-frontends/af9013.h index 1dcc936..dcdd163 100644 --- a/drivers/media/dvb-frontends/af9013.h +++ b/drivers/media/dvb-frontends/af9013.h @@ -25,7 +25,6 @@ #ifndef AF9013_H #define AF9013_H -#include #include /* AF9013/5 GPIOs (mostly guessed) diff --git a/drivers/media/dvb-frontends/af9033.h b/drivers/media/dvb-frontends/af9033.h index 6ad22b6..5b83e4f 100644 --- a/drivers/media/dvb-frontends/af9033.h +++ b/drivers/media/dvb-frontends/af9033.h @@ -22,8 +22,6 @@ #ifndef AF9033_H #define AF9033_H -#include - /* * I2C address (TODO: are these in 8-bit format?) * 0x38, 0x3a, 0x3c, 0x3e diff --git a/drivers/media/dvb-frontends/ascot2e.h b/drivers/media/dvb-frontends/ascot2e.h index 6da4ae6..dc61bf7 100644 --- a/drivers/media/dvb-frontends/ascot2e.h +++ b/drivers/media/dvb-frontends/ascot2e.h @@ -22,7 +22,6 @@ #ifndef __DVB_ASCOT2E_H__ #define __DVB_ASCOT2E_H__ -#include #include #include diff --git a/drivers/media/dvb-frontends/atbm8830.h b/drivers/media/dvb-frontends/atbm8830.h index 5446d13..bb86238 100644 --- a/drivers/media/dvb-frontends/atbm8830.h +++ b/drivers/media/dvb-frontends/atbm8830.h @@ -22,7 +22,6 @@ #ifndef __ATBM8830_H__ #define __ATBM8830_H__ -#include #include #include diff --git a/drivers/media/dvb-frontends/au8522.h b/drivers/media/dvb-frontends/au8522.h index 78bf3f7..21c51a4 100644 --- a/drivers/media/dvb-frontends/au8522.h +++ b/drivers/media/dvb-frontends/au8522.h @@ -22,7 +22,6 @@ #ifndef __AU8522_H__ #define __AU8522_H__ -#include #include enum au8522_if_freq { diff --git a/drivers/media/dvb-frontends/cx22702.h b/drivers/media/dvb-frontends/cx22702.h index 68b69a7..a1956a9 100644 --- a/drivers/media/dvb-frontends/cx22702.h +++ b/drivers/media/dvb-frontends/cx22702.h @@ -28,7 +28,6 @@ #ifndef CX22702_H #define CX22702_H -#include #include struct cx22702_config { diff --git a/drivers/media/dvb-frontends/cx24113.h b/drivers/media/dvb-frontends/cx24113.h index 962919b..194c703 100644 --- a/drivers/media/dvb-frontends/cx24113.h +++ b/drivers/media/dvb-frontends/cx24113.h @@ -22,8 +22,6 @@ #ifndef CX24113_H #define CX24113_H -#include - struct dvb_frontend; struct cx24113_config { diff --git a/drivers/media/dvb-frontends/cx24116.h b/drivers/media/dvb-frontends/cx24116.h index f6dbabc..9ff8df8d 100644 --- a/drivers/media/dvb-frontends/cx24116.h +++ b/drivers/media/dvb-frontends/cx24116.h @@ -21,7 +21,6 @@ #ifndef CX24116_H #define CX24116_H -#include #include struct cx24116_config { diff --git a/drivers/media/dvb-frontends/cx24117.h b/drivers/media/dvb-frontends/cx24117.h index 1648ab4..445f13f 100644 --- a/drivers/media/dvb-frontends/cx24117.h +++ b/drivers/media/dvb-frontends/cx24117.h @@ -22,7 +22,6 @@ #ifndef CX24117_H #define CX24117_H -#include #include struct cx24117_config { diff --git a/drivers/media/dvb-frontends/cx24120.h b/drivers/media/dvb-frontends/cx24120.h index f097042..de4ca9a 100644 --- a/drivers/media/dvb-frontends/cx24120.h +++ b/drivers/media/dvb-frontends/cx24120.h @@ -20,7 +20,6 @@ #ifndef CX24120_H #define CX24120_H -#include #include #include diff --git a/drivers/media/dvb-frontends/cx24123.h b/drivers/media/dvb-frontends/cx24123.h index 975f3c9..aac2344 100644 --- a/drivers/media/dvb-frontends/cx24123.h +++ b/drivers/media/dvb-frontends/cx24123.h @@ -21,7 +21,6 @@ #ifndef CX24123_H #define CX24123_H -#include #include struct cx24123_config { diff --git a/drivers/media/dvb-frontends/cxd2820r.h b/drivers/media/dvb-frontends/cxd2820r.h index 56d4276..297a71a 100644 --- a/drivers/media/dvb-frontends/cxd2820r.h +++ b/drivers/media/dvb-frontends/cxd2820r.h @@ -22,7 +22,6 @@ #ifndef CXD2820R_H #define CXD2820R_H -#include #include #define CXD2820R_GPIO_D (0 << 0) /* disable */ diff --git a/drivers/media/dvb-frontends/cxd2841er.h b/drivers/media/dvb-frontends/cxd2841er.h index 62ad5f0..7f1acfb 100644 --- a/drivers/media/dvb-frontends/cxd2841er.h +++ b/drivers/media/dvb-frontends/cxd2841er.h @@ -22,7 +22,6 @@ #ifndef CXD2841ER_H #define CXD2841ER_H -#include #include enum cxd2841er_xtal { diff --git a/drivers/media/dvb-frontends/dib3000mc.h b/drivers/media/dvb-frontends/dib3000mc.h index b37e69e..67a6d50 100644 --- a/drivers/media/dvb-frontends/dib3000mc.h +++ b/drivers/media/dvb-frontends/dib3000mc.h @@ -13,8 +13,6 @@ #ifndef DIB3000MC_H #define DIB3000MC_H -#include - #include "dibx000_common.h" struct dib3000mc_config { diff --git a/drivers/media/dvb-frontends/dib7000m.h b/drivers/media/dvb-frontends/dib7000m.h index 6468c27..8f84dfa 100644 --- a/drivers/media/dvb-frontends/dib7000m.h +++ b/drivers/media/dvb-frontends/dib7000m.h @@ -1,8 +1,6 @@ #ifndef DIB7000M_H #define DIB7000M_H -#include - #include "dibx000_common.h" struct dib7000m_config { diff --git a/drivers/media/dvb-frontends/dib7000p.h b/drivers/media/dvb-frontends/dib7000p.h index baa2789..205fbbf 100644 --- a/drivers/media/dvb-frontends/dib7000p.h +++ b/drivers/media/dvb-frontends/dib7000p.h @@ -1,8 +1,6 @@ #ifndef DIB7000P_H #define DIB7000P_H -#include - #include "dibx000_common.h" struct dib7000p_config { diff --git a/drivers/media/dvb-frontends/drxd.h b/drivers/media/dvb-frontends/drxd.h index a47c22d..f0507cd 100644 --- a/drivers/media/dvb-frontends/drxd.h +++ b/drivers/media/dvb-frontends/drxd.h @@ -24,7 +24,6 @@ #ifndef _DRXD_H_ #define _DRXD_H_ -#include #include #include diff --git a/drivers/media/dvb-frontends/drxk.h b/drivers/media/dvb-frontends/drxk.h index 8f0b9ee..a629897 100644 --- a/drivers/media/dvb-frontends/drxk.h +++ b/drivers/media/dvb-frontends/drxk.h @@ -1,7 +1,6 @@ #ifndef _DRXK_H_ #define _DRXK_H_ -#include #include #include diff --git a/drivers/media/dvb-frontends/ds3000.h b/drivers/media/dvb-frontends/ds3000.h index 153169d..82e8c25 100644 --- a/drivers/media/dvb-frontends/ds3000.h +++ b/drivers/media/dvb-frontends/ds3000.h @@ -22,7 +22,6 @@ #ifndef DS3000_H #define DS3000_H -#include #include struct ds3000_config { diff --git a/drivers/media/dvb-frontends/dvb_dummy_fe.h b/drivers/media/dvb-frontends/dvb_dummy_fe.h index 15e4cea..50f1af5 100644 --- a/drivers/media/dvb-frontends/dvb_dummy_fe.h +++ b/drivers/media/dvb-frontends/dvb_dummy_fe.h @@ -22,7 +22,6 @@ #ifndef DVB_DUMMY_FE_H #define DVB_DUMMY_FE_H -#include #include #include "dvb_frontend.h" diff --git a/drivers/media/dvb-frontends/ec100.h b/drivers/media/dvb-frontends/ec100.h index 9544bab..e894bdc 100644 --- a/drivers/media/dvb-frontends/ec100.h +++ b/drivers/media/dvb-frontends/ec100.h @@ -22,7 +22,6 @@ #ifndef EC100_H #define EC100_H -#include #include struct ec100_config { diff --git a/drivers/media/dvb-frontends/hd29l2.h b/drivers/media/dvb-frontends/hd29l2.h index 48e9ab7..a14d6f3 100644 --- a/drivers/media/dvb-frontends/hd29l2.h +++ b/drivers/media/dvb-frontends/hd29l2.h @@ -23,7 +23,6 @@ #ifndef HD29L2_H #define HD29L2_H -#include #include struct hd29l2_config { diff --git a/drivers/media/dvb-frontends/helene.h b/drivers/media/dvb-frontends/helene.h index e1b9224..3336154 100644 --- a/drivers/media/dvb-frontends/helene.h +++ b/drivers/media/dvb-frontends/helene.h @@ -21,7 +21,6 @@ #ifndef __DVB_HELENE_H__ #define __DVB_HELENE_H__ -#include #include #include diff --git a/drivers/media/dvb-frontends/horus3a.h b/drivers/media/dvb-frontends/horus3a.h index c1e2d18..672a556 100644 --- a/drivers/media/dvb-frontends/horus3a.h +++ b/drivers/media/dvb-frontends/horus3a.h @@ -22,7 +22,6 @@ #ifndef __DVB_HORUS3A_H__ #define __DVB_HORUS3A_H__ -#include #include #include diff --git a/drivers/media/dvb-frontends/ix2505v.h b/drivers/media/dvb-frontends/ix2505v.h index af107a2..5eab397 100644 --- a/drivers/media/dvb-frontends/ix2505v.h +++ b/drivers/media/dvb-frontends/ix2505v.h @@ -20,7 +20,6 @@ #ifndef DVB_IX2505V_H #define DVB_IX2505V_H -#include #include #include "dvb_frontend.h" diff --git a/drivers/media/dvb-frontends/lg2160.h b/drivers/media/dvb-frontends/lg2160.h index d20bd90..8c74ddc 100644 --- a/drivers/media/dvb-frontends/lg2160.h +++ b/drivers/media/dvb-frontends/lg2160.h @@ -22,7 +22,6 @@ #ifndef _LG2160_H_ #define _LG2160_H_ -#include #include #include "dvb_frontend.h" diff --git a/drivers/media/dvb-frontends/lgdt3305.h b/drivers/media/dvb-frontends/lgdt3305.h index f91a1b4..e7dceb6 100644 --- a/drivers/media/dvb-frontends/lgdt3305.h +++ b/drivers/media/dvb-frontends/lgdt3305.h @@ -22,7 +22,6 @@ #ifndef _LGDT3305_H_ #define _LGDT3305_H_ -#include #include #include "dvb_frontend.h" diff --git a/drivers/media/dvb-frontends/lgs8gl5.h b/drivers/media/dvb-frontends/lgs8gl5.h index a5b3faf..f36a7fd 100644 --- a/drivers/media/dvb-frontends/lgs8gl5.h +++ b/drivers/media/dvb-frontends/lgs8gl5.h @@ -23,7 +23,6 @@ #ifndef LGS8GL5_H #define LGS8GL5_H -#include #include struct lgs8gl5_config { diff --git a/drivers/media/dvb-frontends/lgs8gxx.h b/drivers/media/dvb-frontends/lgs8gxx.h index 368c992..7519c02 100644 --- a/drivers/media/dvb-frontends/lgs8gxx.h +++ b/drivers/media/dvb-frontends/lgs8gxx.h @@ -26,7 +26,6 @@ #ifndef __LGS8GXX_H__ #define __LGS8GXX_H__ -#include #include #include diff --git a/drivers/media/dvb-frontends/lnbh24.h b/drivers/media/dvb-frontends/lnbh24.h index a088b8e..24431df 100644 --- a/drivers/media/dvb-frontends/lnbh24.h +++ b/drivers/media/dvb-frontends/lnbh24.h @@ -23,8 +23,6 @@ #ifndef _LNBH24_H #define _LNBH24_H -#include - /* system register bits */ #define LNBH24_OLF 0x01 #define LNBH24_OTF 0x02 diff --git a/drivers/media/dvb-frontends/lnbh25.h b/drivers/media/dvb-frontends/lnbh25.h index 1f329ef..f13fd03 100644 --- a/drivers/media/dvb-frontends/lnbh25.h +++ b/drivers/media/dvb-frontends/lnbh25.h @@ -22,7 +22,6 @@ #define LNBH25_H #include -#include #include /* 22 kHz tone enabled. Tone output controlled by DSQIN pin */ diff --git a/drivers/media/dvb-frontends/lnbp21.h b/drivers/media/dvb-frontends/lnbp21.h index cd9101f..4bb6439 100644 --- a/drivers/media/dvb-frontends/lnbp21.h +++ b/drivers/media/dvb-frontends/lnbp21.h @@ -27,8 +27,6 @@ #ifndef _LNBP21_H #define _LNBP21_H -#include - /* system register bits */ /* [RO] 0=OK; 1=over current limit flag */ #define LNBP21_OLF 0x01 diff --git a/drivers/media/dvb-frontends/lnbp22.h b/drivers/media/dvb-frontends/lnbp22.h index 5d01d92..0cb7212 100644 --- a/drivers/media/dvb-frontends/lnbp22.h +++ b/drivers/media/dvb-frontends/lnbp22.h @@ -28,8 +28,6 @@ #ifndef _LNBP22_H #define _LNBP22_H -#include - /* Enable */ #define LNBP22_EN 0x10 /* Voltage selection */ diff --git a/drivers/media/dvb-frontends/m88rs2000.h b/drivers/media/dvb-frontends/m88rs2000.h index de74301..1a313b0 100644 --- a/drivers/media/dvb-frontends/m88rs2000.h +++ b/drivers/media/dvb-frontends/m88rs2000.h @@ -20,7 +20,6 @@ #ifndef M88RS2000_H #define M88RS2000_H -#include #include #include "dvb_frontend.h" diff --git a/drivers/media/dvb-frontends/mb86a20s.h b/drivers/media/dvb-frontends/mb86a20s.h index a113282..dfb02db 100644 --- a/drivers/media/dvb-frontends/mb86a20s.h +++ b/drivers/media/dvb-frontends/mb86a20s.h @@ -16,7 +16,6 @@ #ifndef MB86A20S_H #define MB86A20S_H -#include #include /** diff --git a/drivers/media/dvb-frontends/s5h1409.h b/drivers/media/dvb-frontends/s5h1409.h index f58b9ca..b38557c 100644 --- a/drivers/media/dvb-frontends/s5h1409.h +++ b/drivers/media/dvb-frontends/s5h1409.h @@ -22,7 +22,6 @@ #ifndef __S5H1409_H__ #define __S5H1409_H__ -#include #include struct s5h1409_config { diff --git a/drivers/media/dvb-frontends/s5h1411.h b/drivers/media/dvb-frontends/s5h1411.h index f3a87f7..791bab0 100644 --- a/drivers/media/dvb-frontends/s5h1411.h +++ b/drivers/media/dvb-frontends/s5h1411.h @@ -22,7 +22,6 @@ #ifndef __S5H1411_H__ #define __S5H1411_H__ -#include #include #define S5H1411_I2C_TOP_ADDR (0x32 >> 1) diff --git a/drivers/media/dvb-frontends/s5h1432.h b/drivers/media/dvb-frontends/s5h1432.h index f490c5e..b81c9bd 100644 --- a/drivers/media/dvb-frontends/s5h1432.h +++ b/drivers/media/dvb-frontends/s5h1432.h @@ -22,7 +22,6 @@ #ifndef __S5H1432_H__ #define __S5H1432_H__ -#include #include #define S5H1432_I2C_TOP_ADDR (0x02 >> 1) diff --git a/drivers/media/dvb-frontends/s921.h b/drivers/media/dvb-frontends/s921.h index f5b722d..a47ed89 100644 --- a/drivers/media/dvb-frontends/s921.h +++ b/drivers/media/dvb-frontends/s921.h @@ -17,7 +17,6 @@ #ifndef S921_H #define S921_H -#include #include struct s921_config { diff --git a/drivers/media/dvb-frontends/si21xx.h b/drivers/media/dvb-frontends/si21xx.h index ef5f351..b1be62f 100644 --- a/drivers/media/dvb-frontends/si21xx.h +++ b/drivers/media/dvb-frontends/si21xx.h @@ -1,7 +1,6 @@ #ifndef SI21XX_H #define SI21XX_H -#include #include #include "dvb_frontend.h" diff --git a/drivers/media/dvb-frontends/sp2.h b/drivers/media/dvb-frontends/sp2.h index 6cceea0..3901cd7 100644 --- a/drivers/media/dvb-frontends/sp2.h +++ b/drivers/media/dvb-frontends/sp2.h @@ -17,7 +17,6 @@ #ifndef SP2_H #define SP2_H -#include #include "dvb_ca_en50221.h" /* diff --git a/drivers/media/dvb-frontends/stb6000.h b/drivers/media/dvb-frontends/stb6000.h index da581b6..78e75df 100644 --- a/drivers/media/dvb-frontends/stb6000.h +++ b/drivers/media/dvb-frontends/stb6000.h @@ -23,7 +23,6 @@ #ifndef __DVB_STB6000_H__ #define __DVB_STB6000_H__ -#include #include #include "dvb_frontend.h" diff --git a/drivers/media/dvb-frontends/stv0288.h b/drivers/media/dvb-frontends/stv0288.h index b58603c..803acb9 100644 --- a/drivers/media/dvb-frontends/stv0288.h +++ b/drivers/media/dvb-frontends/stv0288.h @@ -27,7 +27,6 @@ #ifndef STV0288_H #define STV0288_H -#include #include #include "dvb_frontend.h" diff --git a/drivers/media/dvb-frontends/stv0367.h b/drivers/media/dvb-frontends/stv0367.h index 92b3e85..b88166a 100644 --- a/drivers/media/dvb-frontends/stv0367.h +++ b/drivers/media/dvb-frontends/stv0367.h @@ -26,7 +26,6 @@ #ifndef STV0367_H #define STV0367_H -#include #include #include "dvb_frontend.h" diff --git a/drivers/media/dvb-frontends/stv0900.h b/drivers/media/dvb-frontends/stv0900.h index c90bf00..9ca2da9 100644 --- a/drivers/media/dvb-frontends/stv0900.h +++ b/drivers/media/dvb-frontends/stv0900.h @@ -26,7 +26,6 @@ #ifndef STV0900_H #define STV0900_H -#include #include #include "dvb_frontend.h" diff --git a/drivers/media/dvb-frontends/stv6110.h b/drivers/media/dvb-frontends/stv6110.h index f3c8a5c..4604f79 100644 --- a/drivers/media/dvb-frontends/stv6110.h +++ b/drivers/media/dvb-frontends/stv6110.h @@ -25,7 +25,6 @@ #ifndef __DVB_STV6110_H__ #define __DVB_STV6110_H__ -#include #include #include "dvb_frontend.h" diff --git a/drivers/media/dvb-frontends/tda10048.h b/drivers/media/dvb-frontends/tda10048.h index bc77a73..a2cebb0 100644 --- a/drivers/media/dvb-frontends/tda10048.h +++ b/drivers/media/dvb-frontends/tda10048.h @@ -22,7 +22,6 @@ #ifndef TDA10048_H #define TDA10048_H -#include #include #include diff --git a/drivers/media/dvb-frontends/tda18271c2dd.h b/drivers/media/dvb-frontends/tda18271c2dd.h index 7ebd8ea..e6ccf24 100644 --- a/drivers/media/dvb-frontends/tda18271c2dd.h +++ b/drivers/media/dvb-frontends/tda18271c2dd.h @@ -1,8 +1,6 @@ #ifndef _TDA18271C2DD_H_ #define _TDA18271C2DD_H_ -#include - #if IS_REACHABLE(CONFIG_DVB_TDA18271C2DD) struct dvb_frontend *tda18271c2dd_attach(struct dvb_frontend *fe, struct i2c_adapter *i2c, u8 adr); diff --git a/drivers/media/dvb-frontends/ts2020.h b/drivers/media/dvb-frontends/ts2020.h index 9220e5c..facc54f 100644 --- a/drivers/media/dvb-frontends/ts2020.h +++ b/drivers/media/dvb-frontends/ts2020.h @@ -22,7 +22,6 @@ #ifndef TS2020_H #define TS2020_H -#include #include struct ts2020_config { diff --git a/drivers/media/dvb-frontends/zl10036.h b/drivers/media/dvb-frontends/zl10036.h index 670e76a..c568d8d 100644 --- a/drivers/media/dvb-frontends/zl10036.h +++ b/drivers/media/dvb-frontends/zl10036.h @@ -21,7 +21,6 @@ #ifndef DVB_ZL10036_H #define DVB_ZL10036_H -#include #include #include "dvb_frontend.h" diff --git a/drivers/media/dvb-frontends/zl10039.h b/drivers/media/dvb-frontends/zl10039.h index 0709294..66e7085 100644 --- a/drivers/media/dvb-frontends/zl10039.h +++ b/drivers/media/dvb-frontends/zl10039.h @@ -22,8 +22,6 @@ #ifndef ZL10039_H #define ZL10039_H -#include - #if IS_REACHABLE(CONFIG_DVB_ZL10039) struct dvb_frontend *zl10039_attach(struct dvb_frontend *fe, u8 i2c_addr, diff --git a/drivers/media/pci/cx23885/altera-ci.h b/drivers/media/pci/cx23885/altera-ci.h index 6c51172..57a40c8 100644 --- a/drivers/media/pci/cx23885/altera-ci.h +++ b/drivers/media/pci/cx23885/altera-ci.h @@ -20,8 +20,6 @@ #ifndef __ALTERA_CI_H #define __ALTERA_CI_H -#include - #define ALT_DATA 0x000000ff #define ALT_TDI 0x00008000 #define ALT_TDO 0x00004000 diff --git a/drivers/media/tuners/fc0011.h b/drivers/media/tuners/fc0011.h index 81bb568..438cf89 100644 --- a/drivers/media/tuners/fc0011.h +++ b/drivers/media/tuners/fc0011.h @@ -1,7 +1,6 @@ #ifndef LINUX_FC0011_H_ #define LINUX_FC0011_H_ -#include #include "dvb_frontend.h" diff --git a/drivers/media/tuners/fc0012.h b/drivers/media/tuners/fc0012.h index 9ad3285..4a23e41 100644 --- a/drivers/media/tuners/fc0012.h +++ b/drivers/media/tuners/fc0012.h @@ -21,7 +21,6 @@ #ifndef _FC0012_H_ #define _FC0012_H_ -#include #include "dvb_frontend.h" #include "fc001x-common.h" diff --git a/drivers/media/tuners/fc0013.h b/drivers/media/tuners/fc0013.h index e130bd7..8c34105 100644 --- a/drivers/media/tuners/fc0013.h +++ b/drivers/media/tuners/fc0013.h @@ -22,7 +22,6 @@ #ifndef _FC0013_H_ #define _FC0013_H_ -#include #include "dvb_frontend.h" #include "fc001x-common.h" diff --git a/drivers/media/tuners/max2165.h b/drivers/media/tuners/max2165.h index 5054f01..aadd9fe 100644 --- a/drivers/media/tuners/max2165.h +++ b/drivers/media/tuners/max2165.h @@ -22,8 +22,6 @@ #ifndef __MAX2165_H__ #define __MAX2165_H__ -#include - struct dvb_frontend; struct i2c_adapter; diff --git a/drivers/media/tuners/mc44s803.h b/drivers/media/tuners/mc44s803.h index b3e614b..6b40df3 100644 --- a/drivers/media/tuners/mc44s803.h +++ b/drivers/media/tuners/mc44s803.h @@ -22,8 +22,6 @@ #ifndef MC44S803_H #define MC44S803_H -#include - struct dvb_frontend; struct i2c_adapter; diff --git a/drivers/media/tuners/mxl5005s.h b/drivers/media/tuners/mxl5005s.h index 5764b12..d842734 100644 --- a/drivers/media/tuners/mxl5005s.h +++ b/drivers/media/tuners/mxl5005s.h @@ -23,8 +23,6 @@ #ifndef __MXL5005S_H #define __MXL5005S_H -#include - #include #include "dvb_frontend.h" diff --git a/drivers/media/tuners/r820t.h b/drivers/media/tuners/r820t.h index b1e5661..fdcab91 100644 --- a/drivers/media/tuners/r820t.h +++ b/drivers/media/tuners/r820t.h @@ -21,7 +21,6 @@ #ifndef R820T_H #define R820T_H -#include #include "dvb_frontend.h" enum r820t_chip { diff --git a/drivers/media/tuners/si2157.h b/drivers/media/tuners/si2157.h index 5f1a60b..76807f5 100644 --- a/drivers/media/tuners/si2157.h +++ b/drivers/media/tuners/si2157.h @@ -17,7 +17,6 @@ #ifndef SI2157_H #define SI2157_H -#include #include #include "dvb_frontend.h" diff --git a/drivers/media/tuners/tda18212.h b/drivers/media/tuners/tda18212.h index e58c909..6391daf 100644 --- a/drivers/media/tuners/tda18212.h +++ b/drivers/media/tuners/tda18212.h @@ -21,7 +21,6 @@ #ifndef TDA18212_H #define TDA18212_H -#include #include "dvb_frontend.h" struct tda18212_config { diff --git a/drivers/media/tuners/tda18218.h b/drivers/media/tuners/tda18218.h index 1eacb4f..076b5f2 100644 --- a/drivers/media/tuners/tda18218.h +++ b/drivers/media/tuners/tda18218.h @@ -21,7 +21,6 @@ #ifndef TDA18218_H #define TDA18218_H -#include #include "dvb_frontend.h" struct tda18218_config { diff --git a/drivers/media/tuners/xc5000.h b/drivers/media/tuners/xc5000.h index 00ba29e..336bd49 100644 --- a/drivers/media/tuners/xc5000.h +++ b/drivers/media/tuners/xc5000.h @@ -22,7 +22,6 @@ #ifndef __XC5000_H__ #define __XC5000_H__ -#include #include struct dvb_frontend; diff --git a/drivers/media/usb/dvb-usb-v2/mxl111sf-demod.h b/drivers/media/usb/dvb-usb-v2/mxl111sf-demod.h index 7065aca..e6eae9d 100644 --- a/drivers/media/usb/dvb-usb-v2/mxl111sf-demod.h +++ b/drivers/media/usb/dvb-usb-v2/mxl111sf-demod.h @@ -21,7 +21,6 @@ #ifndef __MXL111SF_DEMOD_H__ #define __MXL111SF_DEMOD_H__ -#include #include "dvb_frontend.h" #include "mxl111sf.h" diff --git a/drivers/media/usb/dvb-usb-v2/mxl111sf-tuner.h b/drivers/media/usb/dvb-usb-v2/mxl111sf-tuner.h index 509b550..e96d9a4 100644 --- a/drivers/media/usb/dvb-usb-v2/mxl111sf-tuner.h +++ b/drivers/media/usb/dvb-usb-v2/mxl111sf-tuner.h @@ -21,7 +21,6 @@ #ifndef __MXL111SF_TUNER_H__ #define __MXL111SF_TUNER_H__ -#include #include "dvb_frontend.h" #include "mxl111sf.h" diff --git a/drivers/media/usb/dvb-usb/dibusb-common.c b/drivers/media/usb/dvb-usb/dibusb-common.c index 6eea4e6..9e94b17 100644 --- a/drivers/media/usb/dvb-usb/dibusb-common.c +++ b/drivers/media/usb/dvb-usb/dibusb-common.c @@ -9,7 +9,6 @@ * see Documentation/dvb/README.dvb-usb for more information */ -#include #include "dibusb.h" /* Max transfer size done by I2C transfer functions */ diff --git a/drivers/media/usb/hdpvr/hdpvr-video.c b/drivers/media/usb/hdpvr/hdpvr-video.c index 6d43d75..474c11e 100644 --- a/drivers/media/usb/hdpvr/hdpvr-video.c +++ b/drivers/media/usb/hdpvr/hdpvr-video.c @@ -10,7 +10,6 @@ */ #include -#include #include #include #include diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 4f27048..d46e4ad 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index c1f34f0..fccdd49 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -30,7 +30,6 @@ #include #include #include -#include #include "mtdcore.h" diff --git a/drivers/net/dsa/b53/b53_mmap.c b/drivers/net/dsa/b53/b53_mmap.c index cc9e6bd..76fb855 100644 --- a/drivers/net/dsa/b53/b53_mmap.c +++ b/drivers/net/dsa/b53/b53_mmap.c @@ -17,7 +17,6 @@ */ #include -#include #include #include #include diff --git a/drivers/net/ethernet/sun/ldmvsw.c b/drivers/net/ethernet/sun/ldmvsw.c index e15bf84..0ac449a 100644 --- a/drivers/net/ethernet/sun/ldmvsw.c +++ b/drivers/net/ethernet/sun/ldmvsw.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/wiznet/w5100.c b/drivers/net/ethernet/wiznet/w5100.c index 37ab46c..d2349a1 100644 --- a/drivers/net/ethernet/wiznet/w5100.c +++ b/drivers/net/ethernet/wiznet/w5100.c @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/wiznet/w5300.c b/drivers/net/ethernet/wiznet/w5300.c index 0b37ce9..ca31a57 100644 --- a/drivers/net/ethernet/wiznet/w5300.c +++ b/drivers/net/ethernet/wiznet/w5300.c @@ -10,7 +10,6 @@ #include #include -#include #include #include #include diff --git a/drivers/usb/early/ehci-dbgp.c b/drivers/usb/early/ehci-dbgp.c index 12731e6..ea73afb 100644 --- a/drivers/usb/early/ehci-dbgp.c +++ b/drivers/usb/early/ehci-dbgp.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/usb/gadget/udc/bcm63xx_udc.c b/drivers/usb/gadget/udc/bcm63xx_udc.c index f5fccb3..f785032 100644 --- a/drivers/usb/gadget/udc/bcm63xx_udc.c +++ b/drivers/usb/gadget/udc/bcm63xx_udc.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/usb/host/pci-quirks.c b/drivers/usb/host/pci-quirks.c index 35af362..d793f54 100644 --- a/drivers/usb/host/pci-quirks.c +++ b/drivers/usb/host/pci-quirks.c @@ -9,7 +9,6 @@ */ #include -#include #include #include #include diff --git a/fs/lockd/procfs.h b/fs/lockd/procfs.h index 2257a13..184a15e 100644 --- a/fs/lockd/procfs.h +++ b/fs/lockd/procfs.h @@ -6,8 +6,6 @@ #ifndef _LOCKD_PROCFS_H #define _LOCKD_PROCFS_H -#include - #if IS_ENABLED(CONFIG_PROC_FS) int lockd_create_procfs(void); void lockd_remove_procfs(void); diff --git a/include/linux/export.h b/include/linux/export.h index c565f87..d7df492 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -78,7 +78,6 @@ extern struct module __this_module; #elif defined(CONFIG_TRIM_UNUSED_KSYMS) -#include #include #define __EXPORT_SYMBOL(sym, sec) \ diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 1f0be72..24e2cc5 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -8,7 +8,6 @@ #include #include #include -#include struct gpio_desc; struct of_phandle_args; diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index c68ff3d..e49121e 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -20,8 +20,6 @@ #include "main.h" -#include - struct net_device; #define BATADV_DEBUGFS_SUBDIR "batman_adv" diff --git a/sound/soc/intel/common/sst-acpi.h b/sound/soc/intel/common/sst-acpi.h index 5d29493..0127422 100644 --- a/sound/soc/intel/common/sst-acpi.h +++ b/sound/soc/intel/common/sst-acpi.h @@ -12,7 +12,6 @@ * */ -#include #include #include diff --git a/tools/testing/nvdimm/config_check.c b/tools/testing/nvdimm/config_check.c index 878daf3..7dc5a0a 100644 --- a/tools/testing/nvdimm/config_check.c +++ b/tools/testing/nvdimm/config_check.c @@ -1,4 +1,3 @@ -#include #include void check(void) -- cgit v0.10.2 From 5114a97a8bce7f4ead29a32b67dee85438699b9e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 11 Oct 2016 13:56:01 -0700 Subject: fs: use mapping_set_error instead of opencoded set_bit The mapping_set_error() helper sets the correct AS_ flag for the mapping so there is no reason to open code it. Use the helper directly. [akpm@linux-foundation.org: be honest about conversion from -ENXIO to -EIO] Link: http://lkml.kernel.org/r/20160912111608.2588-2-mhocko@kernel.org Signed-off-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/staging/lustre/lustre/llite/vvp_page.c b/drivers/staging/lustre/lustre/llite/vvp_page.c index 5d79efc..046e84d 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_page.c +++ b/drivers/staging/lustre/lustre/llite/vvp_page.c @@ -241,10 +241,7 @@ static void vvp_vmpage_error(struct inode *inode, struct page *vmpage, int ioret obj->vob_discard_page_warned = 0; } else { SetPageError(vmpage); - if (ioret == -ENOSPC) - set_bit(AS_ENOSPC, &inode->i_mapping->flags); - else - set_bit(AS_EIO, &inode->i_mapping->flags); + mapping_set_error(inode->i_mapping, ioret); if ((ioret == -ESHUTDOWN || ioret == -EINTR) && obj->vob_discard_page_warned == 0) { diff --git a/fs/afs/write.c b/fs/afs/write.c index 14d506e..f865c3f 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -398,8 +398,7 @@ no_more: switch (ret) { case -EDQUOT: case -ENOSPC: - set_bit(AS_ENOSPC, - &wb->vnode->vfs_inode.i_mapping->flags); + mapping_set_error(wb->vnode->vfs_inode.i_mapping, -ENOSPC); break; case -EROFS: case -EIO: @@ -409,7 +408,7 @@ no_more: case -ENOMEDIUM: case -ENXIO: afs_kill_pages(wb->vnode, true, first, last); - set_bit(AS_EIO, &wb->vnode->vfs_inode.i_mapping->flags); + mapping_set_error(wb->vnode->vfs_inode.i_mapping, -EIO); break; case -EACCES: case -EPERM: diff --git a/fs/buffer.c b/fs/buffer.c index 7dad871..b205a62 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -351,7 +351,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) set_buffer_uptodate(bh); } else { buffer_io_error(bh, ", lost async page write"); - set_bit(AS_EIO, &page->mapping->flags); + mapping_set_error(page->mapping, -EIO); set_buffer_write_io_error(bh); clear_buffer_uptodate(bh); SetPageError(page); @@ -3249,7 +3249,7 @@ drop_buffers(struct page *page, struct buffer_head **buffers_to_free) bh = head; do { if (buffer_write_io_error(bh) && page->mapping) - set_bit(AS_EIO, &page->mapping->flags); + mapping_set_error(page->mapping, -EIO); if (buffer_busy(bh)) goto failed; bh = bh->b_this_page; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index d42ff52..d8072bc 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -778,7 +778,7 @@ try_again: fail: EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n", inode->i_ino, page->index, ret); - set_bit(AS_EIO, &page->mapping->flags); + mapping_set_error(page->mapping, -EIO); unlock_page(page); return ret; } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index b4cbee9..0094923 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -88,7 +88,7 @@ static void ext4_finish_bio(struct bio *bio) if (bio->bi_error) { SetPageError(page); - set_bit(AS_EIO, &page->mapping->flags); + mapping_set_error(page->mapping, -EIO); } bh = head = page_buffers(page); /* diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0d0177c..9ae194f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -75,7 +75,7 @@ static void f2fs_write_end_io(struct bio *bio) fscrypt_pullback_bio_page(&page, true); if (unlikely(bio->bi_error)) { - set_bit(AS_EIO, &page->mapping->flags); + mapping_set_error(page->mapping, -EIO); f2fs_stop_checkpoint(sbi, true); } end_page_writeback(page); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 5bb565f..31f8ca0 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -269,8 +269,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal, * filemap_fdatawait_range(), set it again so * that user process can get -EIO from fsync(). */ - set_bit(AS_EIO, - &jinode->i_vfs_inode->i_mapping->flags); + mapping_set_error(jinode->i_vfs_inode->i_mapping, -EIO); if (!ret) ret = err; -- cgit v0.10.2 From 9c5d760b8d229b94c5030863a5edaee5f1a9d7b7 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 11 Oct 2016 13:56:04 -0700 Subject: mm: split gfp_mask and mapping flags into separate fields mapping->flags currently encodes two different things into a single flag. It contains sticky gfp_mask for page cache allocations and AS_ codes used to report errors/enospace and other states which are mapping specific. Condensing the two semantically unrelated things saves few bytes but it also complicates other things. For one thing the gfp flags space is reduced and in fact we are already running out of available bits. It can be assumed that more gfp flags will be necessary later on. To not introduce the address_space grow (at least on x86_64) we can stick it right after private_lock because we have a hole there. struct address_space { struct inode * host; /* 0 8 */ struct radix_tree_root page_tree; /* 8 16 */ spinlock_t tree_lock; /* 24 4 */ atomic_t i_mmap_writable; /* 28 4 */ struct rb_root i_mmap; /* 32 8 */ struct rw_semaphore i_mmap_rwsem; /* 40 40 */ /* --- cacheline 1 boundary (64 bytes) was 16 bytes ago --- */ long unsigned int nrpages; /* 80 8 */ long unsigned int nrexceptional; /* 88 8 */ long unsigned int writeback_index; /* 96 8 */ const struct address_space_operations * a_ops; /* 104 8 */ long unsigned int flags; /* 112 8 */ spinlock_t private_lock; /* 120 4 */ /* XXX 4 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ struct list_head private_list; /* 128 16 */ void * private_data; /* 144 8 */ /* size: 152, cachelines: 3, members: 14 */ /* sum members: 148, holes: 1, sum holes: 4 */ /* last cacheline: 24 bytes */ }; Link: http://lkml.kernel.org/r/20160912114852.GI14524@dhcp22.suse.cz Signed-off-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/fs.h b/include/linux/fs.h index c145219..bc65d59 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -440,8 +440,9 @@ struct address_space { unsigned long nrexceptional; pgoff_t writeback_index;/* writeback starts here */ const struct address_space_operations *a_ops; /* methods */ - unsigned long flags; /* error bits/gfp mask */ + unsigned long flags; /* error bits */ spinlock_t private_lock; /* for use by the address_space */ + gfp_t gfp_mask; /* implicit gfp mask for allocations */ struct list_head private_list; /* ditto */ void *private_data; /* ditto */ } __attribute__((aligned(sizeof(long)))); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 747f401..dd15d39 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -16,17 +16,16 @@ #include /* - * Bits in mapping->flags. The lower __GFP_BITS_SHIFT bits are the page - * allocation mode flags. + * Bits in mapping->flags. */ enum mapping_flags { - AS_EIO = __GFP_BITS_SHIFT + 0, /* IO error on async write */ - AS_ENOSPC = __GFP_BITS_SHIFT + 1, /* ENOSPC on async write */ - AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */ - AS_UNEVICTABLE = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */ - AS_EXITING = __GFP_BITS_SHIFT + 4, /* final truncate in progress */ + AS_EIO = 0, /* IO error on async write */ + AS_ENOSPC = 1, /* ENOSPC on async write */ + AS_MM_ALL_LOCKS = 2, /* under mm_take_all_locks() */ + AS_UNEVICTABLE = 3, /* e.g., ramdisk, SHM_LOCK */ + AS_EXITING = 4, /* final truncate in progress */ /* writeback related tags are not used */ - AS_NO_WRITEBACK_TAGS = __GFP_BITS_SHIFT + 5, + AS_NO_WRITEBACK_TAGS = 5, }; static inline void mapping_set_error(struct address_space *mapping, int error) @@ -78,7 +77,7 @@ static inline int mapping_use_writeback_tags(struct address_space *mapping) static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { - return (__force gfp_t)mapping->flags & __GFP_BITS_MASK; + return mapping->gfp_mask; } /* Restricts the given gfp_mask to what the mapping allows. */ @@ -94,8 +93,7 @@ static inline gfp_t mapping_gfp_constraint(struct address_space *mapping, */ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) { - m->flags = (m->flags & ~(__force unsigned long)__GFP_BITS_MASK) | - (__force unsigned long)mask; + m->gfp_mask = mask; } void release_pages(struct page **pages, int nr, bool cold); -- cgit v0.10.2