From 1edf223485c42c99655dcd001db1e46ad5e5d2d7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Jan 2012 15:06:57 -0800 Subject: mm/page-writeback.c: make determine_dirtyable_memory static again The tracing ring-buffer used this function briefly, but not anymore. Make it local to the writeback code again. Also, move the function so that no forward declaration needs to be reintroduced. Signed-off-by: Johannes Weiner Acked-by: Mel Gorman Reviewed-by: Michal Hocko Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a378c29..34a0055 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -138,8 +138,6 @@ extern int vm_highmem_is_dirtyable; extern int block_dump; extern int laptop_mode; -extern unsigned long determine_dirtyable_memory(void); - extern int dirty_background_ratio_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 8616ef3..c081bf6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -130,6 +130,66 @@ unsigned long global_dirty_limit; static struct prop_descriptor vm_completions; /* + * Work out the current dirty-memory clamping and background writeout + * thresholds. + * + * The main aim here is to lower them aggressively if there is a lot of mapped + * memory around. To avoid stressing page reclaim with lots of unreclaimable + * pages. It is better to clamp down on writers than to start swapping, and + * performing lots of scanning. + * + * We only allow 1/2 of the currently-unmapped memory to be dirtied. + * + * We don't permit the clamping level to fall below 5% - that is getting rather + * excessive. + * + * We make sure that the background writeout level is below the adjusted + * clamping level. 
+ */ +static unsigned long highmem_dirtyable_memory(unsigned long total) +{ +#ifdef CONFIG_HIGHMEM + int node; + unsigned long x = 0; + + for_each_node_state(node, N_HIGH_MEMORY) { + struct zone *z = + &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; + + x += zone_page_state(z, NR_FREE_PAGES) + + zone_reclaimable_pages(z); + } + /* + * Make sure that the number of highmem pages is never larger + * than the number of the total dirtyable memory. This can only + * occur in very strange VM situations but we want to make sure + * that this does not occur. + */ + return min(x, total); +#else + return 0; +#endif +} + +/** + * determine_dirtyable_memory - amount of memory that may be used + * + * Returns the numebr of pages that can currently be freed and used + * by the kernel for direct mappings. + */ +static unsigned long determine_dirtyable_memory(void) +{ + unsigned long x; + + x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); + + if (!vm_highmem_is_dirtyable) + x -= highmem_dirtyable_memory(x); + + return x + 1; /* Ensure that we never return 0 */ +} + +/* * couple the period to the dirty_ratio: * * period/2 ~ roundup_pow_of_two(dirty limit) @@ -196,7 +256,6 @@ int dirty_ratio_handler(struct ctl_table *table, int write, return ret; } - int dirty_bytes_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -291,67 +350,6 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) } EXPORT_SYMBOL(bdi_set_max_ratio); -/* - * Work out the current dirty-memory clamping and background writeout - * thresholds. - * - * The main aim here is to lower them aggressively if there is a lot of mapped - * memory around. To avoid stressing page reclaim with lots of unreclaimable - * pages. It is better to clamp down on writers than to start swapping, and - * performing lots of scanning. - * - * We only allow 1/2 of the currently-unmapped memory to be dirtied. 
- * - * We don't permit the clamping level to fall below 5% - that is getting rather - * excessive. - * - * We make sure that the background writeout level is below the adjusted - * clamping level. - */ - -static unsigned long highmem_dirtyable_memory(unsigned long total) -{ -#ifdef CONFIG_HIGHMEM - int node; - unsigned long x = 0; - - for_each_node_state(node, N_HIGH_MEMORY) { - struct zone *z = - &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; - - x += zone_page_state(z, NR_FREE_PAGES) + - zone_reclaimable_pages(z); - } - /* - * Make sure that the number of highmem pages is never larger - * than the number of the total dirtyable memory. This can only - * occur in very strange VM situations but we want to make sure - * that this does not occur. - */ - return min(x, total); -#else - return 0; -#endif -} - -/** - * determine_dirtyable_memory - amount of memory that may be used - * - * Returns the numebr of pages that can currently be freed and used - * by the kernel for direct mappings. - */ -unsigned long determine_dirtyable_memory(void) -{ - unsigned long x; - - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); - - if (!vm_highmem_is_dirtyable) - x -= highmem_dirtyable_memory(x); - - return x + 1; /* Ensure that we never return 0 */ -} - static unsigned long dirty_freerun_ceiling(unsigned long thresh, unsigned long bg_thresh) { -- cgit v0.10.2 From 34dbc67a644f11ab3475d822d72e25409911e760 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 10 Jan 2012 15:06:59 -0800 Subject: vmscan: promote shared file mapped pages Commit 645747462435 ("vmscan: detect mapped file pages used only once") greatly decreases lifetime of single-used mapped file pages. Unfortunately it also decreases life time of all shared mapped file pages. Because after commit bf3f3bc5e7347 ("mm: don't mark_page_accessed in fault path") page-fault handler does not mark page active or even referenced. 
Thus page_check_references() activates a file page only if it was used twice while it stays on the inactive list, whereas it activates anon pages after the first access. The inactive list can be small, so the reclaimer can accidentally throw away any widely used page if it wasn't used twice within a short period. After this patch page_check_references() also activates a file mapped page at the first inactive list scan if this page is already used multiple times via several ptes. I found this while trying to fix a degradation in rhel6 (~2.6.32) from rhel5 (~2.6.18). There is a complete mess with >100 web/mail/spam/ftp containers; they share all their files but there are a lot of anonymous pages: ~500mb shared file mapped memory and 15-20Gb non-shared anonymous memory. In this situation major-pagefaults are very costly, because all containers share the same page. In my load the kernel created a disproportionate pressure on the file memory, compared with the anonymous; they equaled only if I raised swappiness up to 150 =) These patches actually didn't help a lot with my problem, but I saw a noticeable (10-20 times) reduction in the count and average time of major-pagefaults in file-mapped areas.
Actually both patches are fixes for commit v2.6.33-5448-g6457474, because it was aimed at one scenario (singly used pages), but it breaks the logic in other scenarios (shared and/or executable pages). Signed-off-by: Konstantin Khlebnikov Acked-by: Pekka Enberg Acked-by: Minchan Kim Reviewed-by: KAMEZAWA Hiroyuki Cc: Wu Fengguang Cc: Johannes Weiner Cc: Nick Piggin Cc: Mel Gorman Cc: Shaohua Li Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index 11adc89..753c1e6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -715,7 +715,7 @@ static enum page_references page_check_references(struct page *page, */ SetPageReferenced(page); - if (referenced_page) + if (referenced_page || referenced_ptes > 1) return PAGEREF_ACTIVATE; return PAGEREF_KEEP; -- cgit v0.10.2 From c909e99364c8b6ca07864d752950b6b4ecf6bef4 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 10 Jan 2012 15:07:03 -0800 Subject: vmscan: activate executable pages after first usage Logic added in commit 8cab4754d24a0 ("vmscan: make mapped executable pages the first class citizen") was noticeably weakened in commit 645747462435d84 ("vmscan: detect mapped file pages used only once"). Currently these pages can become "first class citizens" only after the second usage. After this patch page_check_references() will activate them after the first usage, and executable code gets an even better chance to stay in memory.
Signed-off-by: Konstantin Khlebnikov Cc: Pekka Enberg Cc: Minchan Kim Cc: KAMEZAWA Hiroyuki Cc: Wu Fengguang Cc: Johannes Weiner Cc: Nick Piggin Cc: Mel Gorman Cc: Shaohua Li Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index 753c1e6..753a2dc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -718,6 +718,12 @@ static enum page_references page_check_references(struct page *page, if (referenced_page || referenced_ptes > 1) return PAGEREF_ACTIVATE; + /* + * Activate file-backed executable pages after first usage. + */ + if (vm_flags & VM_EXEC) + return PAGEREF_ACTIVATE; + return PAGEREF_KEEP; } -- cgit v0.10.2 From cc59850ef940e4ee6a765d28b439b9bafe07cf63 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 10 Jan 2012 15:07:04 -0800 Subject: mm: add free_hot_cold_page_list() helper This patch adds helper free_hot_cold_page_list() to free list of 0-order pages. It frees pages directly from list without temporary page-vector. It also calls trace_mm_pagevec_free() to simulate pagevec_free() behaviour. 
bloat-o-meter: add/remove: 1/1 grow/shrink: 1/3 up/down: 267/-295 (-28) function old new delta free_hot_cold_page_list - 264 +264 get_page_from_freelist 2129 2132 +3 __pagevec_free 243 239 -4 split_free_page 380 373 -7 release_pages 606 510 -96 free_page_list 188 - -188 Signed-off-by: Konstantin Khlebnikov Cc: Mel Gorman Cc: KOSAKI Motohiro Acked-by: Minchan Kim Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 3a76faf..6562958 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -358,6 +358,7 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); extern void __free_pages(struct page *page, unsigned int order); extern void free_pages(unsigned long addr, unsigned int order); extern void free_hot_cold_page(struct page *page, int cold); +extern void free_hot_cold_page_list(struct list_head *list, int cold); #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7990ca1..cd0c95c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1189,6 +1189,19 @@ out: } /* + * Free a list of 0-order pages + */ +void free_hot_cold_page_list(struct list_head *list, int cold) +{ + struct page *page, *next; + + list_for_each_entry_safe(page, next, list, lru) { + trace_mm_pagevec_free(page, cold); + free_hot_cold_page(page, cold); + } +} + +/* * split_page takes a non-compound higher-order page, and splits it into * n (1<lru_lock, flags); - zone = NULL; - } - __pagevec_free(&pages_to_free); - pagevec_reinit(&pages_to_free); - } + list_add(&page->lru, &pages_to_free); } if (zone) spin_unlock_irqrestore(&zone->lru_lock, flags); - pagevec_free(&pages_to_free); + free_hot_cold_page_list(&pages_to_free, cold); } EXPORT_SYMBOL(release_pages); diff --git a/mm/vmscan.c b/mm/vmscan.c index 753a2dc..3d571df 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -734,24 +734,6 @@ static enum 
page_references page_check_references(struct page *page, return PAGEREF_RECLAIM; } -static noinline_for_stack void free_page_list(struct list_head *free_pages) -{ - struct pagevec freed_pvec; - struct page *page, *tmp; - - pagevec_init(&freed_pvec, 1); - - list_for_each_entry_safe(page, tmp, free_pages, lru) { - list_del(&page->lru); - if (!pagevec_add(&freed_pvec, page)) { - __pagevec_free(&freed_pvec); - pagevec_reinit(&freed_pvec); - } - } - - pagevec_free(&freed_pvec); -} - /* * shrink_page_list() returns the number of reclaimed pages */ @@ -1015,7 +997,7 @@ keep_lumpy: if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc)) zone_set_flag(zone, ZONE_CONGESTED); - free_page_list(&free_pages); + free_hot_cold_page_list(&free_pages, 1); list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); -- cgit v0.10.2 From da066ad3570b88e7dee82e76a06ee9a7adffcf0d Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 10 Jan 2012 15:07:06 -0800 Subject: mm: remove unused pagevec_free It is not exported and now nobody uses it.
Signed-off-by: Konstantin Khlebnikov Cc: Mel Gorman Cc: KOSAKI Motohiro Reviewed-by: Minchan Kim Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index bab82f4..ed17024 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -21,7 +21,6 @@ struct pagevec { }; void __pagevec_release(struct pagevec *pvec); -void __pagevec_free(struct pagevec *pvec); void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru); void pagevec_strip(struct pagevec *pvec); unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, @@ -67,12 +66,6 @@ static inline void pagevec_release(struct pagevec *pvec) __pagevec_release(pvec); } -static inline void pagevec_free(struct pagevec *pvec) -{ - if (pagevec_count(pvec)) - __pagevec_free(pvec); -} - static inline void __pagevec_lru_add_anon(struct pagevec *pvec) { ____pagevec_lru_add(pvec, LRU_INACTIVE_ANON); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cd0c95c..6c77efb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2319,16 +2319,6 @@ unsigned long get_zeroed_page(gfp_t gfp_mask) } EXPORT_SYMBOL(get_zeroed_page); -void __pagevec_free(struct pagevec *pvec) -{ - int i = pagevec_count(pvec); - - while (--i >= 0) { - trace_mm_pagevec_free(pvec->pages[i], pvec->cold); - free_hot_cold_page(pvec->pages[i], pvec->cold); - } -} - void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { -- cgit v0.10.2 From b413d48aa70605701c0b395b2e350ca15f5d643a Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 10 Jan 2012 15:07:09 -0800 Subject: mm-tracepoint: rename page-free events Rename mm_page_free_direct into mm_page_free and mm_pagevec_free into mm_page_free_batched Since v2.6.33-5426-gc475dab the kernel triggers mm_page_free_direct for all freed pages, not only for directly freed. So, let's name it properly. 
For pages freed via page-list we also trigger mm_page_free_batched event. Signed-off-by: Konstantin Khlebnikov Cc: Mel Gorman Cc: KOSAKI Motohiro Reviewed-by: Minchan Kim Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/trace/events-kmem.txt b/Documentation/trace/events-kmem.txt index aa82ee4..1948004 100644 --- a/Documentation/trace/events-kmem.txt +++ b/Documentation/trace/events-kmem.txt @@ -40,8 +40,8 @@ but the call_site can usually be used to extrapolate that information. ================== mm_page_alloc page=%p pfn=%lu order=%d migratetype=%d gfp_flags=%s mm_page_alloc_zone_locked page=%p pfn=%lu order=%u migratetype=%d cpu=%d percpu_refill=%d -mm_page_free_direct page=%p pfn=%lu order=%d -mm_pagevec_free page=%p pfn=%lu order=%d cold=%d +mm_page_free page=%p pfn=%lu order=%d +mm_page_free_batched page=%p pfn=%lu order=%d cold=%d These four events deal with page allocation and freeing. mm_page_alloc is a simple indicator of page allocator activity. Pages may be allocated from @@ -53,13 +53,13 @@ amounts of activity imply high activity on the zone->lock. Taking this lock impairs performance by disabling interrupts, dirtying cache lines between CPUs and serialising many CPUs. -When a page is freed directly by the caller, the mm_page_free_direct event +When a page is freed directly by the caller, the only mm_page_free event is triggered. Significant amounts of activity here could indicate that the callers should be batching their activities. -When pages are freed using a pagevec, the mm_pagevec_free is -triggered. Broadly speaking, pages are taken off the LRU lock in bulk and -freed in batch with a pagevec. Significant amounts of activity here could +When pages are freed in batch, the also mm_page_free_batched is triggered. +Broadly speaking, pages are taken off the LRU lock in bulk and +freed in batch with a page list. 
Significant amounts of activity here could indicate that the system is under memory pressure and can also indicate contention on the zone->lru_lock. diff --git a/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl b/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl index 7df50e8..0a120aa 100644 --- a/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl +++ b/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl @@ -17,8 +17,8 @@ use Getopt::Long; # Tracepoint events use constant MM_PAGE_ALLOC => 1; -use constant MM_PAGE_FREE_DIRECT => 2; -use constant MM_PAGEVEC_FREE => 3; +use constant MM_PAGE_FREE => 2; +use constant MM_PAGE_FREE_BATCHED => 3; use constant MM_PAGE_PCPU_DRAIN => 4; use constant MM_PAGE_ALLOC_ZONE_LOCKED => 5; use constant MM_PAGE_ALLOC_EXTFRAG => 6; @@ -223,10 +223,10 @@ EVENT_PROCESS: # Perl Switch() sucks majorly if ($tracepoint eq "mm_page_alloc") { $perprocesspid{$process_pid}->{MM_PAGE_ALLOC}++; - } elsif ($tracepoint eq "mm_page_free_direct") { - $perprocesspid{$process_pid}->{MM_PAGE_FREE_DIRECT}++; - } elsif ($tracepoint eq "mm_pagevec_free") { - $perprocesspid{$process_pid}->{MM_PAGEVEC_FREE}++; + } elsif ($tracepoint eq "mm_page_free") { + $perprocesspid{$process_pid}->{MM_PAGE_FREE}++ + } elsif ($tracepoint eq "mm_page_free_batched") { + $perprocesspid{$process_pid}->{MM_PAGE_FREE_BATCHED}++; } elsif ($tracepoint eq "mm_page_pcpu_drain") { $perprocesspid{$process_pid}->{MM_PAGE_PCPU_DRAIN}++; $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_DRAINED}++; @@ -336,8 +336,8 @@ sub dump_stats { $process_pid, $stats{$process_pid}->{MM_PAGE_ALLOC}, $stats{$process_pid}->{MM_PAGE_ALLOC_ZONE_LOCKED}, - $stats{$process_pid}->{MM_PAGE_FREE_DIRECT}, - $stats{$process_pid}->{MM_PAGEVEC_FREE}, + $stats{$process_pid}->{MM_PAGE_FREE}, + $stats{$process_pid}->{MM_PAGE_FREE_BATCHED}, $stats{$process_pid}->{MM_PAGE_PCPU_DRAIN}, $stats{$process_pid}->{HIGH_PCPU_DRAINS}, $stats{$process_pid}->{HIGH_PCPU_REFILLS}, @@ 
-364,8 +364,8 @@ sub aggregate_perprocesspid() { $perprocess{$process}->{MM_PAGE_ALLOC} += $perprocesspid{$process_pid}->{MM_PAGE_ALLOC}; $perprocess{$process}->{MM_PAGE_ALLOC_ZONE_LOCKED} += $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_ZONE_LOCKED}; - $perprocess{$process}->{MM_PAGE_FREE_DIRECT} += $perprocesspid{$process_pid}->{MM_PAGE_FREE_DIRECT}; - $perprocess{$process}->{MM_PAGEVEC_FREE} += $perprocesspid{$process_pid}->{MM_PAGEVEC_FREE}; + $perprocess{$process}->{MM_PAGE_FREE} += $perprocesspid{$process_pid}->{MM_PAGE_FREE}; + $perprocess{$process}->{MM_PAGE_FREE_BATCHED} += $perprocesspid{$process_pid}->{MM_PAGE_FREE_BATCHED}; $perprocess{$process}->{MM_PAGE_PCPU_DRAIN} += $perprocesspid{$process_pid}->{MM_PAGE_PCPU_DRAIN}; $perprocess{$process}->{HIGH_PCPU_DRAINS} += $perprocesspid{$process_pid}->{HIGH_PCPU_DRAINS}; $perprocess{$process}->{HIGH_PCPU_REFILLS} += $perprocesspid{$process_pid}->{HIGH_PCPU_REFILLS}; diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index a9c87ad..5f889f1 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -147,7 +147,7 @@ DEFINE_EVENT(kmem_free, kmem_cache_free, TP_ARGS(call_site, ptr) ); -TRACE_EVENT(mm_page_free_direct, +TRACE_EVENT(mm_page_free, TP_PROTO(struct page *page, unsigned int order), @@ -169,7 +169,7 @@ TRACE_EVENT(mm_page_free_direct, __entry->order) ); -TRACE_EVENT(mm_pagevec_free, +TRACE_EVENT(mm_page_free_batched, TP_PROTO(struct page *page, int cold), diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6c77efb..516ab62 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -632,7 +632,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order) int i; int bad = 0; - trace_mm_page_free_direct(page, order); + trace_mm_page_free(page, order); kmemcheck_free_shadow(page, order); if (PageAnon(page)) @@ -1196,7 +1196,7 @@ void free_hot_cold_page_list(struct list_head *list, int cold) struct page *page, *next; list_for_each_entry_safe(page, next, list, lru) 
{ - trace_mm_pagevec_free(page, cold); + trace_mm_page_free_batched(page, cold); free_hot_cold_page(page, cold); } } -- cgit v0.10.2 From 90a5d5af74f6570af063fb6bff33c6b2f8361bbc Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 10 Jan 2012 15:07:10 -0800 Subject: mm-tracepoint: fix documentation and examples We renamed the page-free mm tracepoints. Signed-off-by: Konstantin Khlebnikov Cc: Mel Gorman Cc: KOSAKI Motohiro Reviewed-by: Minchan Kim Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/trace/tracepoint-analysis.txt b/Documentation/trace/tracepoint-analysis.txt index 87bee3c..058cc6c 100644 --- a/Documentation/trace/tracepoint-analysis.txt +++ b/Documentation/trace/tracepoint-analysis.txt @@ -93,14 +93,14 @@ By specifying the -a switch and analysing sleep, the system-wide events for a duration of time can be examined. $ perf stat -a \ - -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ - -e kmem:mm_pagevec_free \ + -e kmem:mm_page_alloc -e kmem:mm_page_free \ + -e kmem:mm_page_free_batched \ sleep 10 Performance counter stats for 'sleep 10': 9630 kmem:mm_page_alloc - 2143 kmem:mm_page_free_direct - 7424 kmem:mm_pagevec_free + 2143 kmem:mm_page_free + 7424 kmem:mm_page_free_batched 10.002577764 seconds time elapsed @@ -119,15 +119,15 @@ basis using set_ftrace_pid. Events can be activated and tracked for the duration of a process on a local basis using PCL such as follows. - $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ - -e kmem:mm_pagevec_free ./hackbench 10 + $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free \ + -e kmem:mm_page_free_batched ./hackbench 10 Time: 0.909 Performance counter stats for './hackbench 10': 17803 kmem:mm_page_alloc - 12398 kmem:mm_page_free_direct - 4827 kmem:mm_pagevec_free + 12398 kmem:mm_page_free + 4827 kmem:mm_page_free_batched 0.973913387 seconds time elapsed @@ -146,8 +146,8 @@ to know what the standard deviation is. 
By and large, this is left to the performance analyst to do it by hand. In the event that the discrete event occurrences are useful to the performance analyst, then perf can be used. - $ perf stat --repeat 5 -e kmem:mm_page_alloc -e kmem:mm_page_free_direct - -e kmem:mm_pagevec_free ./hackbench 10 + $ perf stat --repeat 5 -e kmem:mm_page_alloc -e kmem:mm_page_free + -e kmem:mm_page_free_batched ./hackbench 10 Time: 0.890 Time: 0.895 Time: 0.915 @@ -157,8 +157,8 @@ occurrences are useful to the performance analyst, then perf can be used. Performance counter stats for './hackbench 10' (5 runs): 16630 kmem:mm_page_alloc ( +- 3.542% ) - 11486 kmem:mm_page_free_direct ( +- 4.771% ) - 4730 kmem:mm_pagevec_free ( +- 2.325% ) + 11486 kmem:mm_page_free ( +- 4.771% ) + 4730 kmem:mm_page_free_batched ( +- 2.325% ) 0.982653002 seconds time elapsed ( +- 1.448% ) @@ -168,15 +168,15 @@ aggregation of discrete events, then a script would need to be developed. Using --repeat, it is also possible to view how events are fluctuating over time on a system-wide basis using -a and sleep. - $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ - -e kmem:mm_pagevec_free \ + $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free \ + -e kmem:mm_page_free_batched \ -a --repeat 10 \ sleep 1 Performance counter stats for 'sleep 1' (10 runs): 1066 kmem:mm_page_alloc ( +- 26.148% ) - 182 kmem:mm_page_free_direct ( +- 5.464% ) - 890 kmem:mm_pagevec_free ( +- 30.079% ) + 182 kmem:mm_page_free ( +- 5.464% ) + 890 kmem:mm_page_free_batched ( +- 30.079% ) 1.002251757 seconds time elapsed ( +- 0.005% ) @@ -220,8 +220,8 @@ were generating events within the kernel. To begin this sort of analysis, the data must be recorded. 
At the time of writing, this required root: $ perf record -c 1 \ - -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ - -e kmem:mm_pagevec_free \ + -e kmem:mm_page_alloc -e kmem:mm_page_free \ + -e kmem:mm_page_free_batched \ ./hackbench 10 Time: 0.894 [ perf record: Captured and wrote 0.733 MB perf.data (~32010 samples) ] @@ -260,8 +260,8 @@ noticed that X was generating an insane amount of page allocations so let's look at it: $ perf record -c 1 -f \ - -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ - -e kmem:mm_pagevec_free \ + -e kmem:mm_page_alloc -e kmem:mm_page_free \ + -e kmem:mm_page_free_batched \ -p `pidof X` This was interrupted after a few seconds and diff --git a/tools/perf/Documentation/examples.txt b/tools/perf/Documentation/examples.txt index 8eb6c48..77f9527 100644 --- a/tools/perf/Documentation/examples.txt +++ b/tools/perf/Documentation/examples.txt @@ -17,8 +17,8 @@ titan:~> perf list kmem:kmem_cache_alloc_node [Tracepoint event] kmem:kfree [Tracepoint event] kmem:kmem_cache_free [Tracepoint event] - kmem:mm_page_free_direct [Tracepoint event] - kmem:mm_pagevec_free [Tracepoint event] + kmem:mm_page_free [Tracepoint event] + kmem:mm_page_free_batched [Tracepoint event] kmem:mm_page_alloc [Tracepoint event] kmem:mm_page_alloc_zone_locked [Tracepoint event] kmem:mm_page_pcpu_drain [Tracepoint event] @@ -29,15 +29,15 @@ measured. 
For example the page alloc/free properties of a 'hackbench run' are: titan:~> perf stat -e kmem:mm_page_pcpu_drain -e kmem:mm_page_alloc - -e kmem:mm_pagevec_free -e kmem:mm_page_free_direct ./hackbench 10 + -e kmem:mm_page_free_batched -e kmem:mm_page_free ./hackbench 10 Time: 0.575 Performance counter stats for './hackbench 10': 13857 kmem:mm_page_pcpu_drain 27576 kmem:mm_page_alloc - 6025 kmem:mm_pagevec_free - 20934 kmem:mm_page_free_direct + 6025 kmem:mm_page_free_batched + 20934 kmem:mm_page_free 0.613972165 seconds time elapsed @@ -45,8 +45,8 @@ You can observe the statistical properties as well, by using the 'repeat the workload N times' feature of perf stat: titan:~> perf stat --repeat 5 -e kmem:mm_page_pcpu_drain -e - kmem:mm_page_alloc -e kmem:mm_pagevec_free -e - kmem:mm_page_free_direct ./hackbench 10 + kmem:mm_page_alloc -e kmem:mm_page_free_batched -e + kmem:mm_page_free ./hackbench 10 Time: 0.627 Time: 0.644 Time: 0.564 @@ -57,8 +57,8 @@ You can observe the statistical properties as well, by using the 12920 kmem:mm_page_pcpu_drain ( +- 3.359% ) 25035 kmem:mm_page_alloc ( +- 3.783% ) - 6104 kmem:mm_pagevec_free ( +- 0.934% ) - 18376 kmem:mm_page_free_direct ( +- 4.941% ) + 6104 kmem:mm_page_free_batched ( +- 0.934% ) + 18376 kmem:mm_page_free ( +- 4.941% ) 0.643954516 seconds time elapsed ( +- 2.363% ) @@ -158,15 +158,15 @@ Or you can observe the whole system's page allocations for 10 seconds: titan:~/git> perf stat -a -e kmem:mm_page_pcpu_drain -e -kmem:mm_page_alloc -e kmem:mm_pagevec_free -e -kmem:mm_page_free_direct sleep 10 +kmem:mm_page_alloc -e kmem:mm_page_free_batched -e +kmem:mm_page_free sleep 10 Performance counter stats for 'sleep 10': 171585 kmem:mm_page_pcpu_drain 322114 kmem:mm_page_alloc - 73623 kmem:mm_pagevec_free - 254115 kmem:mm_page_free_direct + 73623 kmem:mm_page_free_batched + 254115 kmem:mm_page_free 10.000591410 seconds time elapsed @@ -174,15 +174,15 @@ Or observe how fluctuating the page allocations are, via statistical 
analysis done over ten 1-second intervals: titan:~/git> perf stat --repeat 10 -a -e kmem:mm_page_pcpu_drain -e - kmem:mm_page_alloc -e kmem:mm_pagevec_free -e - kmem:mm_page_free_direct sleep 1 + kmem:mm_page_alloc -e kmem:mm_page_free_batched -e + kmem:mm_page_free sleep 1 Performance counter stats for 'sleep 1' (10 runs): 17254 kmem:mm_page_pcpu_drain ( +- 3.709% ) 34394 kmem:mm_page_alloc ( +- 4.617% ) - 7509 kmem:mm_pagevec_free ( +- 4.820% ) - 25653 kmem:mm_page_free_direct ( +- 3.672% ) + 7509 kmem:mm_page_free_batched ( +- 4.820% ) + 25653 kmem:mm_page_free ( +- 3.672% ) 1.058135029 seconds time elapsed ( +- 3.089% ) -- cgit v0.10.2 From 937a94c9db30a818baa5e2c09dbf4589251355c3 Mon Sep 17 00:00:00 2001 From: Jacobo Giralt Date: Tue, 10 Jan 2012 15:07:11 -0800 Subject: mm: migrate: one less atomic operation migrate_page_move_mapping() drops a reference from the old page after unfreezing its counter. Both operations can be merged into a single atomic operation by directly unfreezing to one less reference. The same applies to migrate_huge_page_move_mapping(). Signed-off-by: Jacobo Giralt Cc: Mel Gorman Cc: Minchan Kim Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/migrate.c b/mm/migrate.c index 177aca4..594dc37 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -269,12 +269,12 @@ static int migrate_page_move_mapping(struct address_space *mapping, radix_tree_replace_slot(pslot, newpage); - page_unfreeze_refs(page, expected_count); /* - * Drop cache reference from old page. + * Drop cache reference from old page by unfreezing + * to one less reference. * We know this isn't the last reference. 
*/ - __put_page(page); + page_unfreeze_refs(page, expected_count - 1); /* * If moved to a different zone then also account @@ -334,9 +334,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, radix_tree_replace_slot(pslot, newpage); - page_unfreeze_refs(page, expected_count); - - __put_page(page); + page_unfreeze_refs(page, expected_count - 1); spin_unlock_irq(&mapping->tree_lock); return 0; -- cgit v0.10.2 From 938929f14cb595f43cd1a4e63e22d36cab1e4a1f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 10 Jan 2012 15:07:14 -0800 Subject: mm: reduce the amount of work done when updating min_free_kbytes When min_free_kbytes is updated, some pageblocks are marked MIGRATE_RESERVE. Ordinarily, this work is unnoticeable as it happens early in boot but on large machines with 1TB of memory, this has been reported to delay boot times, probably due to the NUMA distances involved. The bulk of the work is due to calling pageblock_is_reserved() an unnecessary number of times and accessing far more struct page metadata than is necessary. This patch significantly reduces the amount of work done by setup_zone_migrate_reserve() improving boot times on 1TB machines. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 516ab62..671e6c9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3388,25 +3388,33 @@ static void setup_zone_migrate_reserve(struct zone *zone) if (page_to_nid(page) != zone_to_nid(zone)) continue; - /* Blocks with reserved pages will never free, skip them.
*/ - block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); - if (pageblock_is_reserved(pfn, block_end_pfn)) - continue; - block_migratetype = get_pageblock_migratetype(page); - /* If this block is reserved, account for it */ - if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { - reserve--; - continue; - } + /* Only test what is necessary when the reserves are not met */ + if (reserve > 0) { + /* + * Blocks with reserved pages will never free, skip + * them. + */ + block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); + if (pageblock_is_reserved(pfn, block_end_pfn)) + continue; - /* Suitable for reserving if this block is movable */ - if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { - set_pageblock_migratetype(page, MIGRATE_RESERVE); - move_freepages_block(zone, page, MIGRATE_RESERVE); - reserve--; - continue; + /* If this block is reserved, account for it */ + if (block_migratetype == MIGRATE_RESERVE) { + reserve--; + continue; + } + + /* Suitable for reserving if this block is movable */ + if (block_migratetype == MIGRATE_MOVABLE) { + set_pageblock_migratetype(page, + MIGRATE_RESERVE); + move_freepages_block(zone, page, + MIGRATE_RESERVE); + reserve--; + continue; + } } /* -- cgit v0.10.2 From f90ac3982a78d36f894824636beeef13361d7c59 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 10 Jan 2012 15:07:15 -0800 Subject: mm: avoid livelock on !__GFP_FS allocations Colin Cross reported; Under the following conditions, __alloc_pages_slowpath can loop forever: gfp_mask & __GFP_WAIT is true gfp_mask & __GFP_FS is false reclaim and compaction make no progress order <= PAGE_ALLOC_COSTLY_ORDER These conditions happen very often during suspend and resume, when pm_restrict_gfp_mask() effectively converts all GFP_KERNEL allocations into __GFP_WAIT. The oom killer is not run because gfp_mask & __GFP_FS is false, but should_alloc_retry will always return true when order is less than PAGE_ALLOC_COSTLY_ORDER. 
In his fix, he avoided retrying the allocation if reclaim made no progress and __GFP_FS was not set. The problem is that this would result in GFP_NOIO allocations failing that previously succeeded which would be very unfortunate. The big difference between GFP_NOIO and suspend converting GFP_KERNEL to behave like GFP_NOIO is that normally flushers will be cleaning pages and kswapd reclaims pages allowing GFP_NOIO to succeed after a short delay. The same does not necessarily apply during suspend as the storage device may be suspended. This patch special cases the suspend case to fail the page allocation if reclaim cannot make progress and adds some documentation on how gfp_allowed_mask is currently used. Failing allocations like this may cause suspend to abort but that is better than a livelock. [mgorman@suse.de: Rework fix to be suspend specific] [rientjes@google.com: Move suspended device check to should_alloc_retry] Reported-by: Colin Cross Signed-off-by: Mel Gorman Acked-by: David Rientjes Cc: Minchan Kim Cc: Pekka Enberg Cc: KAMEZAWA Hiroyuki Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 6562958..91812df 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -368,9 +368,25 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(void); void drain_local_pages(void *dummy); +/* + * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what + * GFP flags are used before interrupts are enabled. Once interrupts are + * enabled, it is set to __GFP_BITS_MASK while the system is running. During + * hibernation, it is used by PM to avoid I/O during memory allocation while + * devices are suspended. 
+ */ extern gfp_t gfp_allowed_mask; extern void pm_restrict_gfp_mask(void); extern void pm_restore_gfp_mask(void); +#ifdef CONFIG_PM_SLEEP +extern bool pm_suspended_storage(void); +#else +static inline bool pm_suspended_storage(void) +{ + return false; +} +#endif /* CONFIG_PM_SLEEP */ + #endif /* __LINUX_GFP_H */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 671e6c9..3cba4b6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -127,6 +127,13 @@ void pm_restrict_gfp_mask(void) saved_gfp_mask = gfp_allowed_mask; gfp_allowed_mask &= ~GFP_IOFS; } + +bool pm_suspended_storage(void) +{ + if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) + return false; + return true; +} #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE @@ -1786,12 +1793,25 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) static inline int should_alloc_retry(gfp_t gfp_mask, unsigned int order, + unsigned long did_some_progress, unsigned long pages_reclaimed) { /* Do not loop if specifically requested */ if (gfp_mask & __GFP_NORETRY) return 0; + /* Always retry if specifically requested */ + if (gfp_mask & __GFP_NOFAIL) + return 1; + + /* + * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim + * making forward progress without invoking OOM. Suspend also disables + * storage devices so kswapd will not help. Bail if we are suspending. + */ + if (!did_some_progress && pm_suspended_storage()) + return 0; + /* * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER * means __GFP_NOFAIL, but that may not be true in other @@ -1810,13 +1830,6 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order, if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) return 1; - /* - * Don't let big-order allocations loop unless the caller - * explicitly requests that. 
- */ - if (gfp_mask & __GFP_NOFAIL) - return 1; - return 0; } @@ -2209,7 +2222,8 @@ rebalance: /* Check if we should retry the allocation */ pages_reclaimed += did_some_progress; - if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { + if (should_alloc_retry(gfp_mask, order, did_some_progress, + pages_reclaimed)) { /* Wait for some write requests to complete then retry */ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); goto rebalance; diff --git a/mm/swapfile.c b/mm/swapfile.c index b1cd120..9520592 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -667,10 +667,10 @@ int try_to_free_swap(struct page *page) * original page might be freed under memory pressure, then * later read back in from swap, now with the wrong data. * - * Hibernation clears bits from gfp_allowed_mask to prevent - * memory reclaim from writing to disk, so check that here. + * Hibration suspends storage while it is writing the image + * to disk so check that here. */ - if (!(gfp_allowed_mask & __GFP_IO)) + if (pm_suspended_storage()) return 0; delete_from_swap_cache(page); -- cgit v0.10.2 From 5f8aefd44e64ed2f6950a1dcc77309b7dd9979f4 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 10 Jan 2012 15:07:18 -0800 Subject: mm: account reaped page cache on inode cache pruning Inode cache pruning indirectly reclaims page-cache by invalidating mapping pages. Let's account them into reclaim-state to notice this progress in memory reclaimer. 
Signed-off-by: Konstantin Khlebnikov Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/inode.c b/fs/inode.c index 8753575..4fa4f09 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -776,6 +776,8 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan) else __count_vm_events(PGINODESTEAL, reap); spin_unlock(&sb->s_inode_lru_lock); + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += reap; dispose_list(&freeable); } -- cgit v0.10.2 From a734bcc812146cfba530e1adaf609fce1357982e Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Tue, 10 Jan 2012 15:07:20 -0800 Subject: hugetlb: detect race upon page allocation failure during COW Currently we are not rechecking pte_same in hugetlb_cow after we take ptl lock again in the page allocation failure code path and simply retry again. This is not an issue at the moment because hugetlb fault path is protected by hugetlb_instantiation_mutex so we cannot race. The original page is locked and so we cannot race even with the page migration. Let's add the pte_same check anyway as we want to be consistent with the other check later in this function and be safe if we ever remove the mutex. [mhocko@suse.cz: reworded the changelog] Signed-off-by: Hillf Danton Signed-off-by: Michal Hocko Cc: Andrea Arcangeli Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7acd125..2c551b2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2408,7 +2408,14 @@ retry_avoidcopy: BUG_ON(page_count(old_page) != 1); BUG_ON(huge_pte_none(pte)); spin_lock(&mm->page_table_lock); - goto retry_avoidcopy; + ptep = huge_pte_offset(mm, address & huge_page_mask(h)); + if (likely(pte_same(huge_ptep_get(ptep), pte))) + goto retry_avoidcopy; + /* + * race occurs while re-acquiring page_table_lock, and + * our job is done. 
+ */ + return 0; } WARN_ON_ONCE(1); } -- cgit v0.10.2 From ef009b25f4f8a77d2b32067d424d5ac757dcdc5b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Jan 2012 15:07:21 -0800 Subject: hugetlb: clarify hugetlb_instantiation_mutex usage Let's make it clear that we cannot race with other fault handlers due to the hugetlb (global) mutex. Also make it clear that we want to keep the pte_same checks anyway to make a transition away from the global mutex easier. Signed-off-by: Michal Hocko Cc: Hillf Danton Cc: Andrea Arcangeli Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2c551b2..49e693b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2349,6 +2349,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, /* * Hugetlb_cow() should be called with page lock of the original hugepage held. + * Called with hugetlb_instantiation_mutex held and pte_page locked so we + * cannot race with other handlers or page migration. + * Keep the pte_same checks anyway to make transition from the mutex easier. */ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte, -- cgit v0.10.2 From 1e16a539ac16e7b3a8c2cee188897d4bdb88e6e8 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 10 Jan 2012 15:07:22 -0800 Subject: mm/hugetlb.c: fix virtual address handling in hugetlb fault handle_mm_fault() passes the 'faulted' address to hugetlb_fault(). This address is not aligned to a hugepage boundary. Most of the functions for hugetlb pages are aware of that and calculate an alignment themselves. However some functions such as copy_user_huge_page() and clear_huge_page() don't handle alignment by themselves. This patch makes hugetlb_fault() fix the alignment and pass an aligned address (the address of the faulted hugepage) to those functions.
[akpm@linux-foundation.org: use &=] Signed-off-by: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 49e693b..ab89d6f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2640,6 +2640,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, static DEFINE_MUTEX(hugetlb_instantiation_mutex); struct hstate *h = hstate_vma(vma); + address &= huge_page_mask(h); + ptep = huge_pte_offset(mm, address); if (ptep) { entry = huge_ptep_get(ptep); -- cgit v0.10.2 From 1399ff86f2a2bbacbbe68fa00c5f8c752b344723 Mon Sep 17 00:00:00 2001 From: David Daney Date: Tue, 10 Jan 2012 15:07:25 -0800 Subject: kernel.h: add BUILD_BUG() macro We can place this in definitions that we expect the compiler to remove by dead code elimination. If this assertion fails, we get a nice error message at build time. The GCC function attribute error("message") was added in version 4.3, so we define a new macro __linktime_error(message) to expand to this for GCC-4.3 and later. This will give us an error diagnostic from the compiler on the line that fails. For other compilers __linktime_error(message) expands to nothing, and we have to be content with a link time error, but at least we will still get a build error. BUILD_BUG() expands to the undefined function __build_bug_failed() and will fail at link time if the compiler ever emits code for it. On GCC-4.3 and later, attribute((error())) is used so that the failure will be noted at compile time instead. 
Signed-off-by: David Daney Acked-by: David Rientjes Cc: DM Cc: Ralf Baechle Acked-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h index dfadc96..2f40791 100644 --- a/include/linux/compiler-gcc4.h +++ b/include/linux/compiler-gcc4.h @@ -29,6 +29,7 @@ the kernel context */ #define __cold __attribute__((__cold__)) +#define __linktime_error(message) __attribute__((__error__(message))) #if __GNUC_MINOR__ >= 5 /* diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 320d6c9..4a24354 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -293,7 +293,9 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); #ifndef __compiletime_error # define __compiletime_error(message) #endif - +#ifndef __linktime_error +# define __linktime_error(message) +#endif /* * Prevent the compiler from merging or refetching accesses. The compiler * is also forbidden from reordering successive instances of ACCESS_ONCE(), diff --git a/include/linux/kernel.h b/include/linux/kernel.h index e8b1597..f48e8a5 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -665,6 +665,7 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } #define BUILD_BUG_ON_ZERO(e) (0) #define BUILD_BUG_ON_NULL(e) ((void*)0) #define BUILD_BUG_ON(condition) +#define BUILD_BUG() (0) #else /* __CHECKER__ */ /* Force a compilation error if a constant expression is not a power of 2 */ @@ -703,6 +704,21 @@ extern int __build_bug_on_failed; if (condition) __build_bug_on_failed = 1; \ } while(0) #endif + +/** + * BUILD_BUG - break compile if used. + * + * If you have some code that you expect the compiler to eliminate at + * build time, you should use BUILD_BUG to detect if it is + * unexpectedly used. 
+ */ +#define BUILD_BUG() \ + do { \ + extern void __build_bug_failed(void) \ + __linktime_error("BUILD_BUG failed"); \ + __build_bug_failed(); \ + } while (0) + #endif /* __CHECKER__ */ /* Trap pasters of __FUNCTION__ at compile-time */ -- cgit v0.10.2 From c0a32fc5a2e470d0b02597b23ad79a317735253e Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 10 Jan 2012 15:07:28 -0800 Subject: mm: more intensive memory corruption debugging With CONFIG_DEBUG_PAGEALLOC configured, the CPU will generate an exception on access (read,write) to an unallocated page, which permits us to catch code which corrupts memory. However the kernel is trying to maximise memory usage, hence there are usually few free pages in the system and buggy code usually corrupts some crucial data. This patch changes the buddy allocator to keep more free/protected pages and to interlace free/protected and allocated pages to increase the probability of catching corruption. When the kernel is compiled with CONFIG_DEBUG_PAGEALLOC, debug_guardpage_minorder defines the minimum order used by the page allocator to grant a request. The requested size will be returned with the remaining pages used as guard pages. The default value of debug_guardpage_minorder is zero: no change from current behaviour. [akpm@linux-foundation.org: tweak documentation, s/flg/flag/] Signed-off-by: Stanislaw Gruszka Cc: Mel Gorman Cc: Andrea Arcangeli Cc: "Rafael J. Wysocki" Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 7b2e5c5..7ed7030 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -623,6 +623,25 @@ bytes respectively. Such letter suffixes can also be entirely omitted. 
no_debug_objects [KNL] Disable object debugging + debug_guardpage_minorder= + [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this + parameter allows control of the order of pages that will + be intentionally kept free (and hence protected) by the + buddy allocator. Bigger value increase the probability + of catching random memory corruption, but reduce the + amount of memory for normal system use. The maximum + possible value is MAX_ORDER/2. Setting this parameter + to 1 or 2 should be enough to identify most random + memory corruption problems caused by bugs in kernel or + driver code when a CPU writes to (or reads from) a + random memory location. Note that there exists a class + of memory corruptions problems caused by buggy H/W or + F/W or by drivers badly programing DMA (basically when + memory is written at bus level and the CPU MMU is + bypassed) which are not detectable by + CONFIG_DEBUG_PAGEALLOC, hence this option will not help + tracking down these problems. + debugpat [X86] Enable PAT debugging decnet.addr= [HW,NET] diff --git a/include/linux/mm.h b/include/linux/mm.h index 5d9b4c9..5568553 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1618,5 +1618,22 @@ extern void copy_user_huge_page(struct page *dst, struct page *src, unsigned int pages_per_huge_page); #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ +#ifdef CONFIG_DEBUG_PAGEALLOC +extern unsigned int _debug_guardpage_minorder; + +static inline unsigned int debug_guardpage_minorder(void) +{ + return _debug_guardpage_minorder; +} + +static inline bool page_is_guard(struct page *page) +{ + return test_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); +} +#else +static inline unsigned int debug_guardpage_minorder(void) { return 0; } +static inline bool page_is_guard(struct page *page) { return false; } +#endif /* CONFIG_DEBUG_PAGEALLOC */ + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/page-debug-flags.h b/include/linux/page-debug-flags.h index 
b0638fd..22691f61 100644 --- a/include/linux/page-debug-flags.h +++ b/include/linux/page-debug-flags.h @@ -13,6 +13,7 @@ enum page_debug_flags { PAGE_DEBUG_FLAG_POISON, /* Page is poisoned */ + PAGE_DEBUG_FLAG_GUARD, }; /* @@ -21,7 +22,8 @@ enum page_debug_flags { */ #ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS -#if !defined(CONFIG_PAGE_POISONING) \ +#if !defined(CONFIG_PAGE_POISONING) && \ + !defined(CONFIG_PAGE_GUARD) \ /* && !defined(CONFIG_PAGE_DEBUG_SOMETHING_ELSE) && ... */ #error WANT_PAGE_DEBUG_FLAGS is turned on with no debug features! #endif diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 8b1a477..4b24432 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -4,6 +4,7 @@ config DEBUG_PAGEALLOC depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC depends on !KMEMCHECK select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC + select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC ---help--- Unmap pages from the kernel linear mapping after free_pages(). This results in a large slowdown, but helps to find certain types @@ -22,3 +23,7 @@ config WANT_PAGE_DEBUG_FLAGS config PAGE_POISONING bool select WANT_PAGE_DEBUG_FLAGS + +config PAGE_GUARD + bool + select WANT_PAGE_DEBUG_FLAGS diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3cba4b6..93baebc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include @@ -388,6 +389,37 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) clear_highpage(page + i); } +#ifdef CONFIG_DEBUG_PAGEALLOC +unsigned int _debug_guardpage_minorder; + +static int __init debug_guardpage_minorder_setup(char *buf) +{ + unsigned long res; + + if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { + printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); + return 0; + } + _debug_guardpage_minorder = res; + printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); + return 0; +} +__setup("debug_guardpage_minorder=", 
debug_guardpage_minorder_setup); + +static inline void set_page_guard_flag(struct page *page) +{ + __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); +} + +static inline void clear_page_guard_flag(struct page *page) +{ + __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); +} +#else +static inline void set_page_guard_flag(struct page *page) { } +static inline void clear_page_guard_flag(struct page *page) { } +#endif + static inline void set_page_order(struct page *page, int order) { set_page_private(page, order); @@ -445,6 +477,11 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, if (page_zone_id(page) != page_zone_id(buddy)) return 0; + if (page_is_guard(buddy) && page_order(buddy) == order) { + VM_BUG_ON(page_count(buddy) != 0); + return 1; + } + if (PageBuddy(buddy) && page_order(buddy) == order) { VM_BUG_ON(page_count(buddy) != 0); return 1; @@ -501,11 +538,19 @@ static inline void __free_one_page(struct page *page, buddy = page + (buddy_idx - page_idx); if (!page_is_buddy(page, buddy, order)) break; - - /* Our buddy is free, merge with it and move up one order. */ - list_del(&buddy->lru); - zone->free_area[order].nr_free--; - rmv_page_order(buddy); + /* + * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, + * merge with it and move up one order. 
+ */ + if (page_is_guard(buddy)) { + clear_page_guard_flag(buddy); + set_page_private(page, 0); + __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); + } else { + list_del(&buddy->lru); + zone->free_area[order].nr_free--; + rmv_page_order(buddy); + } combined_idx = buddy_idx & page_idx; page = page + (combined_idx - page_idx); page_idx = combined_idx; @@ -731,6 +776,23 @@ static inline void expand(struct zone *zone, struct page *page, high--; size >>= 1; VM_BUG_ON(bad_range(zone, &page[size])); + +#ifdef CONFIG_DEBUG_PAGEALLOC + if (high < debug_guardpage_minorder()) { + /* + * Mark as guard pages (or page), that will allow to + * merge back to allocator when buddy will be freed. + * Corresponding page table entries will not be touched, + * pages will stay not present in virtual address space + */ + INIT_LIST_HEAD(&page[size].lru); + set_page_guard_flag(&page[size]); + set_page_private(&page[size], high); + /* Guard pages are not available for any usage */ + __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); + continue; + } +#endif list_add(&page[size].lru, &area->free_list[migratetype]); area->nr_free++; set_page_order(&page[size], high); @@ -1754,7 +1816,8 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) { unsigned int filter = SHOW_MEM_FILTER_NODES; - if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) + if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || + debug_guardpage_minorder() > 0) return; /* -- cgit v0.10.2 From c6968e73b90c2a2fb9a32d4bad249f8f70f70125 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 10 Jan 2012 15:07:31 -0800 Subject: PM/Hibernate: do not count debug pages as savable When debugging with CONFIG_DEBUG_PAGEALLOC and debug_guardpage_minorder > 0, we have a lot of free pages that are not marked as such. The snapshot code accounts them as savable, which causes hibernate memory preallocation to fail. It is pretty hard to make hibernate allocation succeed with debug_guardpage_minorder=1.
This change at least makes it possible when the system has a relatively large amount of RAM. Signed-off-by: Stanislaw Gruszka Acked-by: Rafael J. Wysocki Cc: Andrea Arcangeli Cc: Christoph Lameter Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index cbe2c14..1cf8890 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -858,6 +858,9 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) PageReserved(page)) return NULL; + if (page_is_guard(page)) + return NULL; + return page; } @@ -920,6 +923,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) && (!kernel_page_present(page) || pfn_is_nosave(pfn))) return NULL; + if (page_is_guard(page)) + return NULL; + return page; } -- cgit v0.10.2 From fc8d8620d39dbbaf412b1b9247d77d196d92adb9 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 10 Jan 2012 15:07:32 -0800 Subject: slub: min order when debug_guardpage_minorder > 0 Disable slub debug facilities and allocate slabs at minimal order when debug_guardpage_minorder > 0 to increase the probability of catching random memory corruption by cpu exception. Signed-off-by: Stanislaw Gruszka Cc: "Rafael J.
Wysocki" Cc: Andrea Arcangeli Acked-by: Christoph Lameter Cc: Mel Gorman Cc: Stanislaw Gruszka Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slub.c b/mm/slub.c index 025f6ac..d99acbf 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3654,6 +3654,9 @@ void __init kmem_cache_init(void) struct kmem_cache *temp_kmem_cache_node; unsigned long kmalloc_size; + if (debug_guardpage_minorder()) + slub_max_order = 0; + kmem_size = offsetof(struct kmem_cache, node) + nr_node_ids * sizeof(struct kmem_cache_node *); -- cgit v0.10.2 From ad8a1b558e6c76fb53901956d3c8f29b82a4ccfa Mon Sep 17 00:00:00 2001 From: Shawn Bohrer Date: Tue, 10 Jan 2012 15:07:35 -0800 Subject: fadvise: only initiate writeback for specified range with FADV_DONTNEED Previously POSIX_FADV_DONTNEED would start writeback for the entire file when the bdi was not write congested. This negatively impacts performance if the file contains dirty pages outside of the requested range. This change uses __filemap_fdatawrite_range() to only initiate writeback for the requested range. Signed-off-by: Shawn Bohrer Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/fadvise.c b/mm/fadvise.c index 8d723c9..469491e0 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -117,7 +117,8 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) break; case POSIX_FADV_DONTNEED: if (!bdi_write_congested(mapping->backing_dev_info)) - filemap_flush(mapping); + __filemap_fdatawrite_range(mapping, offset, endbyte, + WB_SYNC_NONE); /* First and last FULL page! 
*/ start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; -- cgit v0.10.2 From f6d7e0cb3ecc248e98fa11d83253f6174bd7e085 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 10 Jan 2012 15:07:38 -0800 Subject: mm, debug: test for online nid when allocating on single node Calling alloc_pages_exact_node() means the allocation only passes the zonelist of a single node into the page allocator. If that node isn't online, its zonelist may never have been initialized causing a strange oops that may not immediately be clear. I recently debugged an issue where node 0 wasn't online and an allocator was passing 0 to alloc_pages_exact_node() and it resulted in a NULL pointer on zonelist->_zoneref. If CONFIG_DEBUG_VM is enabled, though, it would be nice to catch this a bit earlier. Signed-off-by: David Rientjes Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 91812df..66f172f 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -313,7 +313,7 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask, unsigned int order) { - VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); + VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES || !node_online(nid)); return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask)); } -- cgit v0.10.2 From 25bd91bd27820d5971258cecd1c0e64b0e485144 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 10 Jan 2012 15:07:40 -0800 Subject: vmscan: add task name to warn_scan_unevictable() messages If we need to know a use case, the caller's program name is critically important. Show it.
Signed-off-by: KOSAKI Motohiro Acked-by: Johannes Weiner David Rientjes Reviewed-by: Minchan Kim Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index 3d571df..974162c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3436,9 +3436,10 @@ void scan_mapping_unevictable_pages(struct address_space *mapping) static void warn_scan_unevictable_pages(void) { printk_once(KERN_WARNING - "The scan_unevictable_pages sysctl/node-interface has been " + "%s: The scan_unevictable_pages sysctl/node-interface has been " "disabled for lack of a legitimate use case. If you have " - "one, please send an email to linux-mm@kvack.org.\n"); + "one, please send an email to linux-mm@kvack.org.\n", + current->comm); } /* -- cgit v0.10.2 From ab8fabd46f811d5153d8a0cd2fac9a0d41fb593d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Jan 2012 15:07:42 -0800 Subject: mm: exclude reserved pages from dirtyable memory Per-zone dirty limits try to distribute page cache pages allocated for writing across zones in proportion to the individual zone sizes, to reduce the likelihood of reclaim having to write back individual pages from the LRU lists in order to make progress. This patch: The amount of dirtyable pages should not include the full number of free pages: there is a number of reserved pages that the page allocator and kswapd always try to keep free. 
The closer (reclaimable pages - dirty pages) is to the number of reserved pages, the more likely it becomes for reclaim to run into dirty pages: +----------+ --- | anon | | +----------+ | | | | | | -- dirty limit new -- flusher new | file | | | | | | | | | -- dirty limit old -- flusher old | | | +----------+ --- reclaim | reserved | +----------+ | kernel | +----------+ This patch introduces a per-zone dirty reserve that takes both the lowmem reserve as well as the high watermark of the zone into account, and a global sum of those per-zone values that is subtracted from the global amount of dirtyable pages. The lowmem reserve is unavailable to page cache allocations and kswapd tries to keep the high watermark free. We don't want to end up in a situation where reclaim has to clean pages in order to balance zones. Not treating reserved pages as dirtyable on a global level is only a conceptual fix. In reality, dirty pages are not distributed equally across zones and reclaim runs into dirty pages on a regular basis. But it is important to get this right before tackling the problem on a per-zone level, where the distance between reclaim and the dirty pages is mostly much smaller in absolute numbers. [akpm@linux-foundation.org: fix highmem build] Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Reviewed-by: Michal Hocko Reviewed-by: Minchan Kim Acked-by: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: Christoph Hellwig Cc: Wu Fengguang Cc: Dave Chinner Cc: Jan Kara Cc: Shaohua Li Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3ac040f..ca6ca92 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -317,6 +317,12 @@ struct zone { */ unsigned long lowmem_reserve[MAX_NR_ZONES]; + /* + * This is a per-zone reserve of pages that should not be + * considered dirtyable memory. 
+ */ + unsigned long dirty_balance_reserve; + #ifdef CONFIG_NUMA int node; /* diff --git a/include/linux/swap.h b/include/linux/swap.h index 1e22e12..06061a7 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -207,6 +207,7 @@ struct swap_list_t { /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; extern unsigned long totalreserve_pages; +extern unsigned long dirty_balance_reserve; extern unsigned int nr_free_buffer_pages(void); extern unsigned int nr_free_pagecache_pages(void); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c081bf6..9ab6de8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -157,7 +157,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; x += zone_page_state(z, NR_FREE_PAGES) + - zone_reclaimable_pages(z); + zone_reclaimable_pages(z) - z->dirty_balance_reserve; } /* * Make sure that the number of highmem pages is never larger @@ -181,7 +181,8 @@ static unsigned long determine_dirtyable_memory(void) { unsigned long x; - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); + x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages() - + dirty_balance_reserve; if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 93baebc..2cb9eb7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -97,6 +97,14 @@ EXPORT_SYMBOL(node_states); unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; +/* + * When calculating the number of globally allowed dirty pages, there + * is a certain number of per-zone reserves that should not be + * considered dirtyable memory. This is the sum of those reserves + * over all existing zones that contribute dirtyable memory. 
+ */ +unsigned long dirty_balance_reserve __read_mostly; + int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; @@ -4822,8 +4830,19 @@ static void calculate_totalreserve_pages(void) if (max > zone->present_pages) max = zone->present_pages; reserve_pages += max; + /* + * Lowmem reserves are not available to + * GFP_HIGHUSER page cache allocations and + * kswapd tries to balance zones to their high + * watermark. As a result, neither should be + * regarded as dirtyable memory, to prevent a + * situation where reclaim has to clean pages + * in order to balance the zones. + */ + zone->dirty_balance_reserve = max; } } + dirty_balance_reserve = reserve_pages; totalreserve_pages = reserve_pages; } -- cgit v0.10.2 From ccafa2879fb8d13b8031337a8743eac4189e5d6e Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Jan 2012 15:07:44 -0800 Subject: mm: writeback: cleanups in preparation for per-zone dirty limits The next patch will introduce per-zone dirty limiting functions in addition to the traditional global dirty limiting. Rename determine_dirtyable_memory() to global_dirtyable_memory() before adding the zone-specific version, and fix up its documentation. Also, move the functions to determine the dirtyable memory and the function to calculate the dirty limit based on that together so that their relationship is more apparent and that they can be commented on as a group. 
Signed-off-by: Johannes Weiner Reviewed-by: Minchan Kim Acked-by: Mel Gorman Reviewed-by: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Christoph Hellwig Cc: Wu Fengguang Cc: Dave Chinner Cc: Jan Kara Cc: Shaohua Li Cc: Rik van Riel Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 9ab6de8..433fa99 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -146,6 +146,7 @@ static struct prop_descriptor vm_completions; * We make sure that the background writeout level is below the adjusted * clamping level. */ + static unsigned long highmem_dirtyable_memory(unsigned long total) { #ifdef CONFIG_HIGHMEM @@ -172,12 +173,12 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) } /** - * determine_dirtyable_memory - amount of memory that may be used + * global_dirtyable_memory - number of globally dirtyable pages * - * Returns the numebr of pages that can currently be freed and used - * by the kernel for direct mappings. + * Returns the global number of pages potentially available for dirty + * page cache. This is the base value for the global dirty limits. */ -static unsigned long determine_dirtyable_memory(void) +unsigned long global_dirtyable_memory(void) { unsigned long x; @@ -191,6 +192,47 @@ static unsigned long determine_dirtyable_memory(void) } /* + * global_dirty_limits - background-writeback and dirty-throttling thresholds + * + * Calculate the dirty thresholds based on sysctl parameters + * - vm.dirty_background_ratio or vm.dirty_background_bytes + * - vm.dirty_ratio or vm.dirty_bytes + * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and + * real-time tasks. 
+ */ +void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) +{ + unsigned long background; + unsigned long dirty; + unsigned long uninitialized_var(available_memory); + struct task_struct *tsk; + + if (!vm_dirty_bytes || !dirty_background_bytes) + available_memory = global_dirtyable_memory(); + + if (vm_dirty_bytes) + dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); + else + dirty = (vm_dirty_ratio * available_memory) / 100; + + if (dirty_background_bytes) + background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); + else + background = (dirty_background_ratio * available_memory) / 100; + + if (background >= dirty) + background = dirty / 2; + tsk = current; + if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { + background += background / 4; + dirty += dirty / 4; + } + *pbackground = background; + *pdirty = dirty; + trace_global_dirty_state(background, dirty); +} + +/* * couple the period to the dirty_ratio: * * period/2 ~ roundup_pow_of_two(dirty limit) @@ -202,7 +244,7 @@ static int calc_period_shift(void) if (vm_dirty_bytes) dirty_total = vm_dirty_bytes / PAGE_SIZE; else - dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / + dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) / 100; return 2 + ilog2(dirty_total - 1); } @@ -362,47 +404,6 @@ static unsigned long hard_dirty_limit(unsigned long thresh) return max(thresh, global_dirty_limit); } -/* - * global_dirty_limits - background-writeback and dirty-throttling thresholds - * - * Calculate the dirty thresholds based on sysctl parameters - * - vm.dirty_background_ratio or vm.dirty_background_bytes - * - vm.dirty_ratio or vm.dirty_bytes - * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and - * real-time tasks. 
- */ -void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) -{ - unsigned long background; - unsigned long dirty; - unsigned long uninitialized_var(available_memory); - struct task_struct *tsk; - - if (!vm_dirty_bytes || !dirty_background_bytes) - available_memory = determine_dirtyable_memory(); - - if (vm_dirty_bytes) - dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); - else - dirty = (vm_dirty_ratio * available_memory) / 100; - - if (dirty_background_bytes) - background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); - else - background = (dirty_background_ratio * available_memory) / 100; - - if (background >= dirty) - background = dirty / 2; - tsk = current; - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { - background += background / 4; - dirty += dirty / 4; - } - *pbackground = background; - *pdirty = dirty; - trace_global_dirty_state(background, dirty); -} - /** * bdi_dirty_limit - @bdi's share of dirty throttling threshold * @bdi: the backing_dev_info to query -- cgit v0.10.2 From a756cf5908530e8b40bdf569eb48b40139e8d7fd Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Jan 2012 15:07:49 -0800 Subject: mm: try to distribute dirty pages fairly across zones The maximum number of dirty pages that exist in the system at any time is determined by a number of pages considered dirtyable and a user-configured percentage of those, or an absolute number in bytes. This number of dirtyable pages is the sum of memory provided by all the zones in the system minus their lowmem reserves and high watermarks, so that the system can retain a healthy number of free pages without having to reclaim dirty pages. But there is a flaw in that we have a zoned page allocator which does not care about the global state but rather the state of individual memory zones. 
And right now there is nothing that prevents one zone from filling up with dirty pages while other zones are spared, which frequently leads to situations where kswapd, in order to restore the watermark of free pages, does indeed have to write pages from that zone's LRU list. This can interfere so badly with IO from the flusher threads that major filesystems (btrfs, xfs, ext4) mostly ignore write requests from reclaim already, taking away the VM's only possibility to keep such a zone balanced, aside from hoping the flushers will soon clean pages from that zone. Enter per-zone dirty limits. They are to a zone's dirtyable memory what the global limit is to the global amount of dirtyable memory, and try to make sure that no single zone receives more than its fair share of the globally allowed dirty pages in the first place. As the number of pages considered dirtyable excludes the zones' lowmem reserves and high watermarks, the maximum number of dirty pages in a zone is such that the zone can always be balanced without requiring page cleaning. As this is a placement decision in the page allocator and pages are dirtied only after the allocation, this patch allows allocators to pass __GFP_WRITE when they know in advance that the page will be written to and become dirty soon. The page allocator will then attempt to allocate from the first zone of the zonelist - which on NUMA is determined by the task's NUMA memory policy - that has not exceeded its dirty limit. At first glance, it would appear that the diversion to lower zones can increase pressure on them, but this is not the case. With a full high zone, allocations will be diverted to lower zones eventually, so it is more of a shift in timing of the lower zone allocations. 
Workloads that previously could fit their dirty pages completely in the higher zone may be forced to allocate from lower zones, but the number of pages that "spill over" is itself limited by the lower zones' dirty constraints, and thus unlikely to become a problem. For now, the problem of unfair dirty page distribution remains for NUMA configurations where the zones allowed for allocation are in sum not big enough to trigger the global dirty limits, wake up the flusher threads and remedy the situation. Because of this, an allocation that could not succeed on any of the considered zones is allowed to ignore the dirty limits before going into direct reclaim or even failing the allocation, until a future patch changes the global dirty throttling and flusher thread activation so that they take individual zone states into account. Test results 15M DMA + 3246M DMA32 + 504M Normal = 3765M memory 40% dirty ratio 16G USB thumb drive 10 runs of dd if=/dev/zero of=disk/zeroes bs=32k count=$((10 << 15)) seconds nr_vmscan_write (stddev) min| median| max xfs vanilla: 549.747( 3.492) 0.000| 0.000| 0.000 patched: 550.996( 3.802) 0.000| 0.000| 0.000 fuse-ntfs vanilla: 1183.094(53.178) 54349.000| 59341.000| 65163.000 patched: 558.049(17.914) 0.000| 0.000| 43.000 btrfs vanilla: 573.679(14.015) 156657.000| 460178.000| 606926.000 patched: 563.365(11.368) 0.000| 0.000| 1362.000 ext4 vanilla: 561.197(15.782) 0.000|2725438.000|4143837.000 patched: 568.806(17.496) 0.000| 0.000| 0.000 Signed-off-by: Johannes Weiner Reviewed-by: Minchan Kim Acked-by: Mel Gorman Reviewed-by: Michal Hocko Tested-by: Wu Fengguang Cc: KAMEZAWA Hiroyuki Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jan Kara Cc: Shaohua Li Cc: Rik van Riel Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 66f172f..581e74b 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -36,6 +36,7 @@ struct vm_area_struct; #endif #define
___GFP_NO_KSWAPD 0x400000u #define ___GFP_OTHER_NODE 0x800000u +#define ___GFP_WRITE 0x1000000u /* * GFP bitmasks.. @@ -85,6 +86,7 @@ struct vm_area_struct; #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ +#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ /* * This may seem redundant, but it's a way of annotating false positives vs. @@ -92,7 +94,7 @@ struct vm_area_struct; */ #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) -#define __GFP_BITS_SHIFT 24 /* Room for N __GFP_FOO bits */ +#define __GFP_BITS_SHIFT 25 /* Room for N __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* This equals 0, but use constants in case they ever change */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 34a0055..6dff473 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -124,6 +124,7 @@ void laptop_mode_timer_fn(unsigned long data); static inline void laptop_sync_completion(void) { } #endif void throttle_vm_writeout(gfp_t gfp_mask); +bool zone_dirty_ok(struct zone *zone); extern unsigned long global_dirty_limit; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 433fa99..5cdd4f2 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -147,6 +147,24 @@ static struct prop_descriptor vm_completions; * clamping level. */ +/* + * In a memory zone, there is a certain amount of pages we consider + * available for the page cache, which is essentially the number of + * free and reclaimable pages, minus some zone reserves to protect + * lowmem and the ability to uphold the zone's watermarks without + * requiring writeback. + * + * This number of dirtyable pages is the base value of which the + * user-configurable dirty ratio is the effective number of pages that + * are allowed to be actually dirtied.
Per individual zone, or + * globally by using the sum of dirtyable pages over all zones. + * + * Because the user is allowed to specify the dirty limit globally as + * absolute number of bytes, calculating the per-zone dirty limit can + * require translating the configured limit into a percentage of + * global dirtyable memory first. + */ + static unsigned long highmem_dirtyable_memory(unsigned long total) { #ifdef CONFIG_HIGHMEM @@ -232,6 +250,70 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) trace_global_dirty_state(background, dirty); } +/** + * zone_dirtyable_memory - number of dirtyable pages in a zone + * @zone: the zone + * + * Returns the zone's number of pages potentially available for dirty + * page cache. This is the base value for the per-zone dirty limits. + */ +static unsigned long zone_dirtyable_memory(struct zone *zone) +{ + /* + * The effective global number of dirtyable pages may exclude + * highmem as a big-picture measure to keep the ratio between + * dirty memory and lowmem reasonable. + * + * But this function is purely about the individual zone and a + * highmem zone can hold its share of dirty pages, so we don't + * care about vm_highmem_is_dirtyable here. + */ + return zone_page_state(zone, NR_FREE_PAGES) + + zone_reclaimable_pages(zone) - + zone->dirty_balance_reserve; +} + +/** + * zone_dirty_limit - maximum number of dirty pages allowed in a zone + * @zone: the zone + * + * Returns the maximum number of dirty pages allowed in a zone, based + * on the zone's dirtyable memory. 
+ */ +static unsigned long zone_dirty_limit(struct zone *zone) +{ + unsigned long zone_memory = zone_dirtyable_memory(zone); + struct task_struct *tsk = current; + unsigned long dirty; + + if (vm_dirty_bytes) + dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * + zone_memory / global_dirtyable_memory(); + else + dirty = vm_dirty_ratio * zone_memory / 100; + + if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) + dirty += dirty / 4; + + return dirty; +} + +/** + * zone_dirty_ok - tells whether a zone is within its dirty limits + * @zone: the zone to check + * + * Returns %true when the dirty pages in @zone are within the zone's + * dirty limit, %false if the limit is exceeded. + */ +bool zone_dirty_ok(struct zone *zone) +{ + unsigned long limit = zone_dirty_limit(zone); + + return zone_page_state(zone, NR_FILE_DIRTY) + + zone_page_state(zone, NR_UNSTABLE_NFS) + + zone_page_state(zone, NR_WRITEBACK) <= limit; +} + /* * couple the period to the dirty_ratio: * diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2cb9eb7..4f95bcf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1735,6 +1735,35 @@ zonelist_scan: if ((alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed_softwall(zone, gfp_mask)) continue; + /* + * When allocating a page cache page for writing, we + * want to get it from a zone that is within its dirty + * limit, such that no single zone holds more than its + * proportional share of globally allowed dirty pages. + * The dirty limits take into account the zone's + * lowmem reserves and high watermark so that kswapd + * should be able to balance it without having to + * write pages from its LRU list. + * + * This may look like it could increase pressure on + * lower zones by failing allocations in higher zones + * before they are full. But the pages that do spill + * over are limited as the lower zones are protected + * by this very same mechanism. It should not become + * a practical burden to them. 
+ * + * XXX: For now, allow allocations to potentially + * exceed the per-zone dirty limit in the slowpath + * (ALLOC_WMARK_LOW unset) before going into reclaim, + * which is important when on a NUMA setup the allowed + * zones are together not big enough to reach the + * global limit. The proper fix for these situations + * will require awareness of zones in the + * dirty-throttling and the flusher threads. + */ + if ((alloc_flags & ALLOC_WMARK_LOW) && + (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) + goto this_zone_full; BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { -- cgit v0.10.2 From 0faa70cb0180d45a06208e54b552a538aabb8a30 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Jan 2012 15:07:53 -0800 Subject: mm: filemap: pass __GFP_WRITE from grab_cache_page_write_begin() Tell the page allocator that pages allocated through grab_cache_page_write_begin() are expected to become dirty soon. Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Acked-by: Mel Gorman Reviewed-by: Minchan Kim Reviewed-by: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Christoph Hellwig Cc: Wu Fengguang Cc: Dave Chinner Cc: Jan Kara Cc: Shaohua Li Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/filemap.c b/mm/filemap.c index a0701e6..c4ee2e9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2351,8 +2351,11 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index, unsigned flags) { int status; + gfp_t gfp_mask; struct page *page; gfp_t gfp_notmask = 0; + + gfp_mask = mapping_gfp_mask(mapping) | __GFP_WRITE; if (flags & AOP_FLAG_NOFS) gfp_notmask = __GFP_FS; repeat: @@ -2360,7 +2363,7 @@ repeat: if (page) goto found; - page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); + page = __page_cache_alloc(gfp_mask & ~gfp_notmask); if (!page) return NULL; status = add_to_page_cache_lru(page, mapping, index, -- cgit v0.10.2 From 
e3a41a5ba9c2ab988b9f1442925109dca2382fd9 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Jan 2012 15:07:55 -0800 Subject: btrfs: pass __GFP_WRITE for buffered write page allocations Tell the page allocator that pages allocated for a buffered write are expected to become dirty soon. Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Acked-by: Mel Gorman Cc: Minchan Kim Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Christoph Hellwig Cc: Wu Fengguang Cc: Dave Chinner Cc: Jan Kara Cc: Shaohua Li Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 97fbe93..20375e6 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1081,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, again: for (i = 0; i < num_pages; i++) { pages[i] = find_or_create_page(inode->i_mapping, index + i, - mask); + mask | __GFP_WRITE); if (!pages[i]) { faili = i - 1; err = -ENOMEM; -- cgit v0.10.2 From 31b8384a555d94c78e2ea2284a323cb985441f60 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Tue, 10 Jan 2012 15:07:59 -0800 Subject: mm: compaction: push isolate search base of compact control one pfn ahead Once it has been isolated, the current pfn will no longer be scanned and isolated if the next round is necessary, so push the isolate_migratepages search base of the given compact_control one step ahead.
Signed-off-by: Hillf Danton Reviewed-by: Andrea Arcangeli Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/compaction.c b/mm/compaction.c index 1253d7a..e6670c3 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -365,8 +365,10 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, nr_isolated++; /* Avoid isolating too much */ - if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) + if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { + ++low_pfn; break; + } } acct_isolated(zone, cc); -- cgit v0.10.2 From 9571a982903bf9dcbca2479fd3e7dafd2211ecf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 10 Jan 2012 15:08:00 -0800 Subject: bootmem: micro optimize freeing pages in bulk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The first entry of bdata->node_bootmem_map holds the data for bdata->node_min_pfn up to bdata->node_min_pfn + BITS_PER_LONG - 1. So the test for freeing all pages of a single map entry can be slightly relaxed. Moreover use DIV_ROUND_UP in another place instead of open coding it. 
Signed-off-by: Uwe Kleine-König Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/bootmem.c b/mm/bootmem.c index 1a77012..3e6f152 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -56,7 +56,7 @@ early_param("bootmem_debug", bootmem_debug_setup); static unsigned long __init bootmap_bytes(unsigned long pages) { - unsigned long bytes = (pages + 7) / 8; + unsigned long bytes = DIV_ROUND_UP(pages, 8); return ALIGN(bytes, sizeof(long)); } @@ -197,7 +197,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) idx = start - bdata->node_min_pfn; vec = ~map[idx / BITS_PER_LONG]; - if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) { + if (aligned && vec == ~0UL && start + BITS_PER_LONG <= end) { int order = ilog2(BITS_PER_LONG); __free_pages_bootmem(pfn_to_page(start), order); -- cgit v0.10.2 From df0a6daa01fa3856c08f4274d4f21a8092caa480 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Jan 2012 15:08:02 -0800 Subject: mm: fix off-by-two in __zone_watermark_ok() Commit 88f5acf88ae6 ("mm: page allocator: adjust the per-cpu counter threshold when memory is low") changed the form how free_pages is calculated but it forgot that we used to do free_pages - ((1 << order) - 1) so we ended up with off-by-two when calculating free_pages. 
Reported-by: Wang Sheng-Hui Signed-off-by: Michal Hocko Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4f95bcf..59153da 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1525,7 +1525,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, long min = mark; int o; - free_pages -= (1 << order) + 1; + free_pages -= (1 << order) - 1; if (alloc_flags & ALLOC_HIGH) min -= min / 2; if (alloc_flags & ALLOC_HARDER) -- cgit v0.10.2 From 948f017b093a9baac23855fcd920d3a970b71bb6 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 10 Jan 2012 15:08:05 -0800 Subject: mremap: enforce rmap src/dst vma ordering in case of vma_merge() succeeding in copy_vma() migrate was doing an rmap_walk with speculative lock-less access on pagetables. That could lead it to not serializing properly against mremap PT locks. But a second problem remains in the order of vmas in the same_anon_vma list used by the rmap_walk. If vma_merge succeeds in copy_vma, the src vma could be placed after the dst vma in the same_anon_vma list. That could still lead to migrate missing some pte. This patch adds an anon_vma_moveto_tail() function to force the dst vma at the end of the list before mremap starts to solve the problem. If the mremap is very large and there are a lot of parents or children sharing the anon_vma root lock, this should still scale better than taking the anon_vma root lock around every pte copy practically for the whole duration of mremap. Update: Hugh noticed special care is needed in the error path where move_page_tables goes in the reverse direction, a second anon_vma_moveto_tail() call is needed in the error path.
This program exercises the anon_vma_moveto_tail: === int main() { static struct timeval oldstamp, newstamp; long diffsec; char *p, *p2, *p3, *p4; if (posix_memalign((void **)&p, 2*1024*1024, SIZE)) perror("memalign"), exit(1); if (posix_memalign((void **)&p2, 2*1024*1024, SIZE)) perror("memalign"), exit(1); if (posix_memalign((void **)&p3, 2*1024*1024, SIZE)) perror("memalign"), exit(1); memset(p, 0xff, SIZE); printf("%p\n", p); memset(p2, 0xff, SIZE); memset(p3, 0x77, 4096); if (memcmp(p, p2, SIZE)) printf("error\n"); p4 = mremap(p+SIZE/2, SIZE/2, SIZE/2, MREMAP_FIXED|MREMAP_MAYMOVE, p3); if (p4 != p3) perror("mremap"), exit(1); p4 = mremap(p4, SIZE/2, SIZE/2, MREMAP_FIXED|MREMAP_MAYMOVE, p+SIZE/2); if (p4 != p+SIZE/2) perror("mremap"), exit(1); if (memcmp(p, p2, SIZE)) printf("error\n"); printf("ok\n"); return 0; } === $ perf probe -a anon_vma_moveto_tail Add new event: probe:anon_vma_moveto_tail (on anon_vma_moveto_tail) You can now use it on all perf tools, such as: perf record -e probe:anon_vma_moveto_tail -aR sleep 1 $ perf record -e probe:anon_vma_moveto_tail -aR ./anon_vma_moveto_tail 0x7f2ca2800000 ok [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.043 MB perf.data (~1860 samples) ] $ perf report --stdio 100.00% anon_vma_moveto [kernel.kallsyms] [k] anon_vma_moveto_tail Signed-off-by: Andrea Arcangeli Reported-by: Nai Xia Acked-by: Mel Gorman Cc: Hugh Dickins Cc: Pawel Sikora Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 2148b12..1afb995 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -120,6 +120,7 @@ void anon_vma_init(void); /* create anon_vma_cachep */ int anon_vma_prepare(struct vm_area_struct *); void unlink_anon_vmas(struct vm_area_struct *); int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *); +void anon_vma_moveto_tail(struct vm_area_struct *); int anon_vma_fork(struct vm_area_struct *, struct 
vm_area_struct *); void __anon_vma_link(struct vm_area_struct *); diff --git a/mm/mmap.c b/mm/mmap.c index eae90af..adea3b8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2322,13 +2322,16 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, struct vm_area_struct *new_vma, *prev; struct rb_node **rb_link, *rb_parent; struct mempolicy *pol; + bool faulted_in_anon_vma = true; /* * If anonymous vma has not yet been faulted, update new pgoff * to match new location, to increase its chance of merging. */ - if (!vma->vm_file && !vma->anon_vma) + if (unlikely(!vma->vm_file && !vma->anon_vma)) { pgoff = addr >> PAGE_SHIFT; + faulted_in_anon_vma = false; + } find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, @@ -2337,9 +2340,24 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, /* * Source vma may have been merged into new_vma */ - if (vma_start >= new_vma->vm_start && - vma_start < new_vma->vm_end) + if (unlikely(vma_start >= new_vma->vm_start && + vma_start < new_vma->vm_end)) { + /* + * The only way we can get a vma_merge with + * self during an mremap is if the vma hasn't + * been faulted in yet and we were allowed to + * reset the dst vma->vm_pgoff to the + * destination address of the mremap to allow + * the merge to happen. mremap must change the + * vm_pgoff linearity between src and dst vmas + * (in turn preventing a vma_merge) to be + * safe. It is only safe to keep the vm_pgoff + * linear if there are no pages mapped yet. 
+ */ + VM_BUG_ON(faulted_in_anon_vma); *vmap = new_vma; + } else + anon_vma_moveto_tail(new_vma); } else { new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new_vma) { diff --git a/mm/mremap.c b/mm/mremap.c index d6959cb..87bb839 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -221,6 +221,15 @@ static unsigned long move_vma(struct vm_area_struct *vma, moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); if (moved_len < old_len) { /* + * Before moving the page tables from the new vma to + * the old vma, we need to be sure the old vma is + * queued after new vma in the same_anon_vma list to + * prevent SMP races with rmap_walk (that could lead + * rmap_walk to miss some page table). + */ + anon_vma_moveto_tail(vma); + + /* * On error, move entries back from new area to old, * which will succeed since page tables still there, * and then proceed to unmap new area instead of old. diff --git a/mm/rmap.c b/mm/rmap.c index a4fd368..a2e5ce1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -272,6 +272,51 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) } /* + * Some rmap walk that needs to find all ptes/hugepmds without false + * negatives (like migrate and split_huge_page) running concurrent + * with operations that copy or move pagetables (like mremap() and + * fork()) to be safe. They depend on the anon_vma "same_anon_vma" + * list to be in a certain order: the dst_vma must be placed after the + * src_vma in the list. This is always guaranteed by fork() but + * mremap() needs to call this function to enforce it in case the + * dst_vma isn't newly allocated and chained with the anon_vma_clone() + * function but just an extension of a pre-existing vma through + * vma_merge. + * + * NOTE: the same_anon_vma list can still be changed by other + * processes while mremap runs because mremap doesn't hold the + * anon_vma mutex to prevent modifications to the list while it + * runs. 
All we need to enforce is that the relative order of this + * process vmas isn't changing (we don't care about other vmas + * order). Each vma corresponds to an anon_vma_chain structure so + * there's no risk that other processes calling anon_vma_moveto_tail() + * and changing the same_anon_vma list under mremap() will screw with + * the relative order of this process vmas in the list, because + * they can't alter the order of any vma that belongs to this + * process. And there can't be another anon_vma_moveto_tail() running + * concurrently with mremap() coming from this process because we hold + * the mmap_sem for the whole mremap(). fork() ordering dependency + * also shouldn't be affected because fork() only cares that the + * parent vmas are placed in the list before the child vmas and + * anon_vma_moveto_tail() won't reorder vmas from either the fork() + * parent or child. + */ +void anon_vma_moveto_tail(struct vm_area_struct *dst) +{ + struct anon_vma_chain *pavc; + struct anon_vma *root = NULL; + + list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) { + struct anon_vma *anon_vma = pavc->anon_vma; + VM_BUG_ON(pavc->vma != dst); + root = lock_anon_vma_root(root, anon_vma); + list_del(&pavc->same_anon_vma); + list_add_tail(&pavc->same_anon_vma, &anon_vma->head); + } + unlock_anon_vma_root(root); +} + +/* * Attach vma to its own anon_vma, as well as to the anon_vmas that * the corresponding VMA in the parent process is attached to. * Returns 0 on success, non-zero on failure. -- cgit v0.10.2 From 6bd4837de96e7d9f9bf33e59117c24fc230862ac Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 10 Jan 2012 15:08:07 -0800 Subject: mm: simplify find_vma_prev() commit 297c5eee37 ("mm: make the vma list be doubly linked") added the vm_prev member to vm_area_struct. We can simplify find_vma_prev() by using it. Also, this change helps to improve page fault performance because it has stronger locality of reference.
Signed-off-by: KOSAKI Motohiro Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Peter Zijlstra Cc: Shaohua Li Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mmap.c b/mm/mmap.c index adea3b8..3f758c7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1603,39 +1603,19 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) EXPORT_SYMBOL(find_vma); -/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */ +/* + * Same as find_vma, but also return a pointer to the previous VMA in *pprev. + * Note: pprev is set to NULL when return value is NULL. + */ struct vm_area_struct * find_vma_prev(struct mm_struct *mm, unsigned long addr, struct vm_area_struct **pprev) { - struct vm_area_struct *vma = NULL, *prev = NULL; - struct rb_node *rb_node; - if (!mm) - goto out; - - /* Guard against addr being lower than the first VMA */ - vma = mm->mmap; - - /* Go through the RB tree quickly. */ - rb_node = mm->mm_rb.rb_node; - - while (rb_node) { - struct vm_area_struct *vma_tmp; - vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); - - if (addr < vma_tmp->vm_end) { - rb_node = rb_node->rb_left; - } else { - prev = vma_tmp; - if (!prev->vm_next || (addr < prev->vm_next->vm_end)) - break; - rb_node = rb_node->rb_right; - } - } + struct vm_area_struct *vma; -out: - *pprev = prev; - return prev ? prev->vm_next : vma; + vma = find_vma(mm, addr); + *pprev = vma ? vma->vm_prev : NULL; + return vma; } /* -- cgit v0.10.2 From 43d2b113241d6797b890318767e0af78e313414b Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 10 Jan 2012 15:08:09 -0800 Subject: tracepoint: add tracepoints for debugging oom_score_adj oom_score_adj is used for guarding processes from OOM-Killer. One problem is that it's inherited at fork(). When a daemon sets oom_score_adj and creates children, it's hard to know where the value was set. This patch adds some tracepoints useful for debugging. This patch adds 3 trace points.
- creating a new task - renaming a task (exec) - setting oom_score_adj To debug, users need to enable some of the tracepoints. Filtering may be useful, e.g.: # EVENT=/sys/kernel/debug/tracing/events/task/ # echo "oom_score_adj != 0" > $EVENT/task_newtask/filter # echo "oom_score_adj != 0" > $EVENT/task_rename/filter # echo 1 > $EVENT/enable # EVENT=/sys/kernel/debug/tracing/events/oom/ # echo 1 > $EVENT/enable The output will look like this: # grep oom /sys/kernel/debug/tracing/trace bash-7699 [007] d..3 5140.744510: oom_score_adj_update: pid=7699 comm=bash oom_score_adj=-1000 bash-7699 [007] ...1 5151.818022: task_newtask: pid=7729 comm=bash clone_flags=1200011 oom_score_adj=-1000 ls-7729 [003] ...2 5151.818504: task_rename: pid=7729 oldcomm=bash newcomm=ls oom_score_adj=-1000 bash-7699 [002] ...1 5175.701468: task_newtask: pid=7730 comm=bash clone_flags=1200011 oom_score_adj=-1000 grep-7730 [007] ...2 5175.701993: task_rename: pid=7730 oldcomm=bash newcomm=grep oom_score_adj=-1000 Signed-off-by: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/exec.c b/fs/exec.c index 3f64b9f..aeb135c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -59,6 +59,8 @@ #include #include #include + +#include #include "internal.h" int core_uses_pid; @@ -1054,6 +1056,8 @@ void set_task_comm(struct task_struct *tsk, char *buf) { task_lock(tsk); + trace_task_rename(tsk, buf); + /* * Threads may access current->comm without holding * the task lock, so write the string carefully.
diff --git a/fs/proc/base.c b/fs/proc/base.c index a1dddda..1aab5fe 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -86,6 +86,7 @@ #ifdef CONFIG_HARDWALL #include #endif +#include #include "internal.h" /* NOTE: @@ -1010,6 +1011,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, else task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; + trace_oom_score_adj_update(task); err_sighand: unlock_task_sighand(task, &flags); err_task_lock: @@ -1097,6 +1099,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, task->signal->oom_score_adj = oom_score_adj; if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) task->signal->oom_score_adj_min = oom_score_adj; + trace_oom_score_adj_update(task); /* * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is * always attainable. diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h new file mode 100644 index 0000000..dd4ba3b --- /dev/null +++ b/include/trace/events/oom.h @@ -0,0 +1,33 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM oom + +#if !defined(_TRACE_OOM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_OOM_H +#include + +TRACE_EVENT(oom_score_adj_update, + + TP_PROTO(struct task_struct *task), + + TP_ARGS(task), + + TP_STRUCT__entry( + __field( pid_t, pid) + __array( char, comm, TASK_COMM_LEN ) + __field( int, oom_score_adj) + ), + + TP_fast_assign( + __entry->pid = task->pid; + memcpy(__entry->comm, task->comm, TASK_COMM_LEN); + __entry->oom_score_adj = task->signal->oom_score_adj; + ), + + TP_printk("pid=%d comm=%s oom_score_adj=%d", + __entry->pid, __entry->comm, __entry->oom_score_adj) +); + +#endif + +/* This part must be outside protection */ +#include diff --git a/include/trace/events/task.h b/include/trace/events/task.h new file mode 100644 index 0000000..b53add0 --- /dev/null +++ b/include/trace/events/task.h @@ -0,0 +1,61 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM task + +#if 
!defined(_TRACE_TASK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_TASK_H +#include + +TRACE_EVENT(task_newtask, + + TP_PROTO(struct task_struct *task, unsigned long clone_flags), + + TP_ARGS(task, clone_flags), + + TP_STRUCT__entry( + __field( pid_t, pid) + __array( char, comm, TASK_COMM_LEN) + __field( unsigned long, clone_flags) + __field( int, oom_score_adj) + ), + + TP_fast_assign( + __entry->pid = task->pid; + memcpy(__entry->comm, task->comm, TASK_COMM_LEN); + __entry->clone_flags = clone_flags; + __entry->oom_score_adj = task->signal->oom_score_adj; + ), + + TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%d", + __entry->pid, __entry->comm, + __entry->clone_flags, __entry->oom_score_adj) +); + +TRACE_EVENT(task_rename, + + TP_PROTO(struct task_struct *task, char *comm), + + TP_ARGS(task, comm), + + TP_STRUCT__entry( + __field( pid_t, pid) + __array( char, oldcomm, TASK_COMM_LEN) + __array( char, newcomm, TASK_COMM_LEN) + __field( int, oom_score_adj) + ), + + TP_fast_assign( + __entry->pid = task->pid; + memcpy(entry->oldcomm, task->comm, TASK_COMM_LEN); + memcpy(entry->newcomm, comm, TASK_COMM_LEN); + __entry->oom_score_adj = task->signal->oom_score_adj; + ), + + TP_printk("pid=%d oldcomm=%s newcomm=%s oom_score_adj=%d", + __entry->pid, __entry->oldcomm, + __entry->newcomm, __entry->oom_score_adj) +); + +#endif + +/* This part must be outside protection */ +#include diff --git a/kernel/fork.c b/kernel/fork.c index b00711c..5e1391b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -76,6 +76,9 @@ #include +#define CREATE_TRACE_POINTS +#include + /* * Protected counters by write_lock_irq(&tasklist_lock) */ @@ -1370,6 +1373,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (clone_flags & CLONE_THREAD) threadgroup_change_end(current); perf_event_fork(p); + + trace_task_newtask(p, clone_flags); + return p; bad_fork_free_pid: diff --git a/mm/oom_kill.c b/mm/oom_kill.c index eeb27e2..7c122faa 100644 --- a/mm/oom_kill.c +++ 
b/mm/oom_kill.c @@ -33,6 +33,10 @@ #include #include #include +#include + +#define CREATE_TRACE_POINTS +#include int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; @@ -55,6 +59,7 @@ void compare_swap_oom_score_adj(int old_val, int new_val) spin_lock_irq(&sighand->siglock); if (current->signal->oom_score_adj == old_val) current->signal->oom_score_adj = new_val; + trace_oom_score_adj_update(current); spin_unlock_irq(&sighand->siglock); } @@ -74,6 +79,7 @@ int test_set_oom_score_adj(int new_val) spin_lock_irq(&sighand->siglock); old_val = current->signal->oom_score_adj; current->signal->oom_score_adj = new_val; + trace_oom_score_adj_update(current); spin_unlock_irq(&sighand->siglock); return old_val; -- cgit v0.10.2 From c3993076f842de3754360e5b998d6657a9d30303 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Jan 2012 15:08:10 -0800 Subject: mm: page_alloc: generalize order handling in __free_pages_bootmem() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit __free_pages_bootmem() used to special-case higher-order frees to save individual page checking with free_pages_bulk(). Nowadays, both zero order and non-zero order frees use free_pages(), which checks each individual page anyway, and so there is little point in making the distinction anymore. The higher-order loop will work just fine for zero order pages. 
Signed-off-by: Johannes Weiner Cc: Uwe Kleine-König Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 59153da..794e671 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -730,32 +730,23 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } -/* - * permit the bootmem allocator to evade page validation on high-order frees - */ void __meminit __free_pages_bootmem(struct page *page, unsigned int order) { - if (order == 0) { - __ClearPageReserved(page); - set_page_count(page, 0); - set_page_refcounted(page); - __free_page(page); - } else { - int loop; - - prefetchw(page); - for (loop = 0; loop < (1 << order); loop++) { - struct page *p = &page[loop]; + unsigned int nr_pages = 1 << order; + unsigned int loop; - if (loop + 1 < (1 << order)) - prefetchw(p + 1); - __ClearPageReserved(p); - set_page_count(p, 0); - } + prefetchw(page); + for (loop = 0; loop < nr_pages; loop++) { + struct page *p = &page[loop]; - set_page_refcounted(page); - __free_pages(page, order); + if (loop + 1 < nr_pages) + prefetchw(p + 1); + __ClearPageReserved(p); + set_page_count(p, 0); } + + set_page_refcounted(page); + __free_pages(page, order); } -- cgit v0.10.2 From 560a036b3a3733e33424385c0a0c799dee454d05 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Jan 2012 15:08:13 -0800 Subject: mm: bootmem: drop superfluous range check when freeing pages in bulk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The area node_bootmem_map represents is aligned to BITS_PER_LONG, and all bits in any aligned word of that map are valid. When the represented area extends beyond the end of the node, the non-existent pages will be marked as reserved. 
As a result, when freeing a page block, doing an explicit range check for whether that block is within the node's range is redundant as the bitmap is consulted anyway to see whether all pages in the block are unreserved. Signed-off-by: Johannes Weiner Cc: Uwe Kleine-König Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/bootmem.c b/mm/bootmem.c index 3e6f152..1aea171 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -197,7 +197,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) idx = start - bdata->node_min_pfn; vec = ~map[idx / BITS_PER_LONG]; - if (aligned && vec == ~0UL && start + BITS_PER_LONG <= end) { + if (aligned && vec == ~0UL) { int order = ilog2(BITS_PER_LONG); __free_pages_bootmem(pfn_to_page(start), order); -- cgit v0.10.2 From 799f933a82d878d7f15215473c5561ce984ada75 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Jan 2012 15:08:15 -0800 Subject: mm: bootmem: try harder to free pages in bulk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The loop that frees pages to the page allocator while bootstrapping tries to free higher-order blocks only when the starting address is aligned to that block size. Otherwise it will free all pages on that node one-by-one. Change it to free individual pages up to the first aligned block and then try higher-order frees from there. 
Signed-off-by: Johannes Weiner Cc: Uwe Kleine-König Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/bootmem.c b/mm/bootmem.c index 1aea171..668e94d 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -171,7 +171,6 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { - int aligned; struct page *page; unsigned long start, end, pages, count = 0; @@ -181,14 +180,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) start = bdata->node_min_pfn; end = bdata->node_low_pfn; - /* - * If the start is aligned to the machines wordsize, we might - * be able to free pages in bulks of that order. - */ - aligned = !(start & (BITS_PER_LONG - 1)); - - bdebug("nid=%td start=%lx end=%lx aligned=%d\n", - bdata - bootmem_node_data, start, end, aligned); + bdebug("nid=%td start=%lx end=%lx\n", + bdata - bootmem_node_data, start, end); while (start < end) { unsigned long *map, idx, vec; @@ -196,12 +189,17 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) map = bdata->node_bootmem_map; idx = start - bdata->node_min_pfn; vec = ~map[idx / BITS_PER_LONG]; - - if (aligned && vec == ~0UL) { + /* + * If we have a properly aligned and fully unreserved + * BITS_PER_LONG block of pages in front of us, free + * it in one go. 
+ */ + if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) { int order = ilog2(BITS_PER_LONG); __free_pages_bootmem(pfn_to_page(start), order); count += BITS_PER_LONG; + start += BITS_PER_LONG; } else { unsigned long off = 0; @@ -214,8 +212,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) vec >>= 1; off++; } + start = ALIGN(start + 1, BITS_PER_LONG); } - start += BITS_PER_LONG; } page = virt_to_page(bdata->node_bootmem_map); -- cgit v0.10.2 From 86cfd3a45042ab242d47f3935a02811a402beab6 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 10 Jan 2012 15:08:18 -0800 Subject: mm/vmscan.c: consider swap space when deciding whether to continue reclaim It's pointless to continue reclaiming when we have no swap space and lots of anon pages in the inactive list. Without this patch, it is possible when swap is disabled to continue trying to reclaim when there are only anonymous pages in the system even though that will not make any progress. Signed-off-by: Minchan Kim Cc: KOSAKI Motohiro Acked-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Johannes Weiner Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index 974162c..b935e6f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2000,8 +2000,9 @@ static inline bool should_continue_reclaim(struct zone *zone, * inactive lists are large enough, continue reclaiming */ pages_for_compaction = (2UL << sc->order); - inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + if (nr_swap_pages > 0) + inactive_lru_pages += zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); if (sc->nr_reclaimed < pages_for_compaction && inactive_lru_pages > pages_for_compaction) return true; -- cgit v0.10.2 From 0c176d52b0b2619f231b2bbf329b90c028134f58 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Tue, 10 Jan 2012 15:08:19 -0800 
Subject: mm: hugetlb: fix pgoff computation when unmapping page from vma The computation for pgoff is incorrect, at least with (vma->vm_pgoff >> PAGE_SHIFT) involved. It is fixed with the available method if HPAGE_SIZE is concerned in page cache lookup. [akpm@linux-foundation.org: use vma_hugecache_offset() directly, per Michal] Signed-off-by: Hillf Danton Cc: Mel Gorman Cc: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Cc: Andrea Arcangeli Cc: David Rientjes Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ab89d6f..bb7dc40 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2315,8 +2315,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * from page cache lookup which is in HPAGE_SIZE units. */ address = address & huge_page_mask(h); - pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) - + (vma->vm_pgoff >> PAGE_SHIFT); + pgoff = vma_hugecache_offset(h, vma, address); mapping = (struct address_space *)page_private(page); /* -- cgit v0.10.2 From fcfb4dcc9698f932836aa63ba0d82e7dbd300fb3 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 10 Jan 2012 15:08:21 -0800 Subject: mm/mempolicy.c: mpol_equal(): use bool mpol_equal() logically returns a boolean. Use a bool type to slightly improve readability. 
Signed-off-by: KOSAKI Motohiro Cc: Stephen Wilson Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 7978eec..7c727a9 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -164,11 +164,11 @@ static inline void mpol_get(struct mempolicy *pol) atomic_inc(&pol->refcnt); } -extern int __mpol_equal(struct mempolicy *a, struct mempolicy *b); -static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b) +extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b); +static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (a == b) - return 1; + return true; return __mpol_equal(a, b); } @@ -257,9 +257,9 @@ static inline int vma_migratable(struct vm_area_struct *vma) struct mempolicy {}; -static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b) +static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { - return 1; + return true; } static inline void mpol_put(struct mempolicy *p) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c3fdbcb..e3d58f0 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1983,28 +1983,28 @@ struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol, } /* Slow path of a mempolicy comparison */ -int __mpol_equal(struct mempolicy *a, struct mempolicy *b) +bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (!a || !b) - return 0; + return false; if (a->mode != b->mode) - return 0; + return false; if (a->flags != b->flags) - return 0; + return false; if (mpol_store_user_nodemask(a)) if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) - return 0; + return false; switch (a->mode) { case MPOL_BIND: /* Fall through */ case MPOL_INTERLEAVE: - return nodes_equal(a->v.nodes, b->v.nodes); + return !!nodes_equal(a->v.nodes, b->v.nodes); case MPOL_PREFERRED: return a->v.preferred_node == b->v.preferred_node; default: BUG(); - return 0; + return false; } 
} -- cgit v0.10.2 From 564c81db19f3630f53a14bbceb7b85eb9660ded3 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Tue, 10 Jan 2012 15:08:22 -0800 Subject: mm/migrate.c: cleanup comment for migration_entry_wait() migration_entry_wait() can also be called from hugetlb_fault() now. Remove the incorrect comment. Signed-off-by: Wang Sheng-Hui Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/migrate.c b/mm/migrate.c index 594dc37..670bb89 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -181,8 +181,6 @@ static void remove_migration_ptes(struct page *old, struct page *new) * Something used the pte of a page under migration. We need to * get to the page and wait until migration is finished. * When we return from this function the fault will be retried. - * - * This function is called from do_swap_page(). */ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) -- cgit v0.10.2 From 5b990546e33477c34ee6fbc20fad6584386b46c3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 10 Jan 2012 15:08:23 -0800 Subject: mempool: fix and document synchronization and memory barrier usage mempool_alloc/free() use undocumented smp_mb()'s. The code is slightly broken and misleading. The lockless part is in mempool_free(). It wants to determine whether the item being freed needs to be returned to the pool or backing allocator without grabbing pool->lock. Two things need to be guaranteed for correct operation. 1. pool->curr_nr + #allocated should never dip below pool->min_nr. 2. Waiters shouldn't be left dangling. For #1, The only necessary condition is that curr_nr visible at free is from after the allocation of the element being freed (details in the comment). For most cases, this is true without any barrier but there can be fringe cases where the allocated pointer is passed to the freeing task without going through memory barriers. 
To cover this case, wmb is necessary before returning from allocation and rmb is necessary before reading curr_nr. IOW, ALLOCATING TASK FREEING TASK update pool state after alloc; wmb(); pass pointer to freeing task; read pointer; rmb(); read pool state to free; The current code doesn't have wmb after pool update during allocation and may theoretically, on machines where unlock doesn't behave as full wmb, lead to pool depletion and deadlock. smp_wmb() needs to be added after successful allocation from reserved elements and smp_mb() in mempool_free() can be replaced with smp_rmb(). For #2, the waiter needs to add itself to waitqueue and then check the wait condition and the waker needs to update the wait condition and then wake up. Because waitqueue operations always go through full spinlock synchronization, there is no need for extra memory barriers. Furthermore, mempool_alloc() is already holding pool->lock when it decides that it needs to wait. There is no reason to do unlock - add waitqueue - test condition again. It can simply add itself to waitqueue while holding pool->lock and then unlock and sleep. This patch adds smp_wmb() after successful allocation from reserved pool, replaces smp_mb() in mempool_free() with smp_rmb() and extend pool->lock over waitqueue addition. More importantly, it explains what memory barriers do and how the lockless testing is correct. -v2: Oleg pointed out that unlock doesn't imply wmb. Added explicit smp_wmb() after successful allocation from reserved pool and updated comments accordingly. Signed-off-by: Tejun Heo Cc: Oleg Nesterov Cc: "Paul E. 
McKenney" Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mempool.c b/mm/mempool.c index e73641b..11f0d0a 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -224,28 +224,31 @@ repeat_alloc: if (likely(pool->curr_nr)) { element = remove_element(pool); spin_unlock_irqrestore(&pool->lock, flags); + /* paired with rmb in mempool_free(), read comment there */ + smp_wmb(); return element; } - spin_unlock_irqrestore(&pool->lock, flags); /* We must not sleep in the GFP_ATOMIC case */ - if (!(gfp_mask & __GFP_WAIT)) + if (!(gfp_mask & __GFP_WAIT)) { + spin_unlock_irqrestore(&pool->lock, flags); return NULL; + } - /* Now start performing page reclaim */ + /* Let's wait for someone else to return an element to @pool */ gfp_temp = gfp_mask; init_wait(&wait); prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); - smp_mb(); - if (!pool->curr_nr) { - /* - * FIXME: this should be io_schedule(). The timeout is there - * as a workaround for some DM problems in 2.6.18. - */ - io_schedule_timeout(5*HZ); - } - finish_wait(&pool->wait, &wait); + spin_unlock_irqrestore(&pool->lock, flags); + + /* + * FIXME: this should be io_schedule(). The timeout is there as a + * workaround for some DM problems in 2.6.18. + */ + io_schedule_timeout(5*HZ); + + finish_wait(&pool->wait, &wait); goto repeat_alloc; } EXPORT_SYMBOL(mempool_alloc); @@ -265,7 +268,39 @@ void mempool_free(void *element, mempool_t *pool) if (unlikely(element == NULL)) return; - smp_mb(); + /* + * Paired with the wmb in mempool_alloc(). The preceding read is + * for @element and the following @pool->curr_nr. This ensures + * that the visible value of @pool->curr_nr is from after the + * allocation of @element. This is necessary for fringe cases + * where @element was passed to this task without going through + * barriers. 
+ * + * For example, assume @p is %NULL at the beginning and one task + * performs "p = mempool_alloc(...);" while another task is doing + * "while (!p) cpu_relax(); mempool_free(p, ...);". This function + * may end up using curr_nr value which is from before allocation + * of @p without the following rmb. + */ + smp_rmb(); + + /* + * For correctness, we need a test which is guaranteed to trigger + * if curr_nr + #allocated == min_nr. Testing curr_nr < min_nr + * without locking achieves that and refilling as soon as possible + * is desirable. + * + * Because curr_nr visible here is always a value after the + * allocation of @element, any task which decremented curr_nr below + * min_nr is guaranteed to see curr_nr < min_nr unless curr_nr gets + * incremented to min_nr afterwards. If curr_nr gets incremented + * to min_nr after the allocation of @element, the elements + * allocated after that are subject to the same guarantee. + * + * Waiters happen iff curr_nr is 0 and the above guarantee also + * ensures that there will be frees which return elements to the + * pool waking up the waiters. + */ if (pool->curr_nr < pool->min_nr) { spin_lock_irqsave(&pool->lock, flags); if (pool->curr_nr < pool->min_nr) { -- cgit v0.10.2 From 0565d317768cc66b13e37184f29d9f270c2886dc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 10 Jan 2012 15:08:26 -0800 Subject: mempool: drop unnecessary and incorrect BUG_ON() from mempool_destroy() mempool_destroy() is a thin wrapper around free_pool(). The only thing it adds is BUG_ON(pool->curr_nr != pool->min_nr). The intention seems to be to enforce that all allocated elements are freed; however, the BUG_ON() can't achieve that (it doesn't know anything about objects above min_nr) and incorrect as mempool_resize() is allowed to leave the pool extended but not filled. Furthermore, panicking is way worse than any memory leak and there are better debug tools to track memory leaks. 
Drop the BUG_ON() from mempool_destroy() and, as that leaves the function identical to free_pool(), replace it. Signed-off-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mempool.c b/mm/mempool.c index 11f0d0a..e3a802a 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -27,7 +27,15 @@ static void *remove_element(mempool_t *pool) return pool->elements[--pool->curr_nr]; } -static void free_pool(mempool_t *pool) +/** + * mempool_destroy - deallocate a memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * + * Free all reserved elements in @pool and @pool itself. This function + * only sleeps if the free_fn() function sleeps. + */ +void mempool_destroy(mempool_t *pool) { while (pool->curr_nr) { void *element = remove_element(pool); @@ -36,6 +44,7 @@ static void free_pool(mempool_t *pool) kfree(pool->elements); kfree(pool); } +EXPORT_SYMBOL(mempool_destroy); /** * mempool_create - create a memory pool @@ -86,7 +95,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, element = pool->alloc(GFP_KERNEL, pool->pool_data); if (unlikely(!element)) { - free_pool(pool); + mempool_destroy(pool); return NULL; } add_element(pool, element); @@ -172,23 +181,6 @@ out: EXPORT_SYMBOL(mempool_resize); /** - * mempool_destroy - deallocate a memory pool - * @pool: pointer to the memory pool which was allocated via - * mempool_create(). - * - * this function only sleeps if the free_fn() function sleeps. The caller - * has to guarantee that all elements have been returned to the pool (ie: - * freed) prior to calling mempool_destroy(). - */ -void mempool_destroy(mempool_t *pool) -{ - /* Check for outstanding elements */ - BUG_ON(pool->curr_nr != pool->min_nr); - free_pool(pool); -} -EXPORT_SYMBOL(mempool_destroy); - -/** * mempool_alloc - allocate an element from a specific memory pool * @pool: pointer to the memory pool which was allocated via * mempool_create(). 
-- cgit v0.10.2 From 1ebb7044c9142c67d1d2b04d84010b4810a43fd8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 10 Jan 2012 15:08:28 -0800 Subject: mempool: fix first round failure behavior mempool modifies gfp_mask so that the backing allocator doesn't try too hard or trigger warning message when there's pool to fall back on. In addition, for the first try, it removes __GFP_WAIT and IO, so that it doesn't trigger reclaim or wait when allocation can be fulfilled from pool; however, when that allocation fails and pool is empty too, it waits for the pool to be replenished before retrying. Allocation which could have succeeded after a bit of reclaim has to wait on the reserved items and it's not like mempool doesn't retry with __GFP_WAIT and IO. It just does that *after* someone returns an element, pointlessly delaying things. Fix it by retrying immediately if the first round of allocation attempts w/o __GFP_WAIT and IO fails. [akpm@linux-foundation.org: shorten the lock hold time] Signed-off-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mempool.c b/mm/mempool.c index e3a802a..d904981 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -221,14 +221,23 @@ repeat_alloc: return element; } - /* We must not sleep in the GFP_ATOMIC case */ + /* + * We use gfp mask w/o __GFP_WAIT or IO for the first round. If + * alloc failed with that and @pool was empty, retry immediately. 
+ */ + if (gfp_temp != gfp_mask) { + spin_unlock_irqrestore(&pool->lock, flags); + gfp_temp = gfp_mask; + goto repeat_alloc; + } + + /* We must not sleep if !__GFP_WAIT */ if (!(gfp_mask & __GFP_WAIT)) { spin_unlock_irqrestore(&pool->lock, flags); return NULL; } /* Let's wait for someone else to return an element to @pool */ - gfp_temp = gfp_mask; init_wait(&wait); prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); -- cgit v0.10.2 From ea5768c74b8e0d6a866508fc6399d5ff958da5e3 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Tue, 10 Jan 2012 15:08:30 -0800 Subject: mm/hugetlb.c: avoid bogus counter of surplus huge page If we have to hand back the newly allocated huge page to page allocator, for any reason, the changed counter should be recovered. This affects only s390 at present. Signed-off-by: Hillf Danton Reviewed-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bb7dc40..ea8c3a4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -800,7 +800,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) if (page && arch_prepare_hugepage(page)) { __free_pages(page, huge_page_order(h)); - return NULL; + page = NULL; } spin_lock(&hugetlb_lock); -- cgit v0.10.2 From faed836a2371a96901057f310e436a09eded94fd Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Tue, 10 Jan 2012 15:08:32 -0800 Subject: mm/migrate.c: remove the unused macro lru_to_page lru_to_page is not used in mm/migrate.c. 
Signed-off-by: Wang Sheng-Hui Acked-by: Mel Gorman Acked-by: Kyungmin Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/migrate.c b/mm/migrate.c index 670bb89..89ea085 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -39,8 +39,6 @@ #include "internal.h" -#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) - /* * migrate_prep() needs to be called before we start compiling a list of pages * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is -- cgit v0.10.2 From 043bcbe5ec51e0478ef2b44acef17193e01d7f70 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 10 Jan 2012 15:08:33 -0800 Subject: mm: test PageSwapBacked in lumpy reclaim Lumpy reclaim does well to stop at a PageAnon when there's no swap, but better is to stop at any PageSwapBacked, which includes shmem/tmpfs too. Signed-off-by: Hugh Dickins Reviewed-by: KOSAKI Motohiro Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index b935e6f..8a4e767 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1166,7 +1166,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, * anon page which don't already have a swap slot is * pointless. */ - if (nr_swap_pages <= 0 && PageAnon(cursor_page) && + if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) && !PageSwapCache(cursor_page)) break; -- cgit v0.10.2 From 3770490ec82ca63d5fdcebeb95f2f68af2626357 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Tue, 10 Jan 2012 15:08:36 -0800 Subject: mm: vmscan: fix typo in isolating lru pages It is not the tag page but the cursor page that we should process, and it looks a typo. 
Signed-off-by: Hillf Danton Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Andrea Arcangeli Cc: David Rientjes Cc: Hugh Dickins Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index 8a4e767..26f4a8a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1173,7 +1173,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); mem_cgroup_del_lru(cursor_page); - nr_taken += hpage_nr_pages(page); + nr_taken += hpage_nr_pages(cursor_page); nr_lumpy_taken++; if (PageDirty(cursor_page)) nr_lumpy_dirty++; -- cgit v0.10.2 From db1aecafef58b5dda39c4228debe2c845e4a27ab Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 10 Jan 2012 15:08:39 -0800 Subject: mm/vmalloc.c: change void* into explicit vm_struct* vmap_area->private is void* but we don't use the field for various purposes but use only for vm_struct. So change it to a vm_struct* with a descriptive name to improve readability and type checking. 
Signed-off-by: Minchan Kim Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 21fdf46..877ca04 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -256,7 +256,7 @@ struct vmap_area { struct rb_node rb_node; /* address sorted rbtree */ struct list_head list; /* address sorted list */ struct list_head purge_list; /* "lazy purge" list */ - void *private; + struct vm_struct *vm; struct rcu_head rcu_head; }; @@ -1285,7 +1285,7 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, vm->addr = (void *)va->va_start; vm->size = va->va_end - va->va_start; vm->caller = caller; - va->private = vm; + va->vm = vm; va->flags |= VM_VM_AREA; } @@ -1408,7 +1408,7 @@ static struct vm_struct *find_vm_area(const void *addr) va = find_vmap_area((unsigned long)addr); if (va && va->flags & VM_VM_AREA) - return va->private; + return va->vm; return NULL; } @@ -1427,7 +1427,7 @@ struct vm_struct *remove_vm_area(const void *addr) va = find_vmap_area((unsigned long)addr); if (va && va->flags & VM_VM_AREA) { - struct vm_struct *vm = va->private; + struct vm_struct *vm = va->vm; if (!(vm->flags & VM_UNLIST)) { struct vm_struct *tmp, **p; -- cgit v0.10.2 From ed128fea3bcbce728c9c81b2e45ec3921911bfb6 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 10 Jan 2012 15:08:41 -0800 Subject: get_maintainers.pl: follow renames when looking up commit signers I happen to have had a commit to various network drivers since the big renaming/reorg which happened to drivers/net recently. This means that I now appear to be in the top few commit signers (by %age) for many of them so am getting sent all sorts of stuff and people who are involved with the driver are not. e.g. (to pick one at random): $ ./scripts/get_maintainer.pl -f drivers/net/ethernet/nvidia/forcedeth.c "David S. 
Miller" (commit_signer:5/7=71%) Ian Campbell (commit_signer:2/7=29%) Eric Dumazet (commit_signer:1/7=14%) Jeff Kirsher (commit_signer:1/7=14%) Jiri Pirko (commit_signer:1/7=14%) netdev@vger.kernel.org (open list:NETWORKING DRIVERS) linux-kernel@vger.kernel.org (open list) With the following patch the renames are followed and the result appears much more sensible: $ ./scripts/get_maintainer.pl -f drivers/net/ethernet/nvidia/forcedeth.c "David S. Miller" (commit_signer:31/34=91%) Joe Perches (commit_signer:11/34=32%) Szymon Janc (commit_signer:5/34=15%) Jiri Pirko (commit_signer:3/34=9%) Paul (commit_signer:2/34=6%) netdev@vger.kernel.org (open list:NETWORKING DRIVERS) linux-kernel@vger.kernel.org (open list) Signed-off-by: Ian Campbell Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 4594f33..f32a04c 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -95,7 +95,7 @@ my %VCS_cmds_git = ( "execute_cmd" => \&git_execute_cmd, "available" => '(which("git") ne "") && (-d ".git")', "find_signers_cmd" => - "git log --no-color --since=\$email_git_since " . + "git log --no-color --follow --since=\$email_git_since " . '--format="GitCommit: %H%n' . 'GitAuthor: %an <%ae>%n' . 'GitDate: %aD%n' . -- cgit v0.10.2 From 38f1b4c53826f3ac7e1b17c04a2dcdc802fb0785 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:08:42 -0800 Subject: MAINTAINERS: update various arm F: patterns Track renames and missing or deleted files. 
Signed-off-by: Joe Perches Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index 0ae41c9..2001356 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -914,7 +914,6 @@ M: Lennert Buytenhek M: Nicolas Pitre L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Odd Fixes -F: arch/arm/mach-loki/ F: arch/arm/mach-kirkwood/ F: arch/arm/mach-mv78xx0/ F: arch/arm/mach-orion5x/ @@ -1076,8 +1075,8 @@ L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/mach-s5pv210/mach-aquila.c F: arch/arm/mach-s5pv210/mach-goni.c -F: arch/arm/mach-exynos4/mach-universal_c210.c -F: arch/arm/mach-exynos4/mach-nuri.c +F: arch/arm/mach-exynos/mach-universal_c210.c +F: arch/arm/mach-exynos/mach-nuri.c ARM/SAMSUNG S5P SERIES FIMC SUPPORT M: Kyungmin Park @@ -1105,7 +1104,6 @@ M: Tomasz Stanislawski L: linux-arm-kernel@lists.infradead.org L: linux-media@vger.kernel.org S: Maintained -F: arch/arm/plat-s5p/dev-tv.c F: drivers/media/video/s5p-tv/ ARM/SHMOBILE ARM ARCHITECTURE @@ -1140,7 +1138,6 @@ L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.mcuos.com S: Maintained F: arch/arm/mach-w90x900/ -F: arch/arm/mach-nuc93x/ F: drivers/input/keyboard/w90p910_keypad.c F: drivers/input/touchscreen/w90p910_ts.c F: drivers/watchdog/nuc900_wdt.c @@ -6180,9 +6177,7 @@ M: Viresh Kumar W: http://www.st.com/spear S: Maintained F: arch/arm/mach-spear*/clock.c -F: arch/arm/mach-spear*/include/mach/clkdev.h F: arch/arm/plat-spear/clock.c -F: arch/arm/plat-spear/include/plat/clkdev.h F: arch/arm/plat-spear/include/plat/clock.h SPEAR PAD MULTIPLEXING SUPPORT -- cgit v0.10.2 From 77278d50e04bfb57076eb50cf8c5f898f933bf84 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:08:44 -0800 Subject: MAINTAINERS: update adp gpio F: patterns Commit c103de240439df ("gpio: reorganize drivers") renamed the files, update the patterns. 
Signed-off-by: Joe Perches Acked-by: Grant Likely Acked-by: Michael Hennerich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index 2001356..c506575 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -342,7 +342,7 @@ S: Supported F: drivers/mfd/adp5520.c F: drivers/video/backlight/adp5520_bl.c F: drivers/leds/leds-adp5520.c -F: drivers/gpio/adp5520-gpio.c +F: drivers/gpio/gpio-adp5520.c F: drivers/input/keyboard/adp5520-keys.c ADP5588 QWERTY KEYPAD AND IO EXPANDER DRIVER (ADP5588/ADP5587) @@ -351,7 +351,7 @@ L: device-drivers-devel@blackfin.uclinux.org W: http://wiki.analog.com/ADP5588 S: Supported F: drivers/input/keyboard/adp5588-keys.c -F: drivers/gpio/adp5588-gpio.c +F: drivers/gpio/gpio-adp5588.c ADP8860 BACKLIGHT DRIVER (ADP8860/ADP8861/ADP8863) M: Michael Hennerich -- cgit v0.10.2 From 72dbb7051334c37c9210cd735684c304da8a5e85 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:08:46 -0800 Subject: MAINTAINERS: update bt8xx gpio F: patterns Commit c103de240439d ("gpio: reorganize drivers") renamed the file, update the pattern. Signed-off-by: Joe Perches Cc: Grant Likely Cc: Michael Buesch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index c506575..04716db 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1618,7 +1618,7 @@ BT8XXGPIO DRIVER M: Michael Buesch W: http://bu3sch.de/btgpio.php S: Maintained -F: drivers/gpio/bt8xxgpio.c +F: drivers/gpio/gpio-bt8xx.c BTRFS FILE SYSTEM M: Chris Mason -- cgit v0.10.2 From 25b8d2b4fc4fd9f9ae7f95ce76bc47712c99809e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:08:49 -0800 Subject: MAINTAINERS: update marvell ccic F: patterns Commit f8fc729870ee ("[media] marvell-cam: Move cafe-ccic into its own directory") moved the files, update the pattern. 
Signed-off-by: Joe Perches Cc: Jonathan Corbet Acked-by: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index 04716db..75da19d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1659,7 +1659,7 @@ L: linux-media@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-2.6.git S: Maintained F: Documentation/video4linux/cafe_ccic -F: drivers/media/video/cafe_ccic* +F: drivers/media/video/marvell-ccic/ CAIF NETWORK LAYER M: Sjur Braendeland -- cgit v0.10.2 From d8f663561b185101c5b97b55d0f6aad49671d4e3 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:08:51 -0800 Subject: MAINTAINERS: update mfd F: patterns commit 8959e74399c ("mfd: Delete ab3550 driver") removed the driver, update the patterns. Signed-off-by: Joe Perches Acked-by: Linus Walleij Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index 75da19d..1d20e3f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1169,7 +1169,6 @@ L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/mach-ux500/ F: drivers/dma/ste_dma40* -F: drivers/mfd/ab3550* F: drivers/mfd/abx500* F: drivers/mfd/ab8500* F: drivers/mfd/stmpe* -- cgit v0.10.2 From d4a45787afd22316dc9ee9129a58796100621cb5 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:08:54 -0800 Subject: MAINTAINERS: update sdhci F: patterns commit 38576af1f8c ("mmc: sdhci: make sdhci-of device drivers self registered") moved the files around. Update the patterns. 
Signed-off-by: Joe Perches Cc: Shawn Guo Cc: Chris Ball Acked-by: Anton Vorontsov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index 1d20e3f..f72a3ef 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5791,13 +5791,14 @@ L: linux-mmc@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/cjb/mmc.git S: Maintained F: drivers/mmc/host/sdhci.* +F: drivers/mmc/host/sdhci-pltfm.[ch] SECURE DIGITAL HOST CONTROLLER INTERFACE, OPEN FIRMWARE BINDINGS (SDHCI-OF) M: Anton Vorontsov L: linuxppc-dev@lists.ozlabs.org L: linux-mmc@vger.kernel.org S: Maintained -F: drivers/mmc/host/sdhci-of.* +F: drivers/mmc/host/sdhci-pltfm.[ch] SECURE DIGITAL HOST CONTROLLER INTERFACE (SDHCI) SAMSUNG DRIVER M: Ben Dooks -- cgit v0.10.2 From 0f04e2aa0cbe10f06326cd7f98aaf0012d9c6038 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:08:56 -0800 Subject: MAINTAINERS: update tulip F: patterns commit a88394cfb58 ("ewrk3/tulip: Move the DEC - Tulip drivers") moved the files, update the patterns. 
Signed-off-by: Joe Perches Acked-by: Grant Grundler Cc: Jeff Kirsher Cc: Tobias Ringstrom Cc: Grant Grundler Cc: David Davies Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index f72a3ef..f952e00 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2096,7 +2096,7 @@ DAVICOM FAST ETHERNET (DMFE) NETWORK DRIVER L: netdev@vger.kernel.org S: Orphan F: Documentation/networking/dmfe.txt -F: drivers/net/ethernet/tulip/dmfe.c +F: drivers/net/ethernet/dec/tulip/dmfe.c DC390/AM53C974 SCSI driver M: Kurt Garloff @@ -6640,7 +6640,7 @@ TULIP NETWORK DRIVERS M: Grant Grundler L: netdev@vger.kernel.org S: Maintained -F: drivers/net/ethernet/tulip/ +F: drivers/net/ethernet/dec/tulip/ TUN/TAP driver M: Maxim Krasnyansky -- cgit v0.10.2 From a31a96ad7206df554f1d1571b986abbe742d8b8e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:08:58 -0800 Subject: MAINTAINERS: update greth F: patterns commit 1fe003fd424 ("greth: Move the Aeroflex Gaisler driver") moved the files, update the patterns. Signed-off-by: Joe Perches Cc: Kristoffer Glembo Cc: Jeff Kirsher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index f952e00..abb632e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2906,7 +2906,7 @@ GRETH 10/100/1G Ethernet MAC device driver M: Kristoffer Glembo L: netdev@vger.kernel.org S: Maintained -F: drivers/net/greth* +F: drivers/net/ethernet/aeroflex/ GSPCA FINEPIX SUBDRIVER M: Frank Zago -- cgit v0.10.2 From 19c90aa678a166381609af574d2a993568f5f5bb Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:09:00 -0800 Subject: MAINTAINERS: update encrypted-keys F: patterns commit 61cf45d0199 ("encrypted-keys: create encrypted-keys directory") moved the files, update the patterns. 
Signed-off-by: Joe Perches Cc: Mimi Zohar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index abb632e..dbf1676 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3856,8 +3856,7 @@ L: keyrings@linux-nfs.org S: Supported F: Documentation/security/keys-trusted-encrypted.txt F: include/keys/encrypted-type.h -F: security/keys/encrypted.c -F: security/keys/encrypted.h +F: security/keys/encrypted-keys/ KGDB / KDB /debug_core M: Jason Wessel -- cgit v0.10.2 From b2b0186d0f98e7fc2c21a2a3514223ab248e46f6 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:09:01 -0800 Subject: MAINTAINERS: staging: media: update F: patterns commit 4860c73804c ("staging: Move media drivers to staging/media") moved the files, update the F: patterns. Signed-off-by: Joe Perches Acked-by: Mauro Carvalho Chehab Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index dbf1676..4308317 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6300,7 +6300,7 @@ STAGING - LIRC (LINUX INFRARED REMOTE CONTROL) DRIVERS M: Jarod Wilson W: http://www.lirc.org/ S: Odd Fixes -F: drivers/staging/lirc/ +F: drivers/staging/media/lirc/ STAGING - NVIDIA COMPLIANT EMBEDDED CONTROLLER INTERFACE (nvec) M: Julian Andres Klode @@ -6336,7 +6336,7 @@ F: drivers/staging/sm7xx/ STAGING - SOFTLOGIC 6x10 MPEG CODEC M: Ben Collins S: Odd Fixes -F: drivers/staging/solo6x10/ +F: drivers/staging/media/solo6x10/ STAGING - SPEAKUP CONSOLE SPEECH DRIVER M: William Hubbs -- cgit v0.10.2 From 8460241e4477db699135ad0521e1293258a8baaa Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:09:04 -0800 Subject: MAINTAINERS: serial:blackfin: update F: pattern commit 0c6967b5a0 ("serial:blackfin: rename Blackfin serial driver to bfin_uart.c") renamed the file, update the pattern. 
Signed-off-by: Joe Perches Acked-by: Sonic Zhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index 4308317..7510b63 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1487,7 +1487,7 @@ M: Sonic Zhang L: uclinux-dist-devel@blackfin.uclinux.org W: http://blackfin.uclinux.org S: Supported -F: drivers/tty/serial/bfin_5xx.c +F: drivers/tty/serial/bfin_uart.c BLACKFIN WATCHDOG DRIVER M: Mike Frysinger -- cgit v0.10.2 From 9df92e6c770e2709f6e3080a4a82d71953267f05 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:09:06 -0800 Subject: MAINTAINERS: spi: update F: patterns commit ca632f55669 ("spi: reorganize drivers") renamed the files, update the F: patterns. Signed-off-by: Joe Perches Acked-by: Grant Likely Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index 7510b63..e4eeb9b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1144,7 +1144,7 @@ F: drivers/watchdog/nuc900_wdt.c F: drivers/net/ethernet/nuvoton/w90p910_ether.c F: drivers/mtd/nand/nuc900_nand.c F: drivers/rtc/rtc-nuc900.c -F: drivers/spi/spi_nuc900.c +F: drivers/spi/spi-nuc900.c F: drivers/usb/host/ehci-w90x900.c F: drivers/video/nuc900fb.c @@ -1348,7 +1348,7 @@ F: drivers/net/ethernet/cadence/ ATMEL SPI DRIVER M: Nicolas Ferre S: Supported -F: drivers/spi/atmel_spi.* +F: drivers/spi/spi-atmel.* ATMEL USBA UDC DRIVER M: Nicolas Ferre @@ -5308,7 +5308,7 @@ T: git git://git.linaro.org/people/ycmiao/pxa-linux.git S: Maintained F: arch/arm/mach-pxa/ F: drivers/pcmcia/pxa2xx* -F: drivers/spi/pxa2xx* +F: drivers/spi/spi-pxa2xx* F: drivers/usb/gadget/pxa2* F: include/sound/pxa2xx-lib.h F: sound/arm/pxa* -- cgit v0.10.2 From 89d07767d051c9713b4d79c387c1eadd085c30f8 Mon Sep 17 00:00:00 2001 From: Kyungmin Park Date: Tue, 10 Jan 2012 15:09:09 -0800 Subject: devfreq: add devfreq maintainer entry Now that devfreq has been merged into mainline, add a maintainer entry for it.
Signed-off-by: Kyungmin Park Cc: Kevin Hilman Cc: MyungJoo Ham Acked-by: Rafael J. Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index e4eeb9b..cf6b2d8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2169,6 +2169,13 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/balbi/usb.git S: Maintained F: drivers/usb/dwc3/ +DEVICE FREQUENCY (DEVFREQ) +M: MyungJoo Ham +M: Kyungmin Park +L: linux-kernel@vger.kernel.org +S: Maintained +F: drivers/devfreq/ + DEVICE NUMBER REGISTRY M: Torben Mathiasen W: http://lanana.org/docs/device-list/index.html -- cgit v0.10.2 From 3ed0c15fd1032c6a75aba804a200d4acc5aeb72e Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Tue, 10 Jan 2012 15:09:10 -0800 Subject: backlight: remove ADX backlight device support Support for the Avionic Design Xanthos backlight device got added in commit 3b96ea9ef8 ("backlight: Add support for the Avionic Design Xanthos backlight device."). That support depends on ARCH_PXA_ADX. The code that should have provided that Kconfig symbol never got submitted. It has never been possible to even build this driver. Remove it. Signed-off-by: Paul Bolle Acked-by: Thierry Reding Cc: Richard Purdie Cc: Wim Van Sebroeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/video/backlight/Kconfig b/drivers/video/backlight/Kconfig index 278aeaa..681b369 100644 --- a/drivers/video/backlight/Kconfig +++ b/drivers/video/backlight/Kconfig @@ -280,14 +280,6 @@ config BACKLIGHT_WM831X If you have a backlight driven by the ISINK and DCDC of a WM831x PMIC say y to enable the backlight driver for it. -config BACKLIGHT_ADX - tristate "Avionic Design Xanthos Backlight Driver" - depends on ARCH_PXA_ADX - default y - help - Say Y to enable the backlight driver on Avionic Design Xanthos-based - boards. 
- config BACKLIGHT_ADP5520 tristate "Backlight Driver for ADP5520/ADP5501 using WLED" depends on PMIC_ADP5520 diff --git a/drivers/video/backlight/Makefile b/drivers/video/backlight/Makefile index fdd1fc4..af5cf65 100644 --- a/drivers/video/backlight/Makefile +++ b/drivers/video/backlight/Makefile @@ -32,7 +32,6 @@ obj-$(CONFIG_BACKLIGHT_APPLE) += apple_bl.o obj-$(CONFIG_BACKLIGHT_TOSA) += tosa_bl.o obj-$(CONFIG_BACKLIGHT_SAHARA) += kb3886_bl.o obj-$(CONFIG_BACKLIGHT_WM831X) += wm831x_bl.o -obj-$(CONFIG_BACKLIGHT_ADX) += adx_bl.o obj-$(CONFIG_BACKLIGHT_ADP5520) += adp5520_bl.o obj-$(CONFIG_BACKLIGHT_ADP8860) += adp8860_bl.o obj-$(CONFIG_BACKLIGHT_ADP8870) += adp8870_bl.o diff --git a/drivers/video/backlight/adx_bl.c b/drivers/video/backlight/adx_bl.c deleted file mode 100644 index c861c41..0000000 --- a/drivers/video/backlight/adx_bl.c +++ /dev/null @@ -1,182 +0,0 @@ -/* - * linux/drivers/video/backlight/adx.c - * - * Copyright (C) 2009 Avionic Design GmbH - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - * Written by Thierry Reding - */ - -#include -#include -#include -#include -#include -#include - -/* register definitions */ -#define ADX_BACKLIGHT_CONTROL 0x00 -#define ADX_BACKLIGHT_CONTROL_ENABLE (1 << 0) -#define ADX_BACKLIGHT_BRIGHTNESS 0x08 -#define ADX_BACKLIGHT_STATUS 0x10 -#define ADX_BACKLIGHT_ERROR 0x18 - -struct adxbl { - void __iomem *base; -}; - -static int adx_backlight_update_status(struct backlight_device *bldev) -{ - struct adxbl *bl = bl_get_data(bldev); - u32 value; - - value = bldev->props.brightness; - writel(value, bl->base + ADX_BACKLIGHT_BRIGHTNESS); - - value = readl(bl->base + ADX_BACKLIGHT_CONTROL); - - if (bldev->props.state & BL_CORE_FBBLANK) - value &= ~ADX_BACKLIGHT_CONTROL_ENABLE; - else - value |= ADX_BACKLIGHT_CONTROL_ENABLE; - - writel(value, bl->base + ADX_BACKLIGHT_CONTROL); - - return 0; -} - -static int adx_backlight_get_brightness(struct backlight_device *bldev) -{ - struct adxbl *bl = bl_get_data(bldev); - u32 brightness; - - brightness = readl(bl->base + ADX_BACKLIGHT_BRIGHTNESS); - return brightness & 0xff; -} - -static int adx_backlight_check_fb(struct backlight_device *bldev, struct fb_info *fb) -{ - return 1; -} - -static const struct backlight_ops adx_backlight_ops = { - .options = 0, - .update_status = adx_backlight_update_status, - .get_brightness = adx_backlight_get_brightness, - .check_fb = adx_backlight_check_fb, -}; - -static int __devinit adx_backlight_probe(struct platform_device *pdev) -{ - struct backlight_properties props; - struct backlight_device *bldev; - struct resource *res; - struct adxbl *bl; - int ret = 0; - - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) { - ret = -ENXIO; - goto out; - } - - res = devm_request_mem_region(&pdev->dev, res->start, - resource_size(res), res->name); - if (!res) { - ret = -ENXIO; - goto out; - } - - bl = devm_kzalloc(&pdev->dev, sizeof(*bl), GFP_KERNEL); - if (!bl) { - ret = -ENOMEM; - goto out; - } - - bl->base = 
devm_ioremap_nocache(&pdev->dev, res->start, - resource_size(res)); - if (!bl->base) { - ret = -ENXIO; - goto out; - } - - memset(&props, 0, sizeof(struct backlight_properties)); - props.type = BACKLIGHT_RAW; - props.max_brightness = 0xff; - bldev = backlight_device_register(dev_name(&pdev->dev), &pdev->dev, - bl, &adx_backlight_ops, &props); - if (IS_ERR(bldev)) { - ret = PTR_ERR(bldev); - goto out; - } - - bldev->props.brightness = 0xff; - bldev->props.power = FB_BLANK_UNBLANK; - - platform_set_drvdata(pdev, bldev); - -out: - return ret; -} - -static int __devexit adx_backlight_remove(struct platform_device *pdev) -{ - struct backlight_device *bldev; - int ret = 0; - - bldev = platform_get_drvdata(pdev); - bldev->props.power = FB_BLANK_UNBLANK; - bldev->props.brightness = 0xff; - backlight_update_status(bldev); - backlight_device_unregister(bldev); - platform_set_drvdata(pdev, NULL); - - return ret; -} - -#ifdef CONFIG_PM -static int adx_backlight_suspend(struct platform_device *pdev, - pm_message_t state) -{ - return 0; -} - -static int adx_backlight_resume(struct platform_device *pdev) -{ - return 0; -} -#else -#define adx_backlight_suspend NULL -#define adx_backlight_resume NULL -#endif - -static struct platform_driver adx_backlight_driver = { - .probe = adx_backlight_probe, - .remove = __devexit_p(adx_backlight_remove), - .suspend = adx_backlight_suspend, - .resume = adx_backlight_resume, - .driver = { - .name = "adx-backlight", - .owner = THIS_MODULE, - }, -}; - -static int __init adx_backlight_init(void) -{ - return platform_driver_register(&adx_backlight_driver); -} - -static void __exit adx_backlight_exit(void) -{ - platform_driver_unregister(&adx_backlight_driver); -} - -module_init(adx_backlight_init); -module_exit(adx_backlight_exit); - -MODULE_AUTHOR("Thierry Reding "); -MODULE_DESCRIPTION("Avionic Design Xanthos Backlight Driver"); -MODULE_LICENSE("GPL v2"); -- cgit v0.10.2 From 81178e021689bf86c328f144aa0f0e1b50f5e94c Mon Sep 17 00:00:00 2001 From: 
Axel Lin Date: Tue, 10 Jan 2012 15:09:11 -0800 Subject: backlight: convert drivers/video/backlight/* to use module_platform_driver() Convert the drivers in drivers/video/backlight/* to use the module_platform_driver() macro which makes the code smaller and a bit simpler. Signed-off-by: Axel Lin Acked-by: Haojian Zhuang Acked-by: H Hartley Sweeten [ep93xx_bl.c] Cc: Mike Rapoport Cc: Richard Purdie Acked-by: Michael Hennerich Acked-by: Mark Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/video/backlight/88pm860x_bl.c b/drivers/video/backlight/88pm860x_bl.c index 1105fa1..a1376dc 100644 --- a/drivers/video/backlight/88pm860x_bl.c +++ b/drivers/video/backlight/88pm860x_bl.c @@ -270,17 +270,7 @@ static struct platform_driver pm860x_backlight_driver = { .remove = pm860x_backlight_remove, }; -static int __init pm860x_backlight_init(void) -{ - return platform_driver_register(&pm860x_backlight_driver); -} -module_init(pm860x_backlight_init); - -static void __exit pm860x_backlight_exit(void) -{ - platform_driver_unregister(&pm860x_backlight_driver); -} -module_exit(pm860x_backlight_exit); +module_platform_driver(pm860x_backlight_driver); MODULE_DESCRIPTION("Backlight Driver for Marvell Semiconductor 88PM8606"); MODULE_AUTHOR("Haojian Zhuang "); diff --git a/drivers/video/backlight/adp5520_bl.c b/drivers/video/backlight/adp5520_bl.c index dfb763e..2e630bf 100644 --- a/drivers/video/backlight/adp5520_bl.c +++ b/drivers/video/backlight/adp5520_bl.c @@ -384,17 +384,7 @@ static struct platform_driver adp5520_bl_driver = { .resume = adp5520_bl_resume, }; -static int __init adp5520_bl_init(void) -{ - return platform_driver_register(&adp5520_bl_driver); -} -module_init(adp5520_bl_init); - -static void __exit adp5520_bl_exit(void) -{ - platform_driver_unregister(&adp5520_bl_driver); -} -module_exit(adp5520_bl_exit); +module_platform_driver(adp5520_bl_driver); MODULE_AUTHOR("Michael Hennerich "); MODULE_DESCRIPTION("ADP5520(01) Backlight Driver"); 
diff --git a/drivers/video/backlight/da903x_bl.c b/drivers/video/backlight/da903x_bl.c index d68f14b..abb4a06 100644 --- a/drivers/video/backlight/da903x_bl.c +++ b/drivers/video/backlight/da903x_bl.c @@ -199,17 +199,7 @@ static struct platform_driver da903x_backlight_driver = { .remove = da903x_backlight_remove, }; -static int __init da903x_backlight_init(void) -{ - return platform_driver_register(&da903x_backlight_driver); -} -module_init(da903x_backlight_init); - -static void __exit da903x_backlight_exit(void) -{ - platform_driver_unregister(&da903x_backlight_driver); -} -module_exit(da903x_backlight_exit); +module_platform_driver(da903x_backlight_driver); MODULE_DESCRIPTION("Backlight Driver for Dialog Semiconductor DA9030/DA9034"); MODULE_AUTHOR("Eric Miao " diff --git a/drivers/video/backlight/ep93xx_bl.c b/drivers/video/backlight/ep93xx_bl.c index c74a6f4..32b9167 100644 --- a/drivers/video/backlight/ep93xx_bl.c +++ b/drivers/video/backlight/ep93xx_bl.c @@ -144,17 +144,7 @@ static struct platform_driver ep93xxbl_driver = { .resume = ep93xxbl_resume, }; -static int __init ep93xxbl_init(void) -{ - return platform_driver_register(&ep93xxbl_driver); -} -module_init(ep93xxbl_init); - -static void __exit ep93xxbl_exit(void) -{ - platform_driver_unregister(&ep93xxbl_driver); -} -module_exit(ep93xxbl_exit); +module_platform_driver(ep93xxbl_driver); MODULE_DESCRIPTION("EP93xx Backlight Driver"); MODULE_AUTHOR("H Hartley Sweeten "); diff --git a/drivers/video/backlight/generic_bl.c b/drivers/video/backlight/generic_bl.c index adb1914..9ce6170 100644 --- a/drivers/video/backlight/generic_bl.c +++ b/drivers/video/backlight/generic_bl.c @@ -132,18 +132,7 @@ static struct platform_driver genericbl_driver = { }, }; -static int __init genericbl_init(void) -{ - return platform_driver_register(&genericbl_driver); -} - -static void __exit genericbl_exit(void) -{ - platform_driver_unregister(&genericbl_driver); -} - -module_init(genericbl_init); -module_exit(genericbl_exit); 
+module_platform_driver(genericbl_driver); MODULE_AUTHOR("Richard Purdie "); MODULE_DESCRIPTION("Generic Backlight Driver"); diff --git a/drivers/video/backlight/jornada720_bl.c b/drivers/video/backlight/jornada720_bl.c index de65d80..2f8af5d 100644 --- a/drivers/video/backlight/jornada720_bl.c +++ b/drivers/video/backlight/jornada720_bl.c @@ -147,19 +147,8 @@ static struct platform_driver jornada_bl_driver = { }, }; -static int __init jornada_bl_init(void) -{ - return platform_driver_register(&jornada_bl_driver); -} - -static void __exit jornada_bl_exit(void) -{ - platform_driver_unregister(&jornada_bl_driver); -} +module_platform_driver(jornada_bl_driver); MODULE_AUTHOR("Kristoffer Ericson "); MODULE_DESCRIPTION("HP Jornada 710/720/728 Backlight driver"); MODULE_LICENSE("GPL"); - -module_init(jornada_bl_init); -module_exit(jornada_bl_exit); diff --git a/drivers/video/backlight/jornada720_lcd.c b/drivers/video/backlight/jornada720_lcd.c index d2ff658..22d231a 100644 --- a/drivers/video/backlight/jornada720_lcd.c +++ b/drivers/video/backlight/jornada720_lcd.c @@ -135,19 +135,8 @@ static struct platform_driver jornada_lcd_driver = { }, }; -static int __init jornada_lcd_init(void) -{ - return platform_driver_register(&jornada_lcd_driver); -} - -static void __exit jornada_lcd_exit(void) -{ - platform_driver_unregister(&jornada_lcd_driver); -} +module_platform_driver(jornada_lcd_driver); MODULE_AUTHOR("Kristoffer Ericson "); MODULE_DESCRIPTION("HP Jornada 710/720/728 LCD driver"); MODULE_LICENSE("GPL"); - -module_init(jornada_lcd_init); -module_exit(jornada_lcd_exit); diff --git a/drivers/video/backlight/max8925_bl.c b/drivers/video/backlight/max8925_bl.c index 7bbc802..c915e3b 100644 --- a/drivers/video/backlight/max8925_bl.c +++ b/drivers/video/backlight/max8925_bl.c @@ -188,17 +188,7 @@ static struct platform_driver max8925_backlight_driver = { .remove = __devexit_p(max8925_backlight_remove), }; -static int __init max8925_backlight_init(void) -{ - return 
platform_driver_register(&max8925_backlight_driver); -} -module_init(max8925_backlight_init); - -static void __exit max8925_backlight_exit(void) -{ - platform_driver_unregister(&max8925_backlight_driver); -}; -module_exit(max8925_backlight_exit); +module_platform_driver(max8925_backlight_driver); MODULE_DESCRIPTION("Backlight Driver for Maxim MAX8925"); MODULE_AUTHOR("Haojian Zhuang "); diff --git a/drivers/video/backlight/omap1_bl.c b/drivers/video/backlight/omap1_bl.c index 08d26a7..d8cde27 100644 --- a/drivers/video/backlight/omap1_bl.c +++ b/drivers/video/backlight/omap1_bl.c @@ -195,18 +195,7 @@ static struct platform_driver omapbl_driver = { }, }; -static int __init omapbl_init(void) -{ - return platform_driver_register(&omapbl_driver); -} - -static void __exit omapbl_exit(void) -{ - platform_driver_unregister(&omapbl_driver); -} - -module_init(omapbl_init); -module_exit(omapbl_exit); +module_platform_driver(omapbl_driver); MODULE_AUTHOR("Andrzej Zaborowski "); MODULE_DESCRIPTION("OMAP LCD Backlight driver"); diff --git a/drivers/video/backlight/pcf50633-backlight.c b/drivers/video/backlight/pcf50633-backlight.c index ef5628d..13e88b7 100644 --- a/drivers/video/backlight/pcf50633-backlight.c +++ b/drivers/video/backlight/pcf50633-backlight.c @@ -173,17 +173,7 @@ static struct platform_driver pcf50633_bl_driver = { }, }; -static int __init pcf50633_bl_init(void) -{ - return platform_driver_register(&pcf50633_bl_driver); -} -module_init(pcf50633_bl_init); - -static void __exit pcf50633_bl_exit(void) -{ - platform_driver_unregister(&pcf50633_bl_driver); -} -module_exit(pcf50633_bl_exit); +module_platform_driver(pcf50633_bl_driver); MODULE_AUTHOR("Lars-Peter Clausen "); MODULE_DESCRIPTION("PCF50633 backlight driver"); diff --git a/drivers/video/backlight/platform_lcd.c b/drivers/video/backlight/platform_lcd.c index 302330a..187da59 100644 --- a/drivers/video/backlight/platform_lcd.c +++ b/drivers/video/backlight/platform_lcd.c @@ -157,18 +157,7 @@ static struct 
platform_driver platform_lcd_driver = { .resume = platform_lcd_resume, }; -static int __init platform_lcd_init(void) -{ - return platform_driver_register(&platform_lcd_driver); -} - -static void __exit platform_lcd_cleanup(void) -{ - platform_driver_unregister(&platform_lcd_driver); -} - -module_init(platform_lcd_init); -module_exit(platform_lcd_cleanup); +module_platform_driver(platform_lcd_driver); MODULE_AUTHOR("Ben Dooks "); MODULE_LICENSE("GPL v2"); diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c index 8b5b2a4..b811e8f 100644 --- a/drivers/video/backlight/pwm_bl.c +++ b/drivers/video/backlight/pwm_bl.c @@ -207,17 +207,7 @@ static struct platform_driver pwm_backlight_driver = { .resume = pwm_backlight_resume, }; -static int __init pwm_backlight_init(void) -{ - return platform_driver_register(&pwm_backlight_driver); -} -module_init(pwm_backlight_init); - -static void __exit pwm_backlight_exit(void) -{ - platform_driver_unregister(&pwm_backlight_driver); -} -module_exit(pwm_backlight_exit); +module_platform_driver(pwm_backlight_driver); MODULE_DESCRIPTION("PWM based Backlight Driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/video/backlight/wm831x_bl.c b/drivers/video/backlight/wm831x_bl.c index fbe9e93..4e915f5 100644 --- a/drivers/video/backlight/wm831x_bl.c +++ b/drivers/video/backlight/wm831x_bl.c @@ -236,17 +236,7 @@ static struct platform_driver wm831x_backlight_driver = { .remove = wm831x_backlight_remove, }; -static int __init wm831x_backlight_init(void) -{ - return platform_driver_register(&wm831x_backlight_driver); -} -module_init(wm831x_backlight_init); - -static void __exit wm831x_backlight_exit(void) -{ - platform_driver_unregister(&wm831x_backlight_driver); -} -module_exit(wm831x_backlight_exit); +module_platform_driver(wm831x_backlight_driver); MODULE_DESCRIPTION("Backlight Driver for WM831x PMICs"); MODULE_AUTHOR("Mark Brown Date: Tue, 10 Jan 2012 15:09:15 -0800 Subject: backlight/ld9040.c: regulator control in 
the driver This patch adds regulator power control to the driver. Previously, the ld9040 driver's power on/off sequence was controlled by a callback function in the board file. With this change, there is no longer any need to register an lcd power on/off callback function in the board file. Signed-off-by: Donghwa Lee Signed-off-by: Kyungmin Park Signed-off-by: Inki Dae Cc: Richard Purdie Cc: Florian Tobias Schandinat Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/video/backlight/ld9040.c b/drivers/video/backlight/ld9040.c index da9a5ce..78dafc0 100644 --- a/drivers/video/backlight/ld9040.c +++ b/drivers/video/backlight/ld9040.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "ld9040_gamma.h" @@ -53,8 +54,51 @@ struct ld9040 { struct lcd_device *ld; struct backlight_device *bd; struct lcd_platform_data *lcd_pd; + + struct mutex lock; + bool enabled; +}; + +static struct regulator_bulk_data supplies[] = { + { .supply = "vdd3", }, + { .supply = "vci", }, }; +static void ld9040_regulator_enable(struct ld9040 *lcd) +{ + int ret = 0; + struct lcd_platform_data *pd = NULL; + + pd = lcd->lcd_pd; + mutex_lock(&lcd->lock); + if (!lcd->enabled) { + ret = regulator_bulk_enable(ARRAY_SIZE(supplies), supplies); + if (ret) + goto out; + + lcd->enabled = true; + } + mdelay(pd->power_on_delay); +out: + mutex_unlock(&lcd->lock); +} + +static void ld9040_regulator_disable(struct ld9040 *lcd) +{ + int ret = 0; + + mutex_lock(&lcd->lock); + if (lcd->enabled) { + ret = regulator_bulk_disable(ARRAY_SIZE(supplies), supplies); + if (ret) + goto out; + + lcd->enabled = false; + } +out: + mutex_unlock(&lcd->lock); +} + static const unsigned short seq_swreset[] = { 0x01, COMMAND_ONLY, ENDDEF, 0x00 @@ -532,13 +576,8 @@ static int ld9040_power_on(struct ld9040 *lcd) return -EFAULT; } - if (!pd->power_on) { - dev_err(lcd->dev, "power_on is NULL.\n"); - return -EFAULT; - } else { - pd->power_on(lcd->ld, 1); - mdelay(pd->power_on_delay); - } + /* lcd power on */ +
ld9040_regulator_enable(lcd); if (!pd->reset) { dev_err(lcd->dev, "reset is NULL.\n"); @@ -582,11 +621,8 @@ static int ld9040_power_off(struct ld9040 *lcd) mdelay(pd->power_off_delay); - if (!pd->power_on) { - dev_err(lcd->dev, "power_on is NULL.\n"); - return -EFAULT; - } else - pd->power_on(lcd->ld, 0); + /* lcd power off */ + ld9040_regulator_disable(lcd); return 0; } @@ -693,6 +729,14 @@ static int ld9040_probe(struct spi_device *spi) goto out_free_lcd; } + mutex_init(&lcd->lock); + + ret = regulator_bulk_get(lcd->dev, ARRAY_SIZE(supplies), supplies); + if (ret) { + dev_err(lcd->dev, "Failed to get regulators: %d\n", ret); + goto out_free_lcd; + } + ld = lcd_device_register("ld9040", &spi->dev, lcd, &ld9040_lcd_ops); if (IS_ERR(ld)) { ret = PTR_ERR(ld); @@ -739,6 +783,8 @@ static int ld9040_probe(struct spi_device *spi) out_unregister_lcd: lcd_device_unregister(lcd->ld); out_free_lcd: + regulator_bulk_free(ARRAY_SIZE(supplies), supplies); + kfree(lcd); return ret; } @@ -750,6 +796,7 @@ static int __devexit ld9040_remove(struct spi_device *spi) ld9040_power(lcd, FB_BLANK_POWERDOWN); backlight_device_unregister(lcd->bd); lcd_device_unregister(lcd->ld); + regulator_bulk_free(ARRAY_SIZE(supplies), supplies); kfree(lcd); return 0; -- cgit v0.10.2 From 1cfc6fee34a4343d79357c46722eb840fbc04f46 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Tue, 10 Jan 2012 15:09:18 -0800 Subject: drivers/video/backlight/ep93xx_bl.c: remove duplicated header include module.h is included twice. 
Signed-off-by: Jingoo Han Acked-by: H Hartley Sweeten Cc: Ryan Mallon Cc: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/video/backlight/ep93xx_bl.c b/drivers/video/backlight/ep93xx_bl.c index 32b9167..b62b8b9 100644 --- a/drivers/video/backlight/ep93xx_bl.c +++ b/drivers/video/backlight/ep93xx_bl.c @@ -13,7 +13,6 @@ #include #include -#include #include #include #include -- cgit v0.10.2 From 66655760bf38861299e3c8196f5303f886b0eef9 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Tue, 10 Jan 2012 15:09:19 -0800 Subject: backlight: use kstrtoul() The usage of simple_strtoul() or strict_strtoul() is not preferred. Thus, kstrtoul should be used. This patch also fixes checkpatch error as follows: ERROR: space required after that ',' (ctx:VxV) Signed-off-by: Jingoo Han Cc: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c index 7363c1b..bf5b1ec 100644 --- a/drivers/video/backlight/backlight.c +++ b/drivers/video/backlight/backlight.c @@ -102,7 +102,7 @@ static void backlight_generate_event(struct backlight_device *bd, } static ssize_t backlight_show_power(struct device *dev, - struct device_attribute *attr,char *buf) + struct device_attribute *attr, char *buf) { struct backlight_device *bd = to_backlight_device(dev); @@ -116,7 +116,7 @@ static ssize_t backlight_store_power(struct device *dev, struct backlight_device *bd = to_backlight_device(dev); unsigned long power; - rc = strict_strtoul(buf, 0, &power); + rc = kstrtoul(buf, 0, &power); if (rc) return rc; @@ -150,7 +150,7 @@ static ssize_t backlight_store_brightness(struct device *dev, struct backlight_device *bd = to_backlight_device(dev); unsigned long brightness; - rc = strict_strtoul(buf, 0, &brightness); + rc = kstrtoul(buf, 0, &brightness); if (rc) return rc; diff --git a/drivers/video/backlight/lcd.c b/drivers/video/backlight/lcd.c index 71a11ca..79c1b0d 
100644 --- a/drivers/video/backlight/lcd.c +++ b/drivers/video/backlight/lcd.c @@ -97,19 +97,16 @@ static ssize_t lcd_store_power(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int rc = -ENXIO; - char *endp; struct lcd_device *ld = to_lcd_device(dev); - int power = simple_strtoul(buf, &endp, 0); - size_t size = endp - buf; + unsigned long power; - if (isspace(*endp)) - size++; - if (size != count) - return -EINVAL; + rc = kstrtoul(buf, 0, &power); + if (rc) + return rc; mutex_lock(&ld->ops_lock); if (ld->ops && ld->ops->set_power) { - pr_debug("lcd: set power to %d\n", power); + pr_debug("lcd: set power to %lu\n", power); ld->ops->set_power(ld, power); rc = count; } @@ -136,19 +133,16 @@ static ssize_t lcd_store_contrast(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int rc = -ENXIO; - char *endp; struct lcd_device *ld = to_lcd_device(dev); - int contrast = simple_strtoul(buf, &endp, 0); - size_t size = endp - buf; + unsigned long contrast; - if (isspace(*endp)) - size++; - if (size != count) - return -EINVAL; + rc = kstrtoul(buf, 0, &contrast); + if (rc) + return rc; mutex_lock(&ld->ops_lock); if (ld->ops && ld->ops->set_contrast) { - pr_debug("lcd: set contrast to %d\n", contrast); + pr_debug("lcd: set contrast to %lu\n", contrast); ld->ops->set_contrast(ld, contrast); rc = count; } -- cgit v0.10.2 From 48e78e8cc87ab80617ef0c5a146701ca96a4a51d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 10 Jan 2012 15:09:21 -0800 Subject: backlight: convert platform_lcd to devm_kzalloc() Saves some error handling code and eliminates a class of leaks. 
Signed-off-by: Mark Brown Cc: Richard Purdie Cc: Florian Tobias Schandinat Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/video/backlight/platform_lcd.c b/drivers/video/backlight/platform_lcd.c index 187da59..f0bf491 100644 --- a/drivers/video/backlight/platform_lcd.c +++ b/drivers/video/backlight/platform_lcd.c @@ -85,7 +85,8 @@ static int __devinit platform_lcd_probe(struct platform_device *pdev) return -EINVAL; } - plcd = kzalloc(sizeof(struct platform_lcd), GFP_KERNEL); + plcd = devm_kzalloc(&pdev->dev, sizeof(struct platform_lcd), + GFP_KERNEL); if (!plcd) { dev_err(dev, "no memory for state\n"); return -ENOMEM; @@ -98,7 +99,7 @@ static int __devinit platform_lcd_probe(struct platform_device *pdev) if (IS_ERR(plcd->lcd)) { dev_err(dev, "cannot register lcd device\n"); err = PTR_ERR(plcd->lcd); - goto err_mem; + goto err; } platform_set_drvdata(pdev, plcd); @@ -106,8 +107,7 @@ static int __devinit platform_lcd_probe(struct platform_device *pdev) return 0; - err_mem: - kfree(plcd); + err: return err; } @@ -116,7 +116,6 @@ static int __devexit platform_lcd_remove(struct platform_device *pdev) struct platform_lcd *plcd = platform_get_drvdata(pdev); lcd_device_unregister(plcd->lcd); - kfree(plcd); return 0; } -- cgit v0.10.2 From e2c17bc6f717a8847df2a867caec6ba4fe85f3fc Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 10 Jan 2012 15:09:22 -0800 Subject: backlight: convert pwm_bl to dev_pm_ops Should be no functional changes, mainly a reorganisation to support future work. 
[akpm@linux-foundation.org: fix CONFIG_PM=n build] Signed-off-by: Mark Brown Cc: Richard Purdie Cc: Florian Tobias Schandinat Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c index b811e8f..7496d04 100644 --- a/drivers/video/backlight/pwm_bl.c +++ b/drivers/video/backlight/pwm_bl.c @@ -169,10 +169,9 @@ static int pwm_backlight_remove(struct platform_device *pdev) } #ifdef CONFIG_PM -static int pwm_backlight_suspend(struct platform_device *pdev, - pm_message_t state) +static int pwm_backlight_suspend(struct device *dev) { - struct backlight_device *bl = platform_get_drvdata(pdev); + struct backlight_device *bl = dev_get_drvdata(dev); struct pwm_bl_data *pb = dev_get_drvdata(&bl->dev); if (pb->notify) @@ -184,27 +183,29 @@ static int pwm_backlight_suspend(struct platform_device *pdev, return 0; } -static int pwm_backlight_resume(struct platform_device *pdev) +static int pwm_backlight_resume(struct device *dev) { - struct backlight_device *bl = platform_get_drvdata(pdev); + struct backlight_device *bl = dev_get_drvdata(dev); backlight_update_status(bl); return 0; } -#else -#define pwm_backlight_suspend NULL -#define pwm_backlight_resume NULL + +static SIMPLE_DEV_PM_OPS(pwm_backlight_pm_ops, pwm_backlight_suspend, + pwm_backlight_resume); + #endif static struct platform_driver pwm_backlight_driver = { .driver = { .name = "pwm-backlight", .owner = THIS_MODULE, +#ifdef CONFIG_PM + .pm = &pwm_backlight_pm_ops, +#endif }, .probe = pwm_backlight_probe, .remove = pwm_backlight_remove, - .suspend = pwm_backlight_suspend, - .resume = pwm_backlight_resume, }; module_platform_driver(pwm_backlight_driver); -- cgit v0.10.2 From 892a8843fbef07a7f2ab62d5f7ff5c16ea0903b0 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Tue, 10 Jan 2012 15:09:24 -0800 Subject: leds: convert led platform drivers to module_platform_driver Factor out some boilerplate code for platform driver registration into 
module_platform_driver. Signed-off-by: Axel Lin Acked-by: Haojian Zhuang [led-88pm860x.c] Acked-by: Mark Brown Cc: Richard Purdie Cc: Michael Hennerich Cc: Mike Rapoport Cc: Guennadi Liakhovetski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/leds/leds-88pm860x.c b/drivers/leds/leds-88pm860x.c index 0810604..4ca0062 100644 --- a/drivers/leds/leds-88pm860x.c +++ b/drivers/leds/leds-88pm860x.c @@ -238,17 +238,7 @@ static struct platform_driver pm860x_led_driver = { .remove = pm860x_led_remove, }; -static int __devinit pm860x_led_init(void) -{ - return platform_driver_register(&pm860x_led_driver); -} -module_init(pm860x_led_init); - -static void __devexit pm860x_led_exit(void) -{ - platform_driver_unregister(&pm860x_led_driver); -} -module_exit(pm860x_led_exit); +module_platform_driver(pm860x_led_driver); MODULE_DESCRIPTION("LED driver for Marvell PM860x"); MODULE_AUTHOR("Haojian Zhuang "); diff --git a/drivers/leds/leds-adp5520.c b/drivers/leds/leds-adp5520.c index 7ba4c7b..b1400db 100644 --- a/drivers/leds/leds-adp5520.c +++ b/drivers/leds/leds-adp5520.c @@ -213,17 +213,7 @@ static struct platform_driver adp5520_led_driver = { .remove = __devexit_p(adp5520_led_remove), }; -static int __init adp5520_led_init(void) -{ - return platform_driver_register(&adp5520_led_driver); -} -module_init(adp5520_led_init); - -static void __exit adp5520_led_exit(void) -{ - platform_driver_unregister(&adp5520_led_driver); -} -module_exit(adp5520_led_exit); +module_platform_driver(adp5520_led_driver); MODULE_AUTHOR("Michael Hennerich "); MODULE_DESCRIPTION("LEDS ADP5520(01) Driver"); diff --git a/drivers/leds/leds-ams-delta.c b/drivers/leds/leds-ams-delta.c index 8c00937..0742835 100644 --- a/drivers/leds/leds-ams-delta.c +++ b/drivers/leds/leds-ams-delta.c @@ -118,18 +118,7 @@ static struct platform_driver ams_delta_led_driver = { }, }; -static int __init ams_delta_led_init(void) -{ - return platform_driver_register(&ams_delta_led_driver); -} - -static 
void __exit ams_delta_led_exit(void) -{ - platform_driver_unregister(&ams_delta_led_driver); -} - -module_init(ams_delta_led_init); -module_exit(ams_delta_led_exit); +module_platform_driver(ams_delta_led_driver); MODULE_AUTHOR("Jonathan McDowell "); MODULE_DESCRIPTION("Amstrad Delta LED driver"); diff --git a/drivers/leds/leds-asic3.c b/drivers/leds/leds-asic3.c index 48d9fe6..525a924 100644 --- a/drivers/leds/leds-asic3.c +++ b/drivers/leds/leds-asic3.c @@ -179,21 +179,9 @@ static struct platform_driver asic3_led_driver = { }, }; -MODULE_ALIAS("platform:leds-asic3"); - -static int __init asic3_led_init(void) -{ - return platform_driver_register(&asic3_led_driver); -} - -static void __exit asic3_led_exit(void) -{ - platform_driver_unregister(&asic3_led_driver); -} - -module_init(asic3_led_init); -module_exit(asic3_led_exit); +module_platform_driver(asic3_led_driver); MODULE_AUTHOR("Paul Parsons "); MODULE_DESCRIPTION("HTC ASIC3 LED driver"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:leds-asic3"); diff --git a/drivers/leds/leds-atmel-pwm.c b/drivers/leds/leds-atmel-pwm.c index 109c875..800243b 100644 --- a/drivers/leds/leds-atmel-pwm.c +++ b/drivers/leds/leds-atmel-pwm.c @@ -134,29 +134,18 @@ static int __exit pwmled_remove(struct platform_device *pdev) return 0; } -/* work with hotplug and coldplug */ -MODULE_ALIAS("platform:leds-atmel-pwm"); - static struct platform_driver pwmled_driver = { .driver = { .name = "leds-atmel-pwm", .owner = THIS_MODULE, }, /* REVISIT add suspend() and resume() methods */ + .probe = pwmled_probe, .remove = __exit_p(pwmled_remove), }; -static int __init modinit(void) -{ - return platform_driver_probe(&pwmled_driver, pwmled_probe); -} -module_init(modinit); - -static void __exit modexit(void) -{ - platform_driver_unregister(&pwmled_driver); -} -module_exit(modexit); +module_platform_driver(pwmled_driver); MODULE_DESCRIPTION("Driver for LEDs with PWM-controlled brightness"); MODULE_LICENSE("GPL"); 
+MODULE_ALIAS("platform:leds-atmel-pwm"); diff --git a/drivers/leds/leds-cobalt-qube.c b/drivers/leds/leds-cobalt-qube.c index da5fb01..6a8725c 100644 --- a/drivers/leds/leds-cobalt-qube.c +++ b/drivers/leds/leds-cobalt-qube.c @@ -75,9 +75,6 @@ static int __devexit cobalt_qube_led_remove(struct platform_device *pdev) return 0; } -/* work with hotplug and coldplug */ -MODULE_ALIAS("platform:cobalt-qube-leds"); - static struct platform_driver cobalt_qube_led_driver = { .probe = cobalt_qube_led_probe, .remove = __devexit_p(cobalt_qube_led_remove), @@ -87,19 +84,9 @@ static struct platform_driver cobalt_qube_led_driver = { }, }; -static int __init cobalt_qube_led_init(void) -{ - return platform_driver_register(&cobalt_qube_led_driver); -} - -static void __exit cobalt_qube_led_exit(void) -{ - platform_driver_unregister(&cobalt_qube_led_driver); -} - -module_init(cobalt_qube_led_init); -module_exit(cobalt_qube_led_exit); +module_platform_driver(cobalt_qube_led_driver); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Front LED support for Cobalt Server"); MODULE_AUTHOR("Florian Fainelli "); +MODULE_ALIAS("platform:cobalt-qube-leds"); diff --git a/drivers/leds/leds-da903x.c b/drivers/leds/leds-da903x.c index f28931c..d9cd73e 100644 --- a/drivers/leds/leds-da903x.c +++ b/drivers/leds/leds-da903x.c @@ -158,17 +158,7 @@ static struct platform_driver da903x_led_driver = { .remove = __devexit_p(da903x_led_remove), }; -static int __init da903x_led_init(void) -{ - return platform_driver_register(&da903x_led_driver); -} -module_init(da903x_led_init); - -static void __exit da903x_led_exit(void) -{ - platform_driver_unregister(&da903x_led_driver); -} -module_exit(da903x_led_exit); +module_platform_driver(da903x_led_driver); MODULE_DESCRIPTION("LEDs driver for Dialog Semiconductor DA9030/DA9034"); MODULE_AUTHOR("Eric Miao " diff --git a/drivers/leds/leds-fsg.c b/drivers/leds/leds-fsg.c index 49aceff..b9053fa 100644 --- a/drivers/leds/leds-fsg.c +++ b/drivers/leds/leds-fsg.c @@ -224,20 
+224,7 @@ static struct platform_driver fsg_led_driver = { }, }; - -static int __init fsg_led_init(void) -{ - return platform_driver_register(&fsg_led_driver); -} - -static void __exit fsg_led_exit(void) -{ - platform_driver_unregister(&fsg_led_driver); -} - - -module_init(fsg_led_init); -module_exit(fsg_led_exit); +module_platform_driver(fsg_led_driver); MODULE_AUTHOR("Rod Whitby "); MODULE_DESCRIPTION("Freecom FSG-3 LED driver"); diff --git a/drivers/leds/leds-gpio.c b/drivers/leds/leds-gpio.c index 399a86f..7df74cb 100644 --- a/drivers/leds/leds-gpio.c +++ b/drivers/leds/leds-gpio.c @@ -293,21 +293,9 @@ static struct platform_driver gpio_led_driver = { }, }; -MODULE_ALIAS("platform:leds-gpio"); - -static int __init gpio_led_init(void) -{ - return platform_driver_register(&gpio_led_driver); -} - -static void __exit gpio_led_exit(void) -{ - platform_driver_unregister(&gpio_led_driver); -} - -module_init(gpio_led_init); -module_exit(gpio_led_exit); +module_platform_driver(gpio_led_driver); MODULE_AUTHOR("Raphael Assenat , Trent Piepho "); MODULE_DESCRIPTION("GPIO LED driver"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:leds-gpio"); diff --git a/drivers/leds/leds-hp6xx.c b/drivers/leds/leds-hp6xx.c index bcfbd3a..366b605 100644 --- a/drivers/leds/leds-hp6xx.c +++ b/drivers/leds/leds-hp6xx.c @@ -79,9 +79,6 @@ static int hp6xxled_remove(struct platform_device *pdev) return 0; } -/* work with hotplug and coldplug */ -MODULE_ALIAS("platform:hp6xx-led"); - static struct platform_driver hp6xxled_driver = { .probe = hp6xxled_probe, .remove = hp6xxled_remove, @@ -91,19 +88,9 @@ static struct platform_driver hp6xxled_driver = { }, }; -static int __init hp6xxled_init(void) -{ - return platform_driver_register(&hp6xxled_driver); -} - -static void __exit hp6xxled_exit(void) -{ - platform_driver_unregister(&hp6xxled_driver); -} - -module_init(hp6xxled_init); -module_exit(hp6xxled_exit); +module_platform_driver(hp6xxled_driver); MODULE_AUTHOR("Kristoffer Ericson "); 
MODULE_DESCRIPTION("HP Jornada 6xx LED driver"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:hp6xx-led"); diff --git a/drivers/leds/leds-lt3593.c b/drivers/leds/leds-lt3593.c index 53f67b8..e311a96c 100644 --- a/drivers/leds/leds-lt3593.c +++ b/drivers/leds/leds-lt3593.c @@ -199,21 +199,9 @@ static struct platform_driver lt3593_led_driver = { }, }; -MODULE_ALIAS("platform:leds-lt3593"); - -static int __init lt3593_led_init(void) -{ - return platform_driver_register(&lt3593_led_driver); -} - -static void __exit lt3593_led_exit(void) -{ - platform_driver_unregister(&lt3593_led_driver); -} - -module_init(lt3593_led_init); -module_exit(lt3593_led_exit); +module_platform_driver(lt3593_led_driver); MODULE_AUTHOR("Daniel Mack "); MODULE_DESCRIPTION("LED driver for LT3593 controllers"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:leds-lt3593"); diff --git a/drivers/leds/leds-mc13783.c b/drivers/leds/leds-mc13783.c index b3393a9..c61e8c4 100644 --- a/drivers/leds/leds-mc13783.c +++ b/drivers/leds/leds-mc13783.c @@ -385,17 +385,7 @@ static struct platform_driver mc13783_led_driver = { .remove = __devexit_p(mc13783_led_remove), }; -static int __init mc13783_led_init(void) -{ - return platform_driver_register(&mc13783_led_driver); -} -module_init(mc13783_led_init); - -static void __exit mc13783_led_exit(void) -{ - platform_driver_unregister(&mc13783_led_driver); -} -module_exit(mc13783_led_exit); +module_platform_driver(mc13783_led_driver); MODULE_DESCRIPTION("LEDs driver for Freescale MC13783 PMIC"); MODULE_AUTHOR("Philippe Retornaz "); diff --git a/drivers/leds/leds-netxbig.c b/drivers/leds/leds-netxbig.c index f2e51c1..8c7a4ea 100644 --- a/drivers/leds/leds-netxbig.c +++ b/drivers/leds/leds-netxbig.c @@ -429,21 +429,10 @@ static struct platform_driver netxbig_led_driver = { .owner = THIS_MODULE, }, }; -MODULE_ALIAS("platform:leds-netxbig"); - -static int __init netxbig_led_init(void) -{ - return platform_driver_register(&netxbig_led_driver); -} -static void __exit
netxbig_led_exit(void) -{ - platform_driver_unregister(&netxbig_led_driver); -} - -module_init(netxbig_led_init); -module_exit(netxbig_led_exit); +module_platform_driver(netxbig_led_driver); MODULE_AUTHOR("Simon Guinot "); MODULE_DESCRIPTION("LED driver for LaCie xBig Network boards"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:leds-netxbig"); diff --git a/drivers/leds/leds-ns2.c b/drivers/leds/leds-ns2.c index 37b7d0c..2f0a144 100644 --- a/drivers/leds/leds-ns2.c +++ b/drivers/leds/leds-ns2.c @@ -323,21 +323,10 @@ static struct platform_driver ns2_led_driver = { .owner = THIS_MODULE, }, }; -MODULE_ALIAS("platform:leds-ns2"); - -static int __init ns2_led_init(void) -{ - return platform_driver_register(&ns2_led_driver); -} -static void __exit ns2_led_exit(void) -{ - platform_driver_unregister(&ns2_led_driver); -} - -module_init(ns2_led_init); -module_exit(ns2_led_exit); +module_platform_driver(ns2_led_driver); MODULE_AUTHOR("Simon Guinot "); MODULE_DESCRIPTION("Network Space v2 LED driver"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:leds-ns2"); diff --git a/drivers/leds/leds-pwm.c b/drivers/leds/leds-pwm.c index 666daf7..3ed92f3 100644 --- a/drivers/leds/leds-pwm.c +++ b/drivers/leds/leds-pwm.c @@ -135,18 +135,7 @@ static struct platform_driver led_pwm_driver = { }, }; -static int __init led_pwm_init(void) -{ - return platform_driver_register(&led_pwm_driver); -} - -static void __exit led_pwm_exit(void) -{ - platform_driver_unregister(&led_pwm_driver); -} - -module_init(led_pwm_init); -module_exit(led_pwm_exit); +module_platform_driver(led_pwm_driver); MODULE_AUTHOR("Luotao Fu "); MODULE_DESCRIPTION("PWM LED driver for PXA"); diff --git a/drivers/leds/leds-rb532.c b/drivers/leds/leds-rb532.c index c3525f3..a7815b6 100644 --- a/drivers/leds/leds-rb532.c +++ b/drivers/leds/leds-rb532.c @@ -57,21 +57,9 @@ static struct platform_driver rb532_led_driver = { }, }; -static int __init rb532_led_init(void) -{ - return platform_driver_register(&rb532_led_driver); -} 
- -static void __exit rb532_led_exit(void) -{ - platform_driver_unregister(&rb532_led_driver); -} - -module_init(rb532_led_init); -module_exit(rb532_led_exit); - -MODULE_ALIAS("platform:rb532-led"); +module_platform_driver(rb532_led_driver); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("User LED support for Routerboard532"); MODULE_AUTHOR("Phil Sutter "); +MODULE_ALIAS("platform:rb532-led"); diff --git a/drivers/leds/leds-regulator.c b/drivers/leds/leds-regulator.c index 8497f56..df7e963 100644 --- a/drivers/leds/leds-regulator.c +++ b/drivers/leds/leds-regulator.c @@ -229,17 +229,7 @@ static struct platform_driver regulator_led_driver = { .remove = __devexit_p(regulator_led_remove), }; -static int __init regulator_led_init(void) -{ - return platform_driver_register(&regulator_led_driver); -} -module_init(regulator_led_init); - -static void __exit regulator_led_exit(void) -{ - platform_driver_unregister(&regulator_led_driver); -} -module_exit(regulator_led_exit); +module_platform_driver(regulator_led_driver); MODULE_AUTHOR("Antonio Ospite "); MODULE_DESCRIPTION("Regulator driven LED driver"); diff --git a/drivers/leds/leds-renesas-tpu.c b/drivers/leds/leds-renesas-tpu.c index 3ee540e..32fe337 100644 --- a/drivers/leds/leds-renesas-tpu.c +++ b/drivers/leds/leds-renesas-tpu.c @@ -339,18 +339,7 @@ static struct platform_driver r_tpu_device_driver = { } }; -static int __init r_tpu_init(void) -{ - return platform_driver_register(&r_tpu_device_driver); -} - -static void __exit r_tpu_exit(void) -{ - platform_driver_unregister(&r_tpu_device_driver); -} - -module_init(r_tpu_init); -module_exit(r_tpu_exit); +module_platform_driver(r_tpu_device_driver); MODULE_AUTHOR("Magnus Damm"); MODULE_DESCRIPTION("Renesas TPU LED Driver"); diff --git a/drivers/leds/leds-s3c24xx.c b/drivers/leds/leds-s3c24xx.c index 29f8b0f..bd0a5ed 100644 --- a/drivers/leds/leds-s3c24xx.c +++ b/drivers/leds/leds-s3c24xx.c @@ -121,18 +121,7 @@ static struct platform_driver s3c24xx_led_driver = { }, }; -static int
__init s3c24xx_led_init(void) -{ - return platform_driver_register(&s3c24xx_led_driver); -} - -static void __exit s3c24xx_led_exit(void) -{ - platform_driver_unregister(&s3c24xx_led_driver); -} - -module_init(s3c24xx_led_init); -module_exit(s3c24xx_led_exit); +module_platform_driver(s3c24xx_led_driver); MODULE_AUTHOR("Ben Dooks "); MODULE_DESCRIPTION("S3C24XX LED driver"); diff --git a/drivers/leds/leds-wm831x-status.c b/drivers/leds/leds-wm831x-status.c index b1eb34c..444a68d 100644 --- a/drivers/leds/leds-wm831x-status.c +++ b/drivers/leds/leds-wm831x-status.c @@ -325,17 +325,7 @@ static struct platform_driver wm831x_status_driver = { .remove = wm831x_status_remove, }; -static int __devinit wm831x_status_init(void) -{ - return platform_driver_register(&wm831x_status_driver); -} -module_init(wm831x_status_init); - -static void wm831x_status_exit(void) -{ - platform_driver_unregister(&wm831x_status_driver); -} -module_exit(wm831x_status_exit); +module_platform_driver(wm831x_status_driver); MODULE_AUTHOR("Mark Brown "); MODULE_DESCRIPTION("WM831x status LED driver"); diff --git a/drivers/leds/leds-wm8350.c b/drivers/leds/leds-wm8350.c index 4a12765..390c0f6 100644 --- a/drivers/leds/leds-wm8350.c +++ b/drivers/leds/leds-wm8350.c @@ -295,17 +295,7 @@ static struct platform_driver wm8350_led_driver = { .shutdown = wm8350_led_shutdown, }; -static int __devinit wm8350_led_init(void) -{ - return platform_driver_register(&wm8350_led_driver); -} -module_init(wm8350_led_init); - -static void wm8350_led_exit(void) -{ - platform_driver_unregister(&wm8350_led_driver); -} -module_exit(wm8350_led_exit); +module_platform_driver(wm8350_led_driver); MODULE_AUTHOR("Mark Brown"); MODULE_DESCRIPTION("WM8350 LED driver"); -- cgit v0.10.2 From 09a0d183ef3d310ee9d0b835d9db741fda9d6d46 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Tue, 10 Jan 2012 15:09:27 -0800 Subject: leds: convert led i2c drivers to module_i2c_driver Factor out some boilerplate code for i2c driver registration into 
module_i2c_driver. Signed-off-by: Axel Lin Cc: Haojian Zhuang Cc: Mark Brown Cc: Richard Purdie Cc: Michael Hennerich Cc: Mike Rapoport Cc: Guennadi Liakhovetski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/leds/leds-bd2802.c b/drivers/leds/leds-bd2802.c index ea21855..10e40971 100644 --- a/drivers/leds/leds-bd2802.c +++ b/drivers/leds/leds-bd2802.c @@ -813,17 +813,7 @@ static struct i2c_driver bd2802_i2c_driver = { .id_table = bd2802_id, }; -static int __init bd2802_init(void) -{ - return i2c_add_driver(&bd2802_i2c_driver); -} -module_init(bd2802_init); - -static void __exit bd2802_exit(void) -{ - i2c_del_driver(&bd2802_i2c_driver); -} -module_exit(bd2802_exit); +module_i2c_driver(bd2802_i2c_driver); MODULE_AUTHOR("Kim Kyuwon "); MODULE_DESCRIPTION("BD2802 LED driver"); diff --git a/drivers/leds/leds-lm3530.c b/drivers/leds/leds-lm3530.c index 0630e4f..45e6878 100644 --- a/drivers/leds/leds-lm3530.c +++ b/drivers/leds/leds-lm3530.c @@ -457,18 +457,7 @@ static struct i2c_driver lm3530_i2c_driver = { }, }; -static int __init lm3530_init(void) -{ - return i2c_add_driver(&lm3530_i2c_driver); -} - -static void __exit lm3530_exit(void) -{ - i2c_del_driver(&lm3530_i2c_driver); -} - -module_init(lm3530_init); -module_exit(lm3530_exit); +module_i2c_driver(lm3530_i2c_driver); MODULE_DESCRIPTION("Back Light driver for LM3530"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/leds/leds-lp3944.c b/drivers/leds/leds-lp3944.c index 9010c05..b8f9f0a 100644 --- a/drivers/leds/leds-lp3944.c +++ b/drivers/leds/leds-lp3944.c @@ -453,18 +453,7 @@ static struct i2c_driver lp3944_driver = { .id_table = lp3944_id, }; -static int __init lp3944_module_init(void) -{ - return i2c_add_driver(&lp3944_driver); -} - -static void __exit lp3944_module_exit(void) -{ - i2c_del_driver(&lp3944_driver); -} - -module_init(lp3944_module_init); -module_exit(lp3944_module_exit); +module_i2c_driver(lp3944_driver); MODULE_AUTHOR("Antonio Ospite "); MODULE_DESCRIPTION("LP3944 
Fun Light Chip"); diff --git a/drivers/leds/leds-lp5521.c b/drivers/leds/leds-lp5521.c index cb641f1..d62a798 100644 --- a/drivers/leds/leds-lp5521.c +++ b/drivers/leds/leds-lp5521.c @@ -797,25 +797,7 @@ static struct i2c_driver lp5521_driver = { .id_table = lp5521_id, }; -static int __init lp5521_init(void) -{ - int ret; - - ret = i2c_add_driver(&lp5521_driver); - - if (ret < 0) - printk(KERN_ALERT "Adding lp5521 driver failed\n"); - - return ret; -} - -static void __exit lp5521_exit(void) -{ - i2c_del_driver(&lp5521_driver); -} - -module_init(lp5521_init); -module_exit(lp5521_exit); +module_i2c_driver(lp5521_driver); MODULE_AUTHOR("Mathias Nyman, Yuri Zaporozhets, Samu Onkalo"); MODULE_DESCRIPTION("LP5521 LED engine"); diff --git a/drivers/leds/leds-lp5523.c b/drivers/leds/leds-lp5523.c index 5971e309..0170760 100644 --- a/drivers/leds/leds-lp5523.c +++ b/drivers/leds/leds-lp5523.c @@ -1021,25 +1021,7 @@ static struct i2c_driver lp5523_driver = { .id_table = lp5523_id, }; -static int __init lp5523_init(void) -{ - int ret; - - ret = i2c_add_driver(&lp5523_driver); - - if (ret < 0) - printk(KERN_ALERT "Adding lp5523 driver failed\n"); - - return ret; -} - -static void __exit lp5523_exit(void) -{ - i2c_del_driver(&lp5523_driver); -} - -module_init(lp5523_init); -module_exit(lp5523_exit); +module_i2c_driver(lp5523_driver); MODULE_AUTHOR("Mathias Nyman "); MODULE_DESCRIPTION("LP5523 LED engine"); diff --git a/drivers/leds/leds-pca9532.c b/drivers/leds/leds-pca9532.c index a2c8746..ceccab4 100644 --- a/drivers/leds/leds-pca9532.c +++ b/drivers/leds/leds-pca9532.c @@ -489,20 +489,8 @@ static int pca9532_remove(struct i2c_client *client) return 0; } -static int __init pca9532_init(void) -{ - return i2c_add_driver(&pca9532_driver); -} - -static void __exit pca9532_exit(void) -{ - i2c_del_driver(&pca9532_driver); -} +module_i2c_driver(pca9532_driver); MODULE_AUTHOR("Riku Voipio"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("PCA 9532 LED dimmer"); - 
-module_init(pca9532_init); -module_exit(pca9532_exit); - diff --git a/drivers/leds/leds-pca955x.c b/drivers/leds/leds-pca955x.c index 66aa3e8..dcc3bc3 100644 --- a/drivers/leds/leds-pca955x.c +++ b/drivers/leds/leds-pca955x.c @@ -371,18 +371,7 @@ static struct i2c_driver pca955x_driver = { .id_table = pca955x_id, }; -static int __init pca955x_leds_init(void) -{ - return i2c_add_driver(&pca955x_driver); -} - -static void __exit pca955x_leds_exit(void) -{ - i2c_del_driver(&pca955x_driver); -} - -module_init(pca955x_leds_init); -module_exit(pca955x_leds_exit); +module_i2c_driver(pca955x_driver); MODULE_AUTHOR("Nate Case "); MODULE_DESCRIPTION("PCA955x LED driver"); -- cgit v0.10.2 From 19b955768b4ede7c9ad0efe4def70852c530d4f9 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Tue, 10 Jan 2012 15:09:30 -0800 Subject: leds: convert leds-dac124s085 to module_spi_driver Factor out some boilerplate code for spi driver registration into module_spi_driver. Signed-off-by: Axel Lin Cc: Haojian Zhuang Cc: Mark Brown Cc: Richard Purdie Cc: Michael Hennerich Cc: Mike Rapoport Acked-by: Guennadi Liakhovetski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/leds/leds-dac124s085.c b/drivers/leds/leds-dac124s085.c index 31cf0d6..d56c142 100644 --- a/drivers/leds/leds-dac124s085.c +++ b/drivers/leds/leds-dac124s085.c @@ -131,18 +131,7 @@ static struct spi_driver dac124s085_driver = { }, }; -static int __init dac124s085_leds_init(void) -{ - return spi_register_driver(&dac124s085_driver); -} - -static void __exit dac124s085_leds_exit(void) -{ - spi_unregister_driver(&dac124s085_driver); -} - -module_init(dac124s085_leds_init); -module_exit(dac124s085_leds_exit); +module_spi_driver(dac124s085_driver); MODULE_AUTHOR("Guennadi Liakhovetski "); MODULE_DESCRIPTION("DAC124S085 LED driver"); -- cgit v0.10.2 From 1980bcfa6bc5a77491176ba695422e205dcfd2da Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Tue, 10 Jan 2012 15:09:34 -0800 Subject: drivers/leds/leds-lp5523.c: 
remove unneeded forward declaration Signed-off-by: Axel Lin Cc: Samu Onkalo Cc: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/leds/leds-lp5523.c b/drivers/leds/leds-lp5523.c index 0170760..73e791a 100644 --- a/drivers/leds/leds-lp5523.c +++ b/drivers/leds/leds-lp5523.c @@ -870,8 +870,6 @@ static int __devinit lp5523_init_led(struct lp5523_led *led, struct device *dev, return 0; } -static struct i2c_driver lp5523_driver; - static int __devinit lp5523_probe(struct i2c_client *client, const struct i2c_device_id *id) { -- cgit v0.10.2 From 95dafd475382740a841697a2ead6566175d26390 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Tue, 10 Jan 2012 15:09:35 -0800 Subject: drivers/leds/leds-bd2802.c: use gpio_request_one() Use gpio_request_one() instead of multiple gpiolib calls. Signed-off-by: Axel Lin Cc: Kim Kyuwon Cc: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/leds/leds-bd2802.c b/drivers/leds/leds-bd2802.c index 10e40971..591cbdf 100644 --- a/drivers/leds/leds-bd2802.c +++ b/drivers/leds/leds-bd2802.c @@ -688,8 +688,7 @@ static int __devinit bd2802_probe(struct i2c_client *client, i2c_set_clientdata(client, led); /* Configure RESET GPIO (L: RESET, H: RESET cancel) */ - gpio_request(pdata->reset_gpio, "RGB_RESETB"); - gpio_direction_output(pdata->reset_gpio, 1); + gpio_request_one(pdata->reset_gpio, GPIOF_OUT_INIT_HIGH, "RGB_RESETB"); /* Tacss = min 0.1ms */ udelay(100); -- cgit v0.10.2 From b96a573f4c27529d379922670e8cf5530120d5ca Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Tue, 10 Jan 2012 15:09:37 -0800 Subject: drivers/leds/leds-netxbig.c: use gpio_request_one() Use gpio_request_one() instead of multiple gpiolib calls. This also simplifies error handling a bit. 
Signed-off-by: Axel Lin Cc: Simon Guinot Cc: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/leds/leds-netxbig.c b/drivers/leds/leds-netxbig.c index 8c7a4ea..d8433f2 100644 --- a/drivers/leds/leds-netxbig.c +++ b/drivers/leds/leds-netxbig.c @@ -81,35 +81,23 @@ static int __devinit gpio_ext_init(struct netxbig_gpio_ext *gpio_ext) /* Configure address GPIOs. */ for (i = 0; i < gpio_ext->num_addr; i++) { - err = gpio_request(gpio_ext->addr[i], "GPIO extension addr"); + err = gpio_request_one(gpio_ext->addr[i], GPIOF_OUT_INIT_LOW, + "GPIO extension addr"); if (err) goto err_free_addr; - err = gpio_direction_output(gpio_ext->addr[i], 0); - if (err) { - gpio_free(gpio_ext->addr[i]); - goto err_free_addr; - } } /* Configure data GPIOs. */ for (i = 0; i < gpio_ext->num_data; i++) { - err = gpio_request(gpio_ext->data[i], "GPIO extension data"); + err = gpio_request_one(gpio_ext->data[i], GPIOF_OUT_INIT_LOW, + "GPIO extension data"); if (err) goto err_free_data; - err = gpio_direction_output(gpio_ext->data[i], 0); - if (err) { - gpio_free(gpio_ext->data[i]); - goto err_free_data; - } } /* Configure "enable select" GPIO. */ - err = gpio_request(gpio_ext->enable, "GPIO extension enable"); + err = gpio_request_one(gpio_ext->enable, GPIOF_OUT_INIT_LOW, + "GPIO extension enable"); if (err) goto err_free_data; - err = gpio_direction_output(gpio_ext->enable, 0); - if (err) { - gpio_free(gpio_ext->enable); - goto err_free_data; - } return 0; -- cgit v0.10.2 From a6d511e5155406cd214d3af3ff9cffc69548b006 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 10 Jan 2012 15:09:40 -0800 Subject: leds: add driver for TCA6507 LED controller TI's TCA6507 is the LED driver in the GTA04 Openmoko motherboard. The driver provides full support for brightness levels and hardware blinking. This driver can drive each of 7 outputs as an LED or a GPIO output, and provides hardware-assist blinking. 
[akpm@linux-foundation.org: fix __mod_i2c_device_table alias] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: NeilBrown Cc: Richard Purdie Cc: Randy Dunlap Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig index 1b75a56..897a77d 100644 --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@ -388,6 +388,14 @@ config LEDS_RENESAS_TPU pin function. The latter to support brightness control. Brightness control is supported but hardware blinking is not. +config LEDS_TCA6507 + tristate "LED Support for TCA6507 I2C chip" + depends on LEDS_CLASS && I2C + help + This option enables support for LEDs connected to TC6507 + LED driver chips accessed via the I2C bus. + Driver support brightness control and hardware-assisted blinking. + config LEDS_TRIGGERS bool "LED Trigger support" depends on LEDS_CLASS diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile index e4f6bf5..5c9dc4b 100644 --- a/drivers/leds/Makefile +++ b/drivers/leds/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_LEDS_GPIO) += leds-gpio.o obj-$(CONFIG_LEDS_LP3944) += leds-lp3944.o obj-$(CONFIG_LEDS_LP5521) += leds-lp5521.o obj-$(CONFIG_LEDS_LP5523) += leds-lp5523.o +obj-$(CONFIG_LEDS_TCA6507) += leds-tca6507.o obj-$(CONFIG_LEDS_CLEVO_MAIL) += leds-clevo-mail.o obj-$(CONFIG_LEDS_HP6XX) += leds-hp6xx.o obj-$(CONFIG_LEDS_FSG) += leds-fsg.o diff --git a/drivers/leds/leds-tca6507.c b/drivers/leds/leds-tca6507.c new file mode 100644 index 0000000..133f89f --- /dev/null +++ b/drivers/leds/leds-tca6507.c @@ -0,0 +1,779 @@ +/* + * leds-tca6507 + * + * The TCA6507 is a programmable LED controller that can drive 7 + * separate lines either by holding them low, or by pulsing them + * with modulated width. + * The modulation can be varied in a simple pattern to produce a blink or + * double-blink. 
+ * + * This driver can configure each line either as a 'GPIO' which is out-only + * (no pull-up) or as an LED with variable brightness and hardware-assisted + * blinking. + * + * Apart from OFF and ON there are three programmable brightness levels which + * can be programmed from 0 to 15 and indicate how many 500usec intervals in + * each 8msec that the led is 'on'. The levels are named MASTER, BANK0 and + * BANK1. + * + * There are two different blink rates that can be programmed, each with + * separate time for rise, on, fall, off and second-off. Thus if 3 or more + * different non-trivial rates are required, software must be used for the extra + * rates. The two different blink rates must align with the two levels BANK0 and + * BANK1. + * This driver does not support double-blink so 'second-off' always matches + * 'off'. + * + * Only 16 different times can be programmed in a roughly logarithmic scale from + * 64ms to 16320ms. To be precise the possible times are: + * 0, 64, 128, 192, 256, 384, 512, 768, + * 1024, 1536, 2048, 3072, 4096, 5760, 8128, 16320 + * + * Times that cannot be closely matched with these must be + * handled in software. This driver allows 12.5% error in matching. + * + * This driver does not allow rise/fall rates to be set explicitly. When trying + * to match a given 'on' or 'off' period, an appropriate pair of 'change' and + * 'hold' times are chosen to get a close match. If the target delay is even, + * the 'change' number will be the smaller; if odd, the 'hold' number will be + * the smaller. + + * Choosing pairs of delays with 12.5% errors allows us to match delays in the + * ranges: 56-72, 112-144, 168-216, 224-27504, 28560-36720. + * 26% of the achievable sums can be matched by multiple pairings. For example + * 1536 == 1536+0, 1024+512, or 768+768. This driver will always choose the + * pairing with the least maximum - 768+768 in this case. Other pairings are + * not available. 
+ * + * Access to the 3 levels and 2 blinks is on a first-come, first-served basis. + * Access can be shared by multiple leds if they have the same level and + * either same blink rates, or some don't blink. + * When a led changes, it relinquishes access and tries again, so it might + * lose access to hardware blink. + * If a blink engine cannot be allocated, software blink is used. + * If the desired brightness cannot be allocated, the closest available non-zero + * brightness is used. As 'full' is always available, the worst case would be + * to have two different blink rates at '1', with Max at '2', then other leds + * will have to choose between '2' and '16'. Hopefully this is not likely. + * + * Each bank (BANK0 and BANK1) has two usage counts - LEDs using the brightness + * and LEDs using the blink. It can only be reprogrammed when the appropriate + * counter is zero. The MASTER level has a single usage count. + * + * Each LED has programmable 'on' and 'off' time as milliseconds. With each + * there is a flag saying if it was explicitly requested or defaulted. + * Similarly the banks know if each time was explicit or a default. Defaults + * are permitted to be changed freely - they are not recognised when matching. + * + * + * An led-tca6507 device must be provided with platform data. This data + * lists for each output: the name, default trigger, and whether the signal + * is being used as a GPIO rather than an led. 'struct led_platform_data' + * is used for this. If 'name' is NULL, the output isn't used. If 'flags' + * is TCA6507_MAKE_GPIO, the output is a GPO. + * The "struct led_platform_data" can be embedded in a + * "struct tca6507_platform_data" which adds a 'gpio_base' for the GPIOs, + * and a 'setup' callback which is called once the GPIOs are available.
+ * + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/leds.h> +#include <linux/err.h> +#include <linux/i2c.h> +#include <linux/gpio.h> +#include <linux/workqueue.h> +#include <linux/leds-tca6507.h> + +/* LED select registers determine the source that drives LED outputs */ +#define TCA6507_LS_LED_OFF 0x0 /* Output HI-Z (off) */ +#define TCA6507_LS_LED_OFF1 0x1 /* Output HI-Z (off) - not used */ +#define TCA6507_LS_LED_PWM0 0x2 /* Output LOW with Bank0 rate */ +#define TCA6507_LS_LED_PWM1 0x3 /* Output LOW with Bank1 rate */ +#define TCA6507_LS_LED_ON 0x4 /* Output LOW (on) */ +#define TCA6507_LS_LED_MIR 0x5 /* Output LOW with Master Intensity */ +#define TCA6507_LS_BLINK0 0x6 /* Blink at Bank0 rate */ +#define TCA6507_LS_BLINK1 0x7 /* Blink at Bank1 rate */ + +enum { + BANK0, + BANK1, + MASTER, +}; +static int bank_source[3] = { + TCA6507_LS_LED_PWM0, + TCA6507_LS_LED_PWM1, + TCA6507_LS_LED_MIR, +}; +static int blink_source[2] = { + TCA6507_LS_BLINK0, + TCA6507_LS_BLINK1, +}; + +/* PWM registers */ +#define TCA6507_REG_CNT 11 + +/* + * 0x00, 0x01, 0x02 encode the TCA6507_LS_* values, each output + * owns one bit in each register + */ +#define TCA6507_FADE_ON 0x03 +#define TCA6507_FULL_ON 0x04 +#define TCA6507_FADE_OFF 0x05 +#define TCA6507_FIRST_OFF 0x06 +#define TCA6507_SECOND_OFF 0x07 +#define TCA6507_MAX_INTENSITY 0x08 +#define TCA6507_MASTER_INTENSITY 0x09 +#define TCA6507_INITIALIZE 0x0A + +#define INIT_CODE 0x8 + +#define TIMECODES 16 +static int time_codes[TIMECODES] = { + 0, 64, 128, 192, 256, 384, 512, 768, + 1024, 1536, 2048, 3072, 4096, 5760, 8128, 16320 +}; + +/* Convert an led.brightness level (0..255) to a TCA6507 level (0..15) */ +static inline int TO_LEVEL(int brightness) +{ + return brightness >> 4; +} + +/* ...and convert back */ +static inline int TO_BRIGHT(int level) +{ + if (level) + return (level << 4) | 0xf; + return 0; +} + +#define NUM_LEDS 7 +struct tca6507_chip { + int reg_set; /* One bit per register where + * a '1' means the register + * should be written */ + u8 reg_file[TCA6507_REG_CNT]; + /* Bank 2 is Master Intensity and doesn't use times
*/ + struct bank { + int level; + int ontime, offtime; + int on_dflt, off_dflt; + int time_use, level_use; + } bank[3]; + struct i2c_client *client; + struct work_struct work; + spinlock_t lock; + + struct tca6507_led { + struct tca6507_chip *chip; + struct led_classdev led_cdev; + int num; + int ontime, offtime; + int on_dflt, off_dflt; + int bank; /* Bank used, or -1 */ + int blink; /* Set if hardware-blinking */ + } leds[NUM_LEDS]; +#ifdef CONFIG_GPIOLIB + struct gpio_chip gpio; + const char *gpio_name[NUM_LEDS]; + int gpio_map[NUM_LEDS]; +#endif +}; + +static const struct i2c_device_id tca6507_id[] = { + { "tca6507" }, + { } +}; +MODULE_DEVICE_TABLE(i2c, tca6507_id); + +static int choose_times(int msec, int *c1p, int *c2p) +{ + /* + * Choose two timecodes which add to 'msec' as near as possible. + * The first returned is the 'on' or 'off' time. The second is to be + * used as a 'fade-on' or 'fade-off' time. If 'msec' is even, + * the first will not be smaller than the second. If 'msec' is odd, + * the first will not be larger than the second. + * If we cannot get a sum within 1/8 of 'msec' fail with -EINVAL, + * otherwise return the sum that was achieved, plus 1 if the first is + * smaller. + * If two possibilities are equally good (e.g. 512+0, 256+256), choose + * the first pair so there is more change-time visible (i.e. it is + * softer). + */ + int c1, c2; + int tmax = msec * 9 / 8; + int tmin = msec * 7 / 8; + int diff = 65536; + + /* We start at '1' to ensure we never even think of choosing a + * total time of '0'. + */ + for (c1 = 1; c1 < TIMECODES; c1++) { + int t = time_codes[c1]; + if (t*2 < tmin) + continue; + if (t > tmax) + break; + for (c2 = 0; c2 <= c1; c2++) { + int tt = t + time_codes[c2]; + int d; + if (tt < tmin) + continue; + if (tt > tmax) + break; + /* This works! 
*/ + d = abs(msec - tt); + if (d >= diff) + continue; + /* Best yet */ + *c1p = c1; + *c2p = c2; + diff = d; + if (d == 0) + return msec; + } + } + if (diff < 65536) { + int actual; + if (msec & 1) { + c1 = *c2p; + *c2p = *c1p; + *c1p = c1; + } + actual = time_codes[*c1p] + time_codes[*c2p]; + if (*c1p < *c2p) + return actual + 1; + else + return actual; + } + /* No close match */ + return -EINVAL; +} + +/* + * Update the register file with the appropriate 3-bit state for + * the given led. + */ +static void set_select(struct tca6507_chip *tca, int led, int val) +{ + int mask = (1 << led); + int bit; + + for (bit = 0; bit < 3; bit++) { + int n = tca->reg_file[bit] & ~mask; + if (val & (1 << bit)) + n |= mask; + if (tca->reg_file[bit] != n) { + tca->reg_file[bit] = n; + tca->reg_set |= (1 << bit); + } + } +} + +/* Update the register file with the appropriate 4-bit code for + * one bank or other. This can be used for timers, for levels, or + * for initialisation. + */ +static void set_code(struct tca6507_chip *tca, int reg, int bank, int new) +{ + int mask = 0xF; + int n; + if (bank) { + mask <<= 4; + new <<= 4; + } + n = tca->reg_file[reg] & ~mask; + n |= new; + if (tca->reg_file[reg] != n) { + tca->reg_file[reg] = n; + tca->reg_set |= 1 << reg; + } +} + +/* Update brightness level. 
*/ +static void set_level(struct tca6507_chip *tca, int bank, int level) +{ + switch (bank) { + case BANK0: + case BANK1: + set_code(tca, TCA6507_MAX_INTENSITY, bank, level); + break; + case MASTER: + set_code(tca, TCA6507_MASTER_INTENSITY, 0, level); + break; + } + tca->bank[bank].level = level; +} + +/* Record all relevant time code for a given bank */ +static void set_times(struct tca6507_chip *tca, int bank) +{ + int c1, c2; + int result; + + result = choose_times(tca->bank[bank].ontime, &c1, &c2); + dev_dbg(&tca->client->dev, + "Chose on times %d(%d) %d(%d) for %dms\n", c1, time_codes[c1], + c2, time_codes[c2], tca->bank[bank].ontime); + set_code(tca, TCA6507_FADE_ON, bank, c2); + set_code(tca, TCA6507_FULL_ON, bank, c1); + tca->bank[bank].ontime = result; + + result = choose_times(tca->bank[bank].offtime, &c1, &c2); + dev_dbg(&tca->client->dev, + "Chose off times %d(%d) %d(%d) for %dms\n", c1, time_codes[c1], + c2, time_codes[c2], tca->bank[bank].offtime); + set_code(tca, TCA6507_FADE_OFF, bank, c2); + set_code(tca, TCA6507_FIRST_OFF, bank, c1); + set_code(tca, TCA6507_SECOND_OFF, bank, c1); + tca->bank[bank].offtime = result; + + set_code(tca, TCA6507_INITIALIZE, bank, INIT_CODE); +} + +/* Write all needed register of tca6507 */ + +static void tca6507_work(struct work_struct *work) +{ + struct tca6507_chip *tca = container_of(work, struct tca6507_chip, + work); + struct i2c_client *cl = tca->client; + int set; + u8 file[TCA6507_REG_CNT]; + int r; + + spin_lock_irq(&tca->lock); + set = tca->reg_set; + memcpy(file, tca->reg_file, TCA6507_REG_CNT); + tca->reg_set = 0; + spin_unlock_irq(&tca->lock); + + for (r = 0; r < TCA6507_REG_CNT; r++) + if (set & (1<<r)) + i2c_smbus_write_byte_data(cl, r, file[r]); +} + +static void led_release(struct tca6507_led *led) +{ + /* If led owns any resource, release it. */ + struct tca6507_chip *tca = led->chip; + if (led->bank >= 0) { + struct bank *b = tca->bank + led->bank; + if (led->blink) + b->time_use--; + b->level_use--; + } + led->blink = 0; + led->bank = -1; +} + +static int led_prepare(struct tca6507_led *led) +{ + /* Assign this led to a bank, configuring that bank if necessary.
*/ + int level = TO_LEVEL(led->led_cdev.brightness); + struct tca6507_chip *tca = led->chip; + int c1, c2; + int i; + struct bank *b; + int need_init = 0; + + led->led_cdev.brightness = TO_BRIGHT(level); + if (level == 0) { + set_select(tca, led->num, TCA6507_LS_LED_OFF); + return 0; + } + + if (led->ontime == 0 || led->offtime == 0) { + /* + * Just set the brightness, choosing first usable bank. + * If none perfect, choose best. + * Count backwards so we check MASTER bank first + * to avoid wasting a timer. + */ + int best = -1;/* full-on */ + int diff = 15-level; + + if (level == 15) { + set_select(tca, led->num, TCA6507_LS_LED_ON); + return 0; + } + + for (i = MASTER; i >= BANK0; i--) { + int d; + if (tca->bank[i].level == level || + tca->bank[i].level_use == 0) { + best = i; + break; + } + d = abs(level - tca->bank[i].level); + if (d < diff) { + diff = d; + best = i; + } + } + if (best == -1) { + /* Best brightness is full-on */ + set_select(tca, led->num, TCA6507_LS_LED_ON); + led->led_cdev.brightness = LED_FULL; + return 0; + } + + if (!tca->bank[best].level_use) + set_level(tca, best, level); + + tca->bank[best].level_use++; + led->bank = best; + set_select(tca, led->num, bank_source[best]); + led->led_cdev.brightness = TO_BRIGHT(tca->bank[best].level); + return 0; + } + + /* + * We have on/off time so we need to try to allocate a timing bank. + * First check if times are compatible with hardware and give up if + * not. + */ + if (choose_times(led->ontime, &c1, &c2) < 0) + return -EINVAL; + if (choose_times(led->offtime, &c1, &c2) < 0) + return -EINVAL; + + for (i = BANK0; i <= BANK1; i++) { + if (tca->bank[i].level_use == 0) + /* not in use - it is ours! */ + break; + if (tca->bank[i].level != level) + /* Incompatible level - skip */ + /* FIX: if timer matches we maybe should consider + * this anyway... 
+ */ + continue; + + if (tca->bank[i].time_use == 0) + /* Timer not in use, and level matches - use it */ + break; + + if (!(tca->bank[i].on_dflt || + led->on_dflt || + tca->bank[i].ontime == led->ontime)) + /* on time is incompatible */ + continue; + + if (!(tca->bank[i].off_dflt || + led->off_dflt || + tca->bank[i].offtime == led->offtime)) + /* off time is incompatible */ + continue; + + /* looks like a suitable match */ + break; + } + + if (i > BANK1) + /* Nothing matches - how sad */ + return -EINVAL; + + b = &tca->bank[i]; + if (b->level_use == 0) + set_level(tca, i, level); + b->level_use++; + led->bank = i; + + if (b->on_dflt || + !led->on_dflt || + b->time_use == 0) { + b->ontime = led->ontime; + b->on_dflt = led->on_dflt; + need_init = 1; + } + + if (b->off_dflt || + !led->off_dflt || + b->time_use == 0) { + b->offtime = led->offtime; + b->off_dflt = led->off_dflt; + need_init = 1; + } + + if (need_init) + set_times(tca, i); + + led->ontime = b->ontime; + led->offtime = b->offtime; + + b->time_use++; + led->blink = 1; + led->led_cdev.brightness = TO_BRIGHT(b->level); + set_select(tca, led->num, blink_source[i]); + return 0; +} + +static int led_assign(struct tca6507_led *led) +{ + struct tca6507_chip *tca = led->chip; + int err; + unsigned long flags; + + spin_lock_irqsave(&tca->lock, flags); + led_release(led); + err = led_prepare(led); + if (err) { + /* + * Can only fail on timer setup. In that case we need to + * re-establish as steady level. 
+ */ + led->ontime = 0; + led->offtime = 0; + led_prepare(led); + } + spin_unlock_irqrestore(&tca->lock, flags); + + if (tca->reg_set) + schedule_work(&tca->work); + return err; +} + +static void tca6507_brightness_set(struct led_classdev *led_cdev, + enum led_brightness brightness) +{ + struct tca6507_led *led = container_of(led_cdev, struct tca6507_led, + led_cdev); + led->led_cdev.brightness = brightness; + led->ontime = 0; + led->offtime = 0; + led_assign(led); +} + +static int tca6507_blink_set(struct led_classdev *led_cdev, + unsigned long *delay_on, + unsigned long *delay_off) +{ + struct tca6507_led *led = container_of(led_cdev, struct tca6507_led, + led_cdev); + + if (*delay_on == 0) + led->on_dflt = 1; + else if (delay_on != &led_cdev->blink_delay_on) + led->on_dflt = 0; + led->ontime = *delay_on; + + if (*delay_off == 0) + led->off_dflt = 1; + else if (delay_off != &led_cdev->blink_delay_off) + led->off_dflt = 0; + led->offtime = *delay_off; + + if (led->ontime == 0) + led->ontime = 512; + if (led->offtime == 0) + led->offtime = 512; + + if (led->led_cdev.brightness == LED_OFF) + led->led_cdev.brightness = LED_FULL; + if (led_assign(led) < 0) { + led->ontime = 0; + led->offtime = 0; + led->led_cdev.brightness = LED_OFF; + return -EINVAL; + } + *delay_on = led->ontime; + *delay_off = led->offtime; + return 0; +} + +#ifdef CONFIG_GPIOLIB +static void tca6507_gpio_set_value(struct gpio_chip *gc, + unsigned offset, int val) +{ + struct tca6507_chip *tca = container_of(gc, struct tca6507_chip, gpio); + unsigned long flags; + + spin_lock_irqsave(&tca->lock, flags); + /* + * 'OFF' is floating high, and 'ON' is pulled down, so it has the + * inverse sense of 'val'. + */ + set_select(tca, tca->gpio_map[offset], + val ? 
TCA6507_LS_LED_OFF : TCA6507_LS_LED_ON); + spin_unlock_irqrestore(&tca->lock, flags); + if (tca->reg_set) + schedule_work(&tca->work); +} + +static int tca6507_gpio_direction_output(struct gpio_chip *gc, + unsigned offset, int val) +{ + tca6507_gpio_set_value(gc, offset, val); + return 0; +} + +static int tca6507_probe_gpios(struct i2c_client *client, + struct tca6507_chip *tca, + struct tca6507_platform_data *pdata) +{ + int err; + int i = 0; + int gpios = 0; + + for (i = 0; i < NUM_LEDS; i++) + if (pdata->leds.leds[i].name && pdata->leds.leds[i].flags) { + /* Configure as a gpio */ + tca->gpio_name[gpios] = pdata->leds.leds[i].name; + tca->gpio_map[gpios] = i; + gpios++; + } + + if (!gpios) + return 0; + + tca->gpio.label = "gpio-tca6507"; + tca->gpio.names = tca->gpio_name; + tca->gpio.ngpio = gpios; + tca->gpio.base = pdata->gpio_base; + tca->gpio.owner = THIS_MODULE; + tca->gpio.direction_output = tca6507_gpio_direction_output; + tca->gpio.set = tca6507_gpio_set_value; + tca->gpio.dev = &client->dev; + err = gpiochip_add(&tca->gpio); + if (err) { + tca->gpio.ngpio = 0; + return err; + } + if (pdata->setup) + pdata->setup(tca->gpio.base, tca->gpio.ngpio); + return 0; +} + +static void tca6507_remove_gpio(struct tca6507_chip *tca) +{ + if (tca->gpio.ngpio) { + int err = gpiochip_remove(&tca->gpio); + dev_err(&tca->client->dev, "%s failed, %d\n", + "gpiochip_remove()", err); + } +} +#else /* CONFIG_GPIOLIB */ +static int tca6507_probe_gpios(struct i2c_client *client, + struct tca6507_chip *tca, + struct tca6507_platform_data *pdata) +{ + return 0; +} +static void tca6507_remove_gpio(struct tca6507_chip *tca) +{ +} +#endif /* CONFIG_GPIOLIB */ + +static int __devinit tca6507_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct tca6507_chip *tca; + struct i2c_adapter *adapter; + struct tca6507_platform_data *pdata; + int err; + int i = 0; + + adapter = to_i2c_adapter(client->dev.parent); + pdata = client->dev.platform_data; + + if 
(!i2c_check_functionality(adapter, I2C_FUNC_I2C)) + return -EIO; + + if (!pdata || pdata->leds.num_leds != NUM_LEDS) { + dev_err(&client->dev, "Need %d entries in platform-data list\n", + NUM_LEDS); + return -ENODEV; + } + err = -ENOMEM; + tca = kzalloc(sizeof(*tca), GFP_KERNEL); + if (!tca) + goto exit; + + tca->client = client; + INIT_WORK(&tca->work, tca6507_work); + spin_lock_init(&tca->lock); + i2c_set_clientdata(client, tca); + + for (i = 0; i < NUM_LEDS; i++) { + struct tca6507_led *l = tca->leds + i; + + l->chip = tca; + l->num = i; + if (pdata->leds.leds[i].name && !pdata->leds.leds[i].flags) { + l->led_cdev.name = pdata->leds.leds[i].name; + l->led_cdev.default_trigger + = pdata->leds.leds[i].default_trigger; + l->led_cdev.brightness_set = tca6507_brightness_set; + l->led_cdev.blink_set = tca6507_blink_set; + l->bank = -1; + err = led_classdev_register(&client->dev, + &l->led_cdev); + if (err < 0) + goto exit; + } + } + err = tca6507_probe_gpios(client, tca, pdata); + if (err) + goto exit; + /* set all registers to known state - zero */ + tca->reg_set = 0x7f; + schedule_work(&tca->work); + + return 0; +exit: + while (i--) + if (tca->leds[i].led_cdev.name) + led_classdev_unregister(&tca->leds[i].led_cdev); + cancel_work_sync(&tca->work); + i2c_set_clientdata(client, NULL); + kfree(tca); + return err; +} + +static int __devexit tca6507_remove(struct i2c_client *client) +{ + int i; + struct tca6507_chip *tca = i2c_get_clientdata(client); + struct tca6507_led *tca_leds = tca->leds; + + for (i = 0; i < NUM_LEDS; i++) { + if (tca_leds[i].led_cdev.name) + led_classdev_unregister(&tca_leds[i].led_cdev); + } + tca6507_remove_gpio(tca); + cancel_work_sync(&tca->work); + i2c_set_clientdata(client, NULL); + kfree(tca); + + return 0; +} + +static struct i2c_driver tca6507_driver = { + .driver = { + .name = "leds-tca6507", + .owner = THIS_MODULE, + }, + .probe = tca6507_probe, + .remove = __devexit_p(tca6507_remove), + .id_table = tca6507_id, +}; + +static int __init 
tca6507_leds_init(void) +{ + return i2c_add_driver(&tca6507_driver); +} + +static void __exit tca6507_leds_exit(void) +{ + i2c_del_driver(&tca6507_driver); +} + +module_init(tca6507_leds_init); +module_exit(tca6507_leds_exit); + +MODULE_AUTHOR("NeilBrown "); +MODULE_DESCRIPTION("TCA6507 LED/GPO driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/leds-tca6507.h b/include/linux/leds-tca6507.h new file mode 100644 index 0000000..dcabf4f --- /dev/null +++ b/include/linux/leds-tca6507.h @@ -0,0 +1,34 @@ +/* + * TCA6507 LED chip driver. + * + * Copyright (C) 2011 Neil Brown + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + */ + +#ifndef __LINUX_TCA6507_H +#define __LINUX_TCA6507_H +#include + +struct tca6507_platform_data { + struct led_platform_data leds; +#ifdef CONFIG_GPIOLIB + int gpio_base; + void (*setup)(unsigned gpio_base, unsigned ngpio); +#endif +}; + +#define TCA6507_MAKE_GPIO 1 +#endif /* __LINUX_TCA6507_H*/ -- cgit v0.10.2 From 3b080945aa7670354364c8f9e1a3a07cbb97beb3 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Tue, 10 Jan 2012 15:09:43 -0800 Subject: drivers/leds/leds-mc13783.c: fix off-by-one for checking num_leds The LED id begins from 0. Thus the maximum number of leds should be MC13783_LED_MAX + 1. 
Signed-off-by: Axel Lin Acked-by: Philippe Retornaz Cc: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/leds/leds-mc13783.c b/drivers/leds/leds-mc13783.c index c61e8c4..8bc4915 100644 --- a/drivers/leds/leds-mc13783.c +++ b/drivers/leds/leds-mc13783.c @@ -275,7 +275,7 @@ static int __devinit mc13783_led_probe(struct platform_device *pdev) return -ENODEV; } - if (pdata->num_leds < 1 || pdata->num_leds > MC13783_LED_MAX) { + if (pdata->num_leds < 1 || pdata->num_leds > (MC13783_LED_MAX + 1)) { dev_err(&pdev->dev, "Invalid led count %d\n", pdata->num_leds); return -EINVAL; } -- cgit v0.10.2 From 1713cb9d6069fac581fcea928f65ca6ca7c9facf Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 10 Jan 2012 15:09:45 -0800 Subject: leds: convert wm831x status driver to devm_kzalloc() Saves a small amount of code and systematically eliminates leaks. Signed-off-by: Mark Brown Cc: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/leds/leds-wm831x-status.c b/drivers/leds/leds-wm831x-status.c index 444a68d..74a24cf 100644 --- a/drivers/leds/leds-wm831x-status.c +++ b/drivers/leds/leds-wm831x-status.c @@ -237,7 +237,8 @@ static int wm831x_status_probe(struct platform_device *pdev) goto err; } - drvdata = kzalloc(sizeof(struct wm831x_status), GFP_KERNEL); + drvdata = devm_kzalloc(&pdev->dev, sizeof(struct wm831x_status), + GFP_KERNEL); if (!drvdata) return -ENOMEM; dev_set_drvdata(&pdev->dev, drvdata); @@ -300,7 +301,6 @@ static int wm831x_status_probe(struct platform_device *pdev) err_led: led_classdev_unregister(&drvdata->cdev); - kfree(drvdata); err: return ret; } @@ -311,7 +311,6 @@ static int wm831x_status_remove(struct platform_device *pdev) device_remove_file(drvdata->cdev.dev, &dev_attr_src); led_classdev_unregister(&drvdata->cdev); - kfree(drvdata); return 0; } -- cgit v0.10.2 From c957b614bde8539416dcde8d702370ff30b1c662 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 10 Jan 
2012 15:09:46 -0800 Subject: leds: convert wm8350 driver to devm_kzalloc() Saves a small amount of code and systematically eliminates leaks. Signed-off-by: Mark Brown Cc: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/leds/leds-wm8350.c b/drivers/leds/leds-wm8350.c index 390c0f6..918d4ba 100644 --- a/drivers/leds/leds-wm8350.c +++ b/drivers/leds/leds-wm8350.c @@ -227,7 +227,7 @@ static int wm8350_led_probe(struct platform_device *pdev) goto err_isink; } - led = kzalloc(sizeof(*led), GFP_KERNEL); + led = devm_kzalloc(&pdev->dev, sizeof(*led), GFP_KERNEL); if (led == NULL) { ret = -ENOMEM; goto err_dcdc; @@ -259,12 +259,10 @@ static int wm8350_led_probe(struct platform_device *pdev) ret = led_classdev_register(&pdev->dev, &led->cdev); if (ret < 0) - goto err_led; + goto err_dcdc; return 0; - err_led: - kfree(led); err_dcdc: regulator_put(dcdc); err_isink: @@ -281,7 +279,6 @@ static int wm8350_led_remove(struct platform_device *pdev) wm8350_led_disable(led); regulator_put(led->dcdc); regulator_put(led->isink); - kfree(led); return 0; } -- cgit v0.10.2 From 96b62067f970ff529c98913311d33f4b57b453dc Mon Sep 17 00:00:00 2001 From: Steve Hodgson Date: Tue, 10 Jan 2012 15:09:47 -0800 Subject: btree: export btree_get_prev() so modules can use btree_for_each The btree_for_each API is implemented with macros that internally call btree_get_prev(), so if btree_get_prev() isn't exported then modules fail to link if they try to use one of the btree_for_each macros. Since the rest of the btree API is exported, we should keep things orthogonal and make this work too. 
Signed-off-by: Roland Dreier Signed-off-by: Steve Hodgson Acked-by: Joern Engel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/btree.c b/lib/btree.c index 2a34392..e5ec1e9 100644 --- a/lib/btree.c +++ b/lib/btree.c @@ -357,6 +357,7 @@ miss: } return NULL; } +EXPORT_SYMBOL_GPL(btree_get_prev); static int getpos(struct btree_geo *geo, unsigned long *node, unsigned long *key) -- cgit v0.10.2 From 270c49a088ae58d4b817861bb04bfec63b0966db Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:09:50 -0800 Subject: checkpatch: update signature "might be better as" warning email header lines can look like signature tags. It's valid to have multiple email recipients on a single line but not valid to have multiple signatures on a single line. Validate signatures only when not in the email headers. Clear the $in_commit_log flag when the patch filename appears. Add '-' to the valid chars in a message header for headers like "Message-Id:" and "In-Reply-To:". 
Signed-off-by: Joe Perches Reported-by: Julia Lawall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 8fda3b3..885e3b4 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1504,9 +1504,11 @@ sub process { if ($line =~ /^diff --git.*?(\S+)$/) { $realfile = $1; $realfile =~ s@^([^/]*)/@@; + $in_commit_log = 0; } elsif ($line =~ /^\+\+\+\s+(\S+)/) { $realfile = $1; $realfile =~ s@^([^/]*)/@@; + $in_commit_log = 0; $p1_prefix = $1; if (!$file && $tree && $p1_prefix ne '' && @@ -1546,7 +1548,8 @@ sub process { } # Check signature styles - if ($line =~ /^(\s*)($signature_tags)(\s*)(.*)/) { + if (!$in_header_lines && + $line =~ /^(\s*)($signature_tags)(\s*)(.*)/) { my $space_before = $1; my $sign_off = $2; my $space_after = $3; @@ -1623,7 +1626,7 @@ sub process { # Check if it's the start of a commit log # (not a header line and we haven't seen the patch filename) if ($in_header_lines && $realfile =~ /^$/ && - $rawline !~ /^(commit\b|from\b|\w+:).+$/i) { + $rawline !~ /^(commit\b|from\b|[\w-]+:).+$/i) { $in_header_lines = 0; $in_commit_log = 1; } -- cgit v0.10.2 From 5f14d3bd87ef5f979ea64c1f0862534d71786db7 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:09:52 -0800 Subject: checkpatch: prefer __printf over __attribute__((format(printf,...))) Add a warn for not using __printf. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 885e3b4..e94626c 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3114,6 +3114,12 @@ sub process { "__aligned(size) is preferred over __attribute__((aligned(size)))\n" . 
$herecurr); } +# Check for __attribute__ format(printf, prefer __printf + if ($line =~ /\b__attribute__\s*\(\s*\(\s*format\s*\(\s*printf/) { + WARN("PREFER_PRINTF", + "__printf(string-index, first-to-check) is preferred over __attribute__((format(printf, string-index, first-to-check)))\n" . $herecurr); + } + # check for sizeof(&) if ($line =~ /\bsizeof\s*\(\s*\&/) { WARN("SIZEOF_ADDRESS", -- cgit v0.10.2 From f74bd1942e04a0cedd1e9c8b331141e75add49c0 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 10 Jan 2012 15:09:54 -0800 Subject: checkpatch: correctly track the end of preprocessor commands in context When looking for a statement we currently run on through preprocessor commands. This means that a header file with just definitions is parsed over and over again combining all of the lines from the current line to the end of file leading to severe performance issues. Fix up context accumulation to track preprocessor commands and stop when reaching the end of them. At the same time vastly simplify the #define handling. Signed-off-by: Andy Whitcroft Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index e94626c..06e22ca 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -676,6 +676,10 @@ sub ctx_statement_block { if ($off >= $len) { last; } + if ($level == 0 && substr($blk, $off) =~ /^.\s*#\s*define/) { + $level++; + $type = '#'; + } } $p = $c; $c = substr($blk, $off, 1); @@ -738,6 +742,13 @@ sub ctx_statement_block { last; } } + # Preprocessor commands end at the newline unless escaped. + if ($type eq '#' && $c eq "\n" && $p ne "\\") { + $level--; + $type = ''; + $off++; + last; + } $off++; } # We are truly at the end, so shuffle to the next line. @@ -1801,6 +1812,8 @@ sub process { $stat =~ s/\n./\n /g; $cond =~ s/\n./\n /g; +#print "stat<$stat>\n"; + # Find the real next line. 
$realline_next = $line_nr_next; if (defined $realline_next && @@ -2781,47 +2794,13 @@ sub process { my $cnt = $realcnt; my ($off, $dstat, $dcond, $rest); my $ctx = ''; - - my $args = defined($1); - - # Find the end of the macro and limit our statement - # search to that. - while ($cnt > 0 && defined $lines[$ln - 1] && - $lines[$ln - 1] =~ /^(?:-|..*\\$)/) - { - $ctx .= $rawlines[$ln - 1] . "\n"; - $cnt-- if ($lines[$ln - 1] !~ /^-/); - $ln++; - } - $ctx .= $rawlines[$ln - 1]; - ($dstat, $dcond, $ln, $cnt, $off) = - ctx_statement_block($linenr, $ln - $linenr + 1, 0); + ctx_statement_block($linenr, $realcnt, 0); + $ctx = $dstat; #print "dstat<$dstat> dcond<$dcond> cnt<$cnt> off<$off>\n"; #print "LINE<$lines[$ln-1]> len<" . length($lines[$ln-1]) . "\n"; - # Extract the remainder of the define (if any) and - # rip off surrounding spaces, and trailing \'s. - $rest = ''; - while ($off != 0 || ($cnt > 0 && $rest =~ /\\\s*$/)) { - #print "ADDING cnt<$cnt> $off <" . substr($lines[$ln - 1], $off) . "> rest<$rest>\n"; - if ($off != 0 || $lines[$ln - 1] !~ /^-/) { - $rest .= substr($lines[$ln - 1], $off) . "\n"; - $cnt--; - } - $ln++; - $off = 0; - } - $rest =~ s/\\\n.//g; - $rest =~ s/^\s*//s; - $rest =~ s/\s*$//s; - - # Clean up the original statement. - if ($args) { - substr($dstat, 0, length($dcond), ''); - } else { - $dstat =~ s/^.\s*\#\s*define\s+$Ident\s*//; - } + $dstat =~ s/^.\s*\#\s*define\s+$Ident(?:\([^\)]*\))?\s*//; $dstat =~ s/$;//g; $dstat =~ s/\\\n.//g; $dstat =~ s/^\s*//s; @@ -2847,23 +2826,32 @@ sub process { ^\"|\"$ }x; #print "REST<$rest> dstat<$dstat> ctx<$ctx>\n"; - if ($rest ne '' && $rest ne ',') { - if ($rest !~ /while\s*\(/ && - $dstat !~ /$exceptions/) - { - ERROR("MULTISTATEMENT_MACRO_USE_DO_WHILE", - "Macros with multiple statements should be enclosed in a do - while loop\n" . 
"$here\n$ctx\n"); + if ($dstat ne '' && + $dstat !~ /^(?:$Ident|-?$Constant),$/ && # 10, // foo(), + $dstat !~ /^(?:$Ident|-?$Constant);$/ && # foo(); + $dstat !~ /^(?:$Ident|-?$Constant)$/ && # 10 // foo() + $dstat !~ /$exceptions/ && + $dstat !~ /^\.$Ident\s*=/ && # .foo = + $dstat !~ /^do\s*$Constant\s*while\s*$Constant;$/ && # do {...} while (...); + $dstat !~ /^for\s*$Constant$/ && # for (...) + $dstat !~ /^for\s*$Constant\s+(?:$Ident|-?$Constant)$/ && # for (...) bar() + $dstat !~ /^do\s*{/ && # do {... + $dstat !~ /^\({/) # ({... + { + $ctx =~ s/\n*$//; + my $herectx = $here . "\n"; + my $cnt = statement_rawlines($ctx); + + for (my $n = 0; $n < $cnt; $n++) { + $herectx .= raw_line($linenr, $n) . "\n"; } - } elsif ($ctx !~ /;/) { - if ($dstat ne '' && - $dstat !~ /^(?:$Ident|-?$Constant)$/ && - $dstat !~ /$exceptions/ && - $dstat !~ /^\.$Ident\s*=/ && - $dstat =~ /$Operators/) - { + if ($dstat =~ /;/) { + ERROR("MULTISTATEMENT_MACRO_USE_DO_WHILE", + "Macros with multiple statements should be enclosed in a do - while loop\n" . "$herectx"); + } else { ERROR("COMPLEX_MACRO", - "Macros with complex values should be enclosed in parenthesis\n" . "$here\n$ctx\n"); + "Macros with complex values should be enclosed in parenthesis\n" . "$herectx"); } } } -- cgit v0.10.2 From 554e165cf32610ec9596a183fa1b54a5707fc3cb Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 10 Jan 2012 15:09:57 -0800 Subject: checkpatch: check for common memset parameter issues against statments Move the memset checks over to work against the statement. Also add checks for 0 and 1 used as lengths. Generally these indicate badly ordered parameters. Signed-off-by: Andy Whitcroft Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 06e22ca..8199d59 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3120,6 +3120,28 @@ sub process { "Avoid line continuations in quoted strings\n" . 
$herecurr); } +# Check for misused memsets + if (defined $stat && $stat =~ /\bmemset\s*\((.*)\)/s) { + my $args = $1; + + # Flatten any parentheses and braces + while ($args =~ s/\([^\(\)]*\)/10/s || + $args =~ s/\{[^\{\}]*\}/10/s || + $args =~ s/\[[^\[\]]*\]/10/s) + { + } + # Extract the simplified arguments. + my ($ms_addr, $ms_val, $ms_size) = + split(/\s*,\s*/, $args); + if ($ms_size =~ /^(0x|)0$/i) { + ERROR("MEMSET", + "memset size is 3rd argument, not the second.\n" . $herecurr); + } elsif ($ms_size =~ /^(0x|)1$/i) { + WARN("MEMSET", + "single byte memset is suspicious. Swapped 2nd/3rd argument?\n" . $herecurr); + } + } + # check for new externs in .c files. if ($realfile =~ /\.c$/ && defined $stat && $stat =~ /^.\s*(?:extern\s+)?$Type\s+($Ident)(\s*)\(/s) @@ -3291,12 +3313,6 @@ sub process { WARN("EXPORTED_WORLD_WRITABLE", "Exporting world writable files is usually an error. Consider more restrictive permissions.\n" . $herecurr); } - - # Check for memset with swapped arguments - if ($line =~ /memset.*\,(\ |)(0x|)0(\ |0|)\);/) { - ERROR("MEMSET", - "memset size is 3rd argument, not the second.\n" . $herecurr); - } } # If we have no input at all, then there is nothing to report on -- cgit v0.10.2 From d7c76ba7e58bc3ca674f20759c686535db484749 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:09:58 -0800 Subject: checkpatch: improve memset and min/max with cast checking Improve the checking of arguments to memset and min/max tests. Move the checking of min/max to statement blocks instead of single line. Change $Constant to allow any case type 0x initiator and trailing ul specifier. Add $FuncArg type as any function argument with or without a cast. Print the whole statement when showing memset or min/max messages. Improve the memset with 0 as 3rd argument error message. There are still weaknesses in the $FuncArg and $Constant code as arbitrary parentheses and negative signs are not generically supported. 
[akpm@linux-foundation.org: fix per Andy] Signed-off-by: Joe Perches Acked-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 8199d59..4c53d6f 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -227,7 +227,7 @@ our $Inline = qr{inline|__always_inline|noinline}; our $Member = qr{->$Ident|\.$Ident|\[[^]]*\]}; our $Lval = qr{$Ident(?:$Member)*}; -our $Constant = qr{(?:[0-9]+|0x[0-9a-fA-F]+)[UL]*}; +our $Constant = qr{(?i:(?:[0-9]+|0x[0-9a-f]+)[ul]*)}; our $Assignment = qr{(?:\*\=|/=|%=|\+=|-=|<<=|>>=|&=|\^=|\|=|=)}; our $Compare = qr{<=|>=|==|!=|<|>}; our $Operators = qr{ @@ -334,6 +334,7 @@ our $match_balanced_parentheses = qr/(\((?:[^\(\)]+|(-1))*\))/; our $Typecast = qr{\s*(\(\s*$NonptrType\s*\)){0,1}\s*}; our $LvalOrFunc = qr{($Lval)\s*($match_balanced_parentheses{0,1})\s*}; +our $FuncArg = qr{$Typecast{0,1}($LvalOrFunc|$Constant)}; sub deparenthesize { my ($string) = @_; @@ -2609,28 +2610,6 @@ sub process { } } -# typecasts on min/max could be min_t/max_t - if ($line =~ /^\+(?:.*?)\b(min|max)\s*\($Typecast{0,1}($LvalOrFunc)\s*,\s*$Typecast{0,1}($LvalOrFunc)\s*\)/) { - if (defined $2 || defined $8) { - my $call = $1; - my $cast1 = deparenthesize($2); - my $arg1 = $3; - my $cast2 = deparenthesize($8); - my $arg2 = $9; - my $cast; - - if ($cast1 ne "" && $cast2 ne "") { - $cast = "$cast1 or $cast2"; - } elsif ($cast1 ne "") { - $cast = $cast1; - } else { - $cast = $cast2; - } - WARN("MINMAX", - "$call() should probably be ${call}_t($cast, $arg1, $arg2)\n" . $herecurr); - } - } - # Need a space before open parenthesis after if, while etc if ($line=~/\b(if|while|for|switch)\(/) { ERROR("SPACING", "space required before the open parenthesis '('\n" . 
$herecurr); @@ -3121,24 +3100,42 @@ sub process { } # Check for misused memsets - if (defined $stat && $stat =~ /\bmemset\s*\((.*)\)/s) { - my $args = $1; + if (defined $stat && + $stat =~ /^\+(?:.*?)\bmemset\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*$FuncArg\s*\)/s) { + + my $ms_addr = $2; + my $ms_val = $8; + my $ms_size = $14; - # Flatten any parentheses and braces - while ($args =~ s/\([^\(\)]*\)/10/s || - $args =~ s/\{[^\{\}]*\}/10/s || - $args =~ s/\[[^\[\]]*\]/10/s) - { - } - # Extract the simplified arguments. - my ($ms_addr, $ms_val, $ms_size) = - split(/\s*,\s*/, $args); if ($ms_size =~ /^(0x|)0$/i) { ERROR("MEMSET", - "memset size is 3rd argument, not the second.\n" . $herecurr); + "memset to 0's uses 0 as the 2nd argument, not the 3rd\n" . "$here\n$stat\n"); } elsif ($ms_size =~ /^(0x|)1$/i) { WARN("MEMSET", - "single byte memset is suspicious. Swapped 2nd/3rd argument?\n" . $herecurr); + "single byte memset is suspicious. Swapped 2nd/3rd argument?\n" . "$here\n$stat\n"); + } + } + +# typecasts on min/max could be min_t/max_t + if (defined $stat && + $stat =~ /^\+(?:.*?)\b(min|max)\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\)/) { + if (defined $2 || defined $8) { + my $call = $1; + my $cast1 = deparenthesize($2); + my $arg1 = $3; + my $cast2 = deparenthesize($8); + my $arg2 = $9; + my $cast; + + if ($cast1 ne "" && $cast2 ne "") { + $cast = "$cast1 or $cast2"; + } elsif ($cast1 ne "") { + $cast = $cast1; + } else { + $cast = $cast2; + } + WARN("MINMAX", + "$call() should probably be ${call}_t($cast, $arg1, $arg2)\n" . "$here\n$stat\n"); } } -- cgit v0.10.2 From 89a883530fe79939384a6c6ed893c719762c7c9c Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 10 Jan 2012 15:10:00 -0800 Subject: checkpatch: ## is not a valid modifier Inserting a # into the modifiers list will incorrectly add the null string to the modifiers list, leading to an infinite loop. As neither of these is a valid modifier form simply ignore them. 
Signed-off-by: Andy Whitcroft Reported-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 4c53d6f..b4390cf 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1224,7 +1224,9 @@ sub possible { case| else| asm|__asm__| - do + do| + \#| + \#\#| )(?:\s|$)| ^(?:typedef|struct|enum)\b )}x; -- cgit v0.10.2 From 3e469cdc08ac5d84b220f8fb76a090d158d5114f Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 10 Jan 2012 15:10:01 -0800 Subject: checkpatch: optimise statement scanner when mid-statement In the middle of a long definition or similar, there is no possibility of finding a smaller sub-statement. Optimise this case by skipping statement acquisition where there are no starts of statement (open brace '{' or semi-colon ';'). We are likely to scan slightly more than needed still but this is safest. Signed-off-by: Andy Whitcroft Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index b4390cf..618c0b5 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1373,6 +1373,7 @@ sub process { my %suppress_ifbraces; my %suppress_whiletrailers; my %suppress_export; + my $suppress_statement = 0; # Pre-scan the patch sanitizing the lines. # Pre-scan the patch looking for any __setup documentation.
@@ -1482,6 +1483,7 @@ sub process { %suppress_ifbraces = (); %suppress_whiletrailers = (); %suppress_export = (); + $suppress_statement = 0; next; # track the line number as we move through the hunk, note that @@ -1809,13 +1811,23 @@ sub process { # Check for potential 'bare' types my ($stat, $cond, $line_nr_next, $remain_next, $off_next, $realline_next); - if ($realcnt && $line =~ /.\s*\S/) { +#print "LINE<$line>\n"; + if ($linenr >= $suppress_statement && + $realcnt && $line =~ /.\s*\S/) { ($stat, $cond, $line_nr_next, $remain_next, $off_next) = ctx_statement_block($linenr, $realcnt, 0); $stat =~ s/\n./\n /g; $cond =~ s/\n./\n /g; -#print "stat<$stat>\n"; +#print "linenr<$linenr> <$stat>\n"; + # If this statement has no statement boundaries within + # it there is no point in retrying a statement scan + # until we hit end of it. + my $frag = $stat; $frag =~ s/;+\s*$//; + if ($frag !~ /(?:{|;)/) { +#print "skip<$line_nr_next>\n"; + $suppress_statement = $line_nr_next; + } # Find the real next line. $realline_next = $line_nr_next; @@ -1942,6 +1954,9 @@ sub process { # Check relative indent for conditionals and blocks. if ($line =~ /\b(?:(?:if|while|for)\s*\(|do\b)/ && $line !~ /^.\s*#/ && $line !~ /\}\s*while\s*/) { + ($stat, $cond, $line_nr_next, $remain_next, $off_next) = + ctx_statement_block($linenr, $realcnt, 0) + if (!defined $stat); my ($s, $c) = ($stat, $cond); substr($s, 0, length($c), ''); @@ -2620,6 +2635,9 @@ sub process { # Check for illegal assignment in if conditional -- and check for trailing # statements after the conditional. 
if ($line =~ /do\s*(?!{)/) { + ($stat, $cond, $line_nr_next, $remain_next, $off_next) = + ctx_statement_block($linenr, $realcnt, 0) + if (!defined $stat); my ($stat_next) = ctx_statement_block($line_nr_next, $remain_next, $off_next); $stat_next =~ s/\n./\n /g; -- cgit v0.10.2 From a13858033a3a993147d190317cc9d709f0a1b819 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 10 Jan 2012 15:10:03 -0800 Subject: checkpatch: only apply kconfig help checks for options which prompt The intent of this check is to catch the options which the user will see and ensure they are properly described. It is also common for internal only options to have a brief description. Allow this form. Reported-by: Steven Rostedt Tested-by: Steven Rostedt Signed-off-by: Andy Whitcroft Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 618c0b5..d8ac16a 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1674,19 +1674,26 @@ sub process { # Only applies when adding the entry originally, after that we do not have # sufficient context to determine whether it is indeed long enough. if ($realfile =~ /Kconfig/ && - $line =~ /\+\s*(?:---)?help(?:---)?$/) { + $line =~ /.\s*config\s+/) { my $length = 0; my $cnt = $realcnt; my $ln = $linenr + 1; my $f; + my $is_start = 0; my $is_end = 0; - while ($cnt > 0 && defined $lines[$ln - 1]) { + for (; $cnt > 0 && defined $lines[$ln - 1]; $ln++) { $f = $lines[$ln - 1]; $cnt-- if ($lines[$ln - 1] !~ /^-/); $is_end = $lines[$ln - 1] =~ /^\+/; - $ln++; next if ($f =~ /^-/); + + if ($lines[$ln - 1] =~ /.\s*(?:bool|tristate)\s*\"/) { + $is_start = 1; + } elsif ($lines[$ln - 1] =~ /.\s*(?:---)?help(?:---)?$/) { + $length = -1; + } + $f =~ s/^.//; $f =~ s/#.*//; $f =~ s/^\s+//; @@ -1698,8 +1705,8 @@ sub process { $length++; } WARN("CONFIG_DESCRIPTION", - "please write a paragraph that describes the config symbol fully\n" . 
$herecurr) if ($is_end && $length < 4); - #print "is_end<$is_end> length<$length>\n"; + "please write a paragraph that describes the config symbol fully\n" . $herecurr) if ($is_start && $is_end && $length < 4); + #print "is_start<$is_start> is_end<$is_end> length<$length>\n"; } if (($realfile =~ /Makefile.*/ || $realfile =~ /Kbuild.*/) && -- cgit v0.10.2 From 87a53877185627b49a903023255425bda78f890c Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 10 Jan 2012 15:10:04 -0800 Subject: checkpatch: fix EXPORT_SYMBOL handling following a function The following fragment defeats the DEVICE_ATTR style handling, check for and ignore the close brace '}' in this context: int foo() { } DEVICE_ATTR(link_power_management_policy, S_IRUGO | S_IWUSR, ata_scsi_lpm_show, ata_scsi_lpm_put); EXPORT_SYMBOL_GPL(dev_attr_link_power_management_policy); Signed-off-by: Andy Whitcroft Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index d8ac16a..afc656d 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2131,7 +2131,7 @@ sub process { # XXX(foo); # EXPORT_SYMBOL(something_foo); my $name = $1; - if ($stat =~ /^.([A-Z_]+)\s*\(\s*($Ident)/ && + if ($stat =~ /^(?:.\s*}\s*\n)?.([A-Z_]+)\s*\(\s*($Ident)/ && $name =~ /^${Ident}_$2/) { #print "FOO C name<$name>\n"; $suppress_export{$realline_next} = 1; -- cgit v0.10.2 From 72f115f94d500fc72f78c5df8104a98f8b9cc273 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 10 Jan 2012 15:10:06 -0800 Subject: checkpatch: complex macro should allow the empty do while loop It is common to stub out a function as below, this is triggering a complex macro format incorrectly.
Sort this out: #define cma_early_regions_reserve(reserve) do { } while (0) Signed-off-by: Andy Whitcroft Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index afc656d..ca6d0fb 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2838,7 +2838,7 @@ sub process { $dstat !~ /^(?:$Ident|-?$Constant)$/ && # 10 // foo() $dstat !~ /$exceptions/ && $dstat !~ /^\.$Ident\s*=/ && # .foo = - $dstat !~ /^do\s*$Constant\s*while\s*$Constant;$/ && # do {...} while (...); + $dstat !~ /^do\s*$Constant\s*while\s*$Constant;?$/ && # do {...} while (...); // do {...} while (...) $dstat !~ /^for\s*$Constant$/ && # for (...) $dstat !~ /^for\s*$Constant\s+(?:$Ident|-?$Constant)$/ && # for (...) bar() $dstat !~ /^do\s*{/ && # do {... -- cgit v0.10.2 From e01886ada28741d7cb2cfb3224e9caccfbc1a2d5 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 10 Jan 2012 15:10:08 -0800 Subject: checkpatch: fix 'return is not a function' square bracket handling We are incorrectly matching square brackets '[' and ']' leading to false positives on more complex functions as below: return (dt3155_fbuffer[m]->ready_head - dt3155_fbuffer[m]->ready_len + dt3155_fbuffer[m]->nbuffers)% (dt3155_fbuffer[m]->nbuffers); Signed-off-by: Andy Whitcroft Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index ca6d0fb..5e3f419 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2609,7 +2609,7 @@ sub process { # Flatten any parentheses $value =~ s/\(/ \(/g; $value =~ s/\)/\) /g; - while ($value =~ s/\[[^\{\}]*\]/1/ || + while ($value =~ s/\[[^\[\]]*\]/1/ || $value !~ /(?:$Ident|-?$Constant)\s* $Compare\s* (?:$Ident|-?$Constant)/x && -- cgit v0.10.2 From c81769fdc84ed7c6eb3cc5cecb194324a5e4c8ad Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 10 Jan 2012 15:10:10 -0800 Subject: checkpatch: fix complex macros handling 
of square brackets Signed-off-by: Andy Whitcroft Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 5e3f419..ba7bcf3 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2815,7 +2815,7 @@ sub process { # Flatten any parentheses and braces while ($dstat =~ s/\([^\(\)]*\)/1/ || $dstat =~ s/\{[^\{\}]*\}/1/ || - $dstat =~ s/\[[^\{\}]*\]/1/) + $dstat =~ s/\[[^\[\]]*\]/1/) { } -- cgit v0.10.2 From addcdcea99514bee64b5bf091ac9fd2fc5da65cf Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 10 Jan 2012 15:10:11 -0800 Subject: checkpatch: ensure cast type is unique in the context parser Ensure the cast type is unique in the context parser, we do not want them to detect as a comma ','. Signed-off-by: Andy Whitcroft Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index ba7bcf3..497416c 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1032,7 +1032,7 @@ sub annotate_values { } elsif ($cur =~ /^(\(\s*$Type\s*)\)/ && $av_pending eq '_') { print "CAST($1)\n" if ($dbg_values > 1); push(@av_paren_type, $type); - $type = 'C'; + $type = 'c'; } elsif ($cur =~ /^($Type)\s*(?:$Ident|,|\)|\(|\s*$)/) { print "DECLARE($1)\n" if ($dbg_values > 1); -- cgit v0.10.2 From 6b48db24e30d371bc54566667b82ca3d64aab80a Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 10 Jan 2012 15:10:13 -0800 Subject: checkpatch: typeof may have more complex arguments typeof may have various more complex forms as its argument, not just an identifier. For now allow us to leak to the first close parenthesis ')'.
Signed-off-by: Andy Whitcroft Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 497416c..eb4b559 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -315,7 +315,7 @@ sub build_types { $NonptrType = qr{ (?:$Modifier\s+|const\s+)* (?: - (?:typeof|__typeof__)\s*\(\s*\**\s*$Ident\s*\)| + (?:typeof|__typeof__)\s*\([^\)]*\)| (?:$typeTypedefs\b)| (?:${all}\b) ) -- cgit v0.10.2 From bfcb2cc798a14230d22b6dd999e2e680623de622 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 10 Jan 2012 15:10:15 -0800 Subject: checkpatch: catch all occurences of type and cast spacing errors per line Fix up type and cast spacing checks such that all occurrences on a line are examined and reported. For example the line below has a valid cast and a bad type, but currently we check the cast first which is good and stop: u16* bar = (u16 *)baz; We will also only report one of the errors in this example: u16* bar = (u16*)bad; Move to iterating across all casts and all types, reporting any failure. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andy Whitcroft Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index eb4b559..e3bfcbe 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2209,8 +2209,9 @@ sub process { # * goes on variable not on type # (char*[ const]) - if ($line =~ m{\($NonptrType(\s*(?:$Modifier\b\s*|\*\s*)+)\)}) { - my ($from, $to) = ($1, $1); + while ($line =~ m{(\($NonptrType(\s*(?:$Modifier\b\s*|\*\s*)+)\))}g) { + #print "AA<$1>\n"; + my ($from, $to) = ($2, $2); # Should start with a space. $to =~ s/^(\S)/ $1/; @@ -2225,8 +2226,10 @@ sub process { ERROR("POINTER_LOCATION", "\"(foo$from)\" should be \"(foo$to)\"\n" .
$herecurr); } - } elsif ($line =~ m{\b$NonptrType(\s*(?:$Modifier\b\s*|\*\s*)+)($Ident)}) { - my ($from, $to, $ident) = ($1, $1, $2); + } + while ($line =~ m{(\b$NonptrType(\s*(?:$Modifier\b\s*|\*\s*)+)($Ident))}g) { + #print "BB<$1>\n"; + my ($from, $to, $ident) = ($2, $2, $3); # Should start with a space. $to =~ s/^(\S)/ $1/; -- cgit v0.10.2 From 5742332dea5560d6c449b007d9539dbdc8ee531b Mon Sep 17 00:00:00 2001 From: Joakim Tjernlund Date: Tue, 10 Jan 2012 15:10:18 -0800 Subject: crc32: optimize inner loop Taking a pointer reference to each row in the crc table matrix, one can reduce the inner loop with a few insn's Signed-off-by: Joakim Tjernlund Cc: Bob Pearson Cc: Frank Zago Cc: Eric Dumazet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/crc32.c b/lib/crc32.c index a6e633a..4b35d2b 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -51,20 +51,21 @@ static inline u32 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) { # ifdef __LITTLE_ENDIAN -# define DO_CRC(x) crc = tab[0][(crc ^ (x)) & 255] ^ (crc >> 8) -# define DO_CRC4 crc = tab[3][(crc) & 255] ^ \ - tab[2][(crc >> 8) & 255] ^ \ - tab[1][(crc >> 16) & 255] ^ \ - tab[0][(crc >> 24) & 255] +# define DO_CRC(x) crc = t0[(crc ^ (x)) & 255] ^ (crc >> 8) +# define DO_CRC4 crc = t3[(crc) & 255] ^ \ + t2[(crc >> 8) & 255] ^ \ + t1[(crc >> 16) & 255] ^ \ + t0[(crc >> 24) & 255] # else -# define DO_CRC(x) crc = tab[0][((crc >> 24) ^ (x)) & 255] ^ (crc << 8) -# define DO_CRC4 crc = tab[0][(crc) & 255] ^ \ - tab[1][(crc >> 8) & 255] ^ \ - tab[2][(crc >> 16) & 255] ^ \ - tab[3][(crc >> 24) & 255] +# define DO_CRC(x) crc = t0[((crc >> 24) ^ (x)) & 255] ^ (crc << 8) +# define DO_CRC4 crc = t0[(crc) & 255] ^ \ + t1[(crc >> 8) & 255] ^ \ + t2[(crc >> 16) & 255] ^ \ + t3[(crc >> 24) & 255] # endif const u32 *b; size_t rem_len; + const u32 *t0=tab[0], *t1=tab[1], *t2=tab[2], *t3=tab[3]; /* Align it */ if (unlikely((long)buf & 3 && len)) { -- cgit v0.10.2 From 
e39f560239984c3098237ad94c9449b1494163f8 Mon Sep 17 00:00:00 2001 From: David Daney Date: Tue, 10 Jan 2012 15:10:21 -0800 Subject: fs: binfmt_elf: create Kconfig variable for PIE randomization Randomization of PIE load address is hard coded in binfmt_elf.c for X86 and ARM. Create a new Kconfig variable (CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE) for this and use it instead. Thus architecture specific policy is pushed out of the generic binfmt_elf.c and into the architecture Kconfig files. X86 and ARM Kconfigs are modified to select the new variable so there is no change in behavior. A follow on patch will select it for MIPS too. Signed-off-by: David Daney Cc: Russell King Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Alexander Viro Acked-by: H. Peter Anvin Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 9d66dfc..98a6459 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -16,6 +16,7 @@ config ARM select HAVE_FTRACE_MCOUNT_RECORD if (!XIP_KERNEL) select HAVE_DYNAMIC_FTRACE if (!XIP_KERNEL) select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL) + select ARCH_BINFMT_ELF_RANDOMIZE_PIE select HAVE_GENERIC_DMA_COHERENT select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZO diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1d2a69d..d6ddc0b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -62,6 +62,7 @@ config X86 select ANON_INODES select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER + select ARCH_BINFMT_ELF_RANDOMIZE_PIE select HAVE_ARCH_JUMP_LABEL select HAVE_TEXT_POKE_SMP select HAVE_GENERIC_HARDIRQS diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 79e2ca7..e95d1b6 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -27,6 +27,9 @@ config COMPAT_BINFMT_ELF bool depends on COMPAT && BINFMT_ELF +config ARCH_BINFMT_ELF_RANDOMIZE_PIE + bool + config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y diff --git a/fs/binfmt_elf.c 
b/fs/binfmt_elf.c index 21ac5ee..bcb884e 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -794,7 +794,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) * default mmap base, as well as whatever program they * might try to exec. This is because the brk will * follow the loader, and is not movable. */ -#if defined(CONFIG_X86) || defined(CONFIG_ARM) +#ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE /* Memory randomization might have been switched off * in runtime via sysctl. * If that is the case, retain the original non-zero -- cgit v0.10.2 From e26d196cc88c5436ca93532cabce87dbd8bf49b4 Mon Sep 17 00:00:00 2001 From: David Daney Date: Tue, 10 Jan 2012 15:10:22 -0800 Subject: MIPS: randomize PIE load address ... by selecting ARCH_BINFMT_ELF_RANDOMIZE_PIE Signed-off-by: David Daney Cc: Russell King Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Alexander Viro Cc: H. Peter Anvin Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index a7636d3..c529cfe 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -16,6 +16,7 @@ config MIPS select HAVE_FUNCTION_GRAPH_TRACER select HAVE_KPROBES select HAVE_KRETPROBES + select ARCH_BINFMT_ELF_RANDOMIZE_PIE select RTC_LIB if !MACH_LOONGSON select GENERIC_ATOMIC64 if !64BIT select HAVE_DMA_ATTRS -- cgit v0.10.2 From b43c1ea4d622b6951377de92edfb219d893e23ef Mon Sep 17 00:00:00 2001 From: Ondrej Zary Date: Tue, 10 Jan 2012 15:10:26 -0800 Subject: drivers/rtc/rtc-cmos.c: fix broken NVRAM bank 2 writing Fix writing to NVRAM bank 2 in rtc-cmos driver. It never worked since its introduction in 2.6.28 because of a typo. 
Signed-off-by: Ondrej Zary Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index 05beb6c..d7782aa 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -164,7 +164,7 @@ static inline unsigned char cmos_read_bank2(unsigned char addr) static inline void cmos_write_bank2(unsigned char val, unsigned char addr) { outb(addr, RTC_PORT(2)); - outb(val, RTC_PORT(2)); + outb(val, RTC_PORT(3)); } #else -- cgit v0.10.2 From 7287be1d0ac8c82999b67c2a33517c6ec9cfdbe7 Mon Sep 17 00:00:00 2001 From: Yauhen Kharuzhy Date: Tue, 10 Jan 2012 15:10:32 -0800 Subject: drivers/rtc/rtc-mxc.c: fix setting time for MX1 SoC There is no way to track year in the i.MX1 RTC: Days Counter register is 9-bit wide only. Attempt to save date after 1970-01-01 plus 512 days causes endless loop in mxc_rtc_set_mmss(). Fix this by resetting year to 1970. [akpm@linux-foundation.org: use conventional comment layout] Signed-off-by: Yauhen Kharuzhy Cc: Daniel Mack Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c index 39e41fb..11b7b614 100644 --- a/drivers/rtc/rtc-mxc.c +++ b/drivers/rtc/rtc-mxc.c @@ -290,6 +290,17 @@ static int mxc_rtc_read_time(struct device *dev, struct rtc_time *tm) */ static int mxc_rtc_set_mmss(struct device *dev, unsigned long time) { + /* + * TTC_DAYR register is 9-bit in MX1 SoC, save time and day of year only + */ + if (cpu_is_mx1()) { + struct rtc_time tm; + + rtc_time_to_tm(time, &tm); + tm.tm_year = 70; + rtc_tm_to_time(&tm, &time); + } + /* Avoid roll-over from reading the different registers */ do { set_alarm_or_time(dev, MXC_RTC_TIME, time); -- cgit v0.10.2 From c92182ee0b5a33c74e4b6c0ded36166e4ef3bc3e Mon Sep 17 00:00:00 2001 From: Yauhen Kharuzhy Date: Tue, 10 Jan 2012 15:10:34 -0800 Subject: drivers/rtc/rtc-mxc.c: make alarm work Fix alarm IRQ handling, make the alarm one-shot. 
Cleanup black magick with a validation of already validated time data. Add ability to wake the system with alarm. [akpm@linux-foundation.org: fix CONFIG_PM=n build] Signed-off-by: Yauhen Kharuzhy Cc: Daniel Mack Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c index 11b7b614..5e1d64e 100644 --- a/drivers/rtc/rtc-mxc.c +++ b/drivers/rtc/rtc-mxc.c @@ -155,7 +155,6 @@ static int rtc_update_alarm(struct device *dev, struct rtc_time *alrm) { struct rtc_time alarm_tm, now_tm; unsigned long now, time; - int ret; struct platform_device *pdev = to_platform_device(dev); struct rtc_plat_data *pdata = platform_get_drvdata(pdev); void __iomem *ioaddr = pdata->ioaddr; @@ -168,21 +167,33 @@ static int rtc_update_alarm(struct device *dev, struct rtc_time *alrm) alarm_tm.tm_hour = alrm->tm_hour; alarm_tm.tm_min = alrm->tm_min; alarm_tm.tm_sec = alrm->tm_sec; - rtc_tm_to_time(&now_tm, &now); rtc_tm_to_time(&alarm_tm, &time); - if (time < now) { - time += 60 * 60 * 24; - rtc_time_to_tm(time, &alarm_tm); - } - - ret = rtc_tm_to_time(&alarm_tm, &time); - /* clear all the interrupt status bits */ writew(readw(ioaddr + RTC_RTCISR), ioaddr + RTC_RTCISR); set_alarm_or_time(dev, MXC_RTC_ALARM, time); - return ret; + return 0; +} + +static void mxc_rtc_irq_enable(struct device *dev, unsigned int bit, + unsigned int enabled) +{ + struct platform_device *pdev = to_platform_device(dev); + struct rtc_plat_data *pdata = platform_get_drvdata(pdev); + void __iomem *ioaddr = pdata->ioaddr; + u32 reg; + + spin_lock_irq(&pdata->rtc->irq_lock); + reg = readw(ioaddr + RTC_RTCIENR); + + if (enabled) + reg |= bit; + else + reg &= ~bit; + + writew(reg, ioaddr + RTC_RTCIENR); + spin_unlock_irq(&pdata->rtc->irq_lock); } /* This function is the RTC interrupt service routine. 
*/ @@ -199,13 +210,12 @@ static irqreturn_t mxc_rtc_interrupt(int irq, void *dev_id) /* clear interrupt sources */ writew(status, ioaddr + RTC_RTCISR); - /* clear alarm interrupt if it has occurred */ - if (status & RTC_ALM_BIT) - status &= ~RTC_ALM_BIT; - /* update irq data & counter */ - if (status & RTC_ALM_BIT) + if (status & RTC_ALM_BIT) { events |= (RTC_AF | RTC_IRQF); + /* RTC alarm should be one-shot */ + mxc_rtc_irq_enable(&pdev->dev, RTC_ALM_BIT, 0); + } if (status & RTC_1HZ_BIT) events |= (RTC_UF | RTC_IRQF); @@ -213,9 +223,6 @@ static irqreturn_t mxc_rtc_interrupt(int irq, void *dev_id) if (status & PIT_ALL_ON) events |= (RTC_PF | RTC_IRQF); - if ((status & RTC_ALM_BIT) && rtc_valid_tm(&pdata->g_rtc_alarm)) - rtc_update_alarm(&pdev->dev, &pdata->g_rtc_alarm); - rtc_update_irq(pdata->rtc, 1, events); spin_unlock_irq(&pdata->rtc->irq_lock); @@ -242,26 +249,6 @@ static void mxc_rtc_release(struct device *dev) spin_unlock_irq(&pdata->rtc->irq_lock); } -static void mxc_rtc_irq_enable(struct device *dev, unsigned int bit, - unsigned int enabled) -{ - struct platform_device *pdev = to_platform_device(dev); - struct rtc_plat_data *pdata = platform_get_drvdata(pdev); - void __iomem *ioaddr = pdata->ioaddr; - u32 reg; - - spin_lock_irq(&pdata->rtc->irq_lock); - reg = readw(ioaddr + RTC_RTCIENR); - - if (enabled) - reg |= bit; - else - reg &= ~bit; - - writew(reg, ioaddr + RTC_RTCIENR); - spin_unlock_irq(&pdata->rtc->irq_lock); -} - static int mxc_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) { mxc_rtc_irq_enable(dev, RTC_ALM_BIT, enabled); @@ -335,21 +322,7 @@ static int mxc_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) struct rtc_plat_data *pdata = platform_get_drvdata(pdev); int ret; - if (rtc_valid_tm(&alrm->time)) { - if (alrm->time.tm_sec > 59 || - alrm->time.tm_hour > 23 || - alrm->time.tm_min > 59) - return -EINVAL; - - ret = rtc_update_alarm(dev, &alrm->time); - } else { - ret = rtc_valid_tm(&alrm->time); - if (ret) - return 
ret; - - ret = rtc_update_alarm(dev, &alrm->time); - } - + ret = rtc_update_alarm(dev, &alrm->time); if (ret) return ret; @@ -435,6 +408,9 @@ static int __init mxc_rtc_probe(struct platform_device *pdev) pdata->irq = -1; } + if (pdata->irq >=0) + device_init_wakeup(&pdev->dev, 1); + rtc = rtc_device_register(pdev->name, &pdev->dev, &mxc_rtc_ops, THIS_MODULE); if (IS_ERR(rtc)) { @@ -470,9 +446,39 @@ static int __exit mxc_rtc_remove(struct platform_device *pdev) return 0; } +#ifdef CONFIG_PM +static int mxc_rtc_suspend(struct device *dev) +{ + struct rtc_plat_data *pdata = dev_get_drvdata(dev); + + if (device_may_wakeup(dev)) + enable_irq_wake(pdata->irq); + + return 0; +} + +static int mxc_rtc_resume(struct device *dev) +{ + struct rtc_plat_data *pdata = dev_get_drvdata(dev); + + if (device_may_wakeup(dev)) + disable_irq_wake(pdata->irq); + + return 0; +} + +static struct dev_pm_ops mxc_rtc_pm_ops = { + .suspend = mxc_rtc_suspend, + .resume = mxc_rtc_resume, +}; +#endif + static struct platform_driver mxc_rtc_driver = { .driver = { .name = "mxc_rtc", +#ifdef CONFIG_PM + .pm = &mxc_rtc_pm_ops, +#endif .owner = THIS_MODULE, }, .remove = __exit_p(mxc_rtc_remove), -- cgit v0.10.2 From 10d065e65b0be33e868f9c6da67026b5111480d8 Mon Sep 17 00:00:00 2001 From: Robert Marklund Date: Tue, 10 Jan 2012 15:10:35 -0800 Subject: rtc/ab8500: don't disable IRQ:s when suspending We want this driver to be able to wake up the system. 
Signed-off-by: Robert Marklund Signed-off-by: Linus Walleij Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/rtc/rtc-ab8500.c b/drivers/rtc/rtc-ab8500.c index e346705..d6bfc4f 100644 --- a/drivers/rtc/rtc-ab8500.c +++ b/drivers/rtc/rtc-ab8500.c @@ -316,8 +316,8 @@ static int __devinit ab8500_rtc_probe(struct platform_device *pdev) return err; } - err = request_threaded_irq(irq, NULL, rtc_alarm_handler, 0, - "ab8500-rtc", rtc); + err = request_threaded_irq(irq, NULL, rtc_alarm_handler, + IRQF_NO_SUSPEND, "ab8500-rtc", rtc); if (err < 0) { rtc_device_unregister(rtc); return err; -- cgit v0.10.2 From b62581e6241c33b9fef45117f86830058738371f Mon Sep 17 00:00:00 2001 From: Andrew Lynn Date: Tue, 10 Jan 2012 15:10:38 -0800 Subject: rtc/ab8500: set can_wake flag Set can_wake flag so wakealarm property is visible in sysfs. Signed-off-by: Andrew Lynn Reviewed-by: Jonas ABERG Signed-off-by: Linus Walleij Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/rtc/rtc-ab8500.c b/drivers/rtc/rtc-ab8500.c index d6bfc4f..82a3480 100644 --- a/drivers/rtc/rtc-ab8500.c +++ b/drivers/rtc/rtc-ab8500.c @@ -308,6 +308,8 @@ static int __devinit ab8500_rtc_probe(struct platform_device *pdev) return -ENODEV; } + device_init_wakeup(&pdev->dev, true); + rtc = rtc_device_register("ab8500-rtc", &pdev->dev, &ab8500_rtc_ops, THIS_MODULE); if (IS_ERR(rtc)) { -- cgit v0.10.2 From 012e52e15e7ebbc7b08165e8f4b10f71a3e6810b Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 10 Jan 2012 15:10:41 -0800 Subject: drivers/rtc/rtc-ab8500.c: change msleep() to usleep_range() The resolution of msleep is related to HZ, so with HZ set to 100 any msleep of less than 10ms will become ~10ms. This is not what we want. Use the hrtimer-based usleep_range() and allow for some slack in the non-critical path so we have more control of what is happening here. 
Signed-off-by: Linus Walleij Cc: Jonas Aaberg Cc: Alessandro Zummo Cc: Jean-Christophe PLAGNIOL-VILLARD Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/rtc/rtc-ab8500.c b/drivers/rtc/rtc-ab8500.c index 82a3480..919b2e5 100644 --- a/drivers/rtc/rtc-ab8500.c +++ b/drivers/rtc/rtc-ab8500.c @@ -90,7 +90,7 @@ static int ab8500_rtc_read_time(struct device *dev, struct rtc_time *tm) /* Early AB8500 chips will not clear the rtc read request bit */ if (abx500_get_chip_id(dev) == 0) { - msleep(1); + usleep_range(1000, 1000); } else { /* Wait for some cycles after enabling the rtc read in ab8500 */ while (time_before(jiffies, timeout)) { @@ -102,7 +102,7 @@ static int ab8500_rtc_read_time(struct device *dev, struct rtc_time *tm) if (!(value & RTC_READ_REQUEST)) break; - msleep(1); + usleep_range(1000, 5000); } } @@ -295,7 +295,7 @@ static int __devinit ab8500_rtc_probe(struct platform_device *pdev) return err; /* Wait for reset by the PorRtc */ - msleep(1); + usleep_range(1000, 5000); err = abx500_get_register_interruptible(&pdev->dev, AB8500_RTC, AB8500_RTC_STAT_REG, &rtc_ctrl); -- cgit v0.10.2 From dda367ac064d7473d397b1965019fb3be7cfb6b0 Mon Sep 17 00:00:00 2001 From: Mark Godfrey Date: Tue, 10 Jan 2012 15:10:42 -0800 Subject: rtc/ab8500: add calibration attribute to AB8500 RTC The rtc_calibration attribute allows user-space to get and set the AB8500's RtcCalibration register. The AB8500 will then use the value in this register to compensate for RTC drift every 60 seconds. 
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Mark Godfrey Signed-off-by: Linus Walleij Acked-by: Jean-Christophe PLAGNIOL-VILLARD Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/ABI/testing/sysfs-class-rtc-rtc0-device-rtc_calibration b/Documentation/ABI/testing/sysfs-class-rtc-rtc0-device-rtc_calibration new file mode 100644 index 0000000..4cf1e72 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-rtc-rtc0-device-rtc_calibration @@ -0,0 +1,12 @@ +What: Attribute for calibrating ST-Ericsson AB8500 Real Time Clock +Date: Oct 2011 +KernelVersion: 3.0 +Contact: Mark Godfrey +Description: The rtc_calibration attribute allows the userspace to + calibrate the AB8500's 32KHz Real Time Clock. + Every 60 seconds the AB8500 will correct the RTC's value + by adding to it the value of this attribute. + The range of the attribute is -127 to +127 in units of + 30.5 micro-seconds (half-parts-per-million of the 32KHz clock) +Users: The /vendor/st-ericsson/base_utilities/core/rtc_calibration + daemon uses this interface. diff --git a/drivers/rtc/rtc-ab8500.c b/drivers/rtc/rtc-ab8500.c index 919b2e5..df7bfc3 100644 --- a/drivers/rtc/rtc-ab8500.c +++ b/drivers/rtc/rtc-ab8500.c @@ -258,6 +258,109 @@ static int ab8500_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alarm) return ab8500_rtc_irq_enable(dev, alarm->enabled); } + +static int ab8500_rtc_set_calibration(struct device *dev, int calibration) +{ + int retval; + u8 rtccal = 0; + + /* + * Check that the calibration value (which is in units of 0.5 + * parts-per-million) is in the AB8500's range for RtcCalibration + * register. -128 (0x80) is not permitted because the AB8500 uses + * a sign-bit rather than two's complement, so 0x80 is just another + * representation of zero.
+ */ + if ((calibration < -127) || (calibration > 127)) { + dev_err(dev, "RtcCalibration value outside permitted range\n"); + return -EINVAL; + } + + /* + * The AB8500 uses sign (in bit7) and magnitude (in bits0-6) + * so need to convert to this sort of representation before writing + * into RtcCalibration register... + */ + if (calibration >= 0) + rtccal = 0x7F & calibration; + else + rtccal = ~(calibration - 1) | 0x80; + + retval = abx500_set_register_interruptible(dev, AB8500_RTC, + AB8500_RTC_CALIB_REG, rtccal); + + return retval; +} + +static int ab8500_rtc_get_calibration(struct device *dev, int *calibration) +{ + int retval; + u8 rtccal = 0; + + retval = abx500_get_register_interruptible(dev, AB8500_RTC, + AB8500_RTC_CALIB_REG, &rtccal); + if (retval >= 0) { + /* + * The AB8500 uses sign (in bit7) and magnitude (in bits0-6) + * so need to convert value from RtcCalibration register into + * a two's complement signed value... + */ + if (rtccal & 0x80) + *calibration = 0 - (rtccal & 0x7F); + else + *calibration = 0x7F & rtccal; + } + + return retval; +} + +static ssize_t ab8500_sysfs_store_rtc_calibration(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int retval; + int calibration = 0; + + if (sscanf(buf, " %i ", &calibration) != 1) { + dev_err(dev, "Failed to store RTC calibration attribute\n"); + return -EINVAL; + } + + retval = ab8500_rtc_set_calibration(dev, calibration); + + return retval ?
retval : count; +} + +static ssize_t ab8500_sysfs_show_rtc_calibration(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int retval = 0; + int calibration = 0; + + retval = ab8500_rtc_get_calibration(dev, &calibration); + if (retval < 0) { + dev_err(dev, "Failed to read RTC calibration attribute\n"); + sprintf(buf, "0\n"); + return retval; + } + + return sprintf(buf, "%d\n", calibration); +} + +static DEVICE_ATTR(rtc_calibration, S_IRUGO | S_IWUSR, + ab8500_sysfs_show_rtc_calibration, + ab8500_sysfs_store_rtc_calibration); + +static int ab8500_sysfs_rtc_register(struct device *dev) +{ + return device_create_file(dev, &dev_attr_rtc_calibration); +} + +static void ab8500_sysfs_rtc_unregister(struct device *dev) +{ + device_remove_file(dev, &dev_attr_rtc_calibration); +} + static irqreturn_t rtc_alarm_handler(int irq, void *data) { struct rtc_device *rtc = data; @@ -327,6 +430,13 @@ static int __devinit ab8500_rtc_probe(struct platform_device *pdev) platform_set_drvdata(pdev, rtc); + + err = ab8500_sysfs_rtc_register(&pdev->dev); + if (err) { + dev_err(&pdev->dev, "sysfs RTC failed to register\n"); + return err; + } + return 0; } @@ -335,6 +445,8 @@ static int __devexit ab8500_rtc_remove(struct platform_device *pdev) struct rtc_device *rtc = platform_get_drvdata(pdev); int irq = platform_get_irq_byname(pdev, "ALARM"); + ab8500_sysfs_rtc_unregister(&pdev->dev); + free_irq(irq, rtc); rtc_device_unregister(rtc); platform_set_drvdata(pdev, NULL); -- cgit v0.10.2 From 2d65943e55bdd538640d0908bc9f3ead138b0431 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 10 Jan 2012 15:10:43 -0800 Subject: drivers/rtc/rtc-wm831x.c: remove unused period IRQ handler Due to changes in the RTC core the period interrupt is now unused so delete the code managing it. 
Signed-off-by: Mark Brown Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/rtc/rtc-wm831x.c b/drivers/rtc/rtc-wm831x.c index bdc909b..dabbd45 100644 --- a/drivers/rtc/rtc-wm831x.c +++ b/drivers/rtc/rtc-wm831x.c @@ -324,15 +324,6 @@ static irqreturn_t wm831x_alm_irq(int irq, void *data) return IRQ_HANDLED; } -static irqreturn_t wm831x_per_irq(int irq, void *data) -{ - struct wm831x_rtc *wm831x_rtc = data; - - rtc_update_irq(wm831x_rtc->rtc, 1, RTC_IRQF | RTC_UF); - - return IRQ_HANDLED; -} - static const struct rtc_class_ops wm831x_rtc_ops = { .read_time = wm831x_rtc_readtime, .set_mmss = wm831x_rtc_set_mmss, @@ -405,7 +396,6 @@ static int wm831x_rtc_probe(struct platform_device *pdev) { struct wm831x *wm831x = dev_get_drvdata(pdev->dev.parent); struct wm831x_rtc *wm831x_rtc; - int per_irq = platform_get_irq_byname(pdev, "PER"); int alm_irq = platform_get_irq_byname(pdev, "ALM"); int ret = 0; @@ -433,14 +423,6 @@ static int wm831x_rtc_probe(struct platform_device *pdev) goto err; } - ret = request_threaded_irq(per_irq, NULL, wm831x_per_irq, - IRQF_TRIGGER_RISING, "RTC period", - wm831x_rtc); - if (ret != 0) { - dev_err(&pdev->dev, "Failed to request periodic IRQ %d: %d\n", - per_irq, ret); - } - ret = request_threaded_irq(alm_irq, NULL, wm831x_alm_irq, IRQF_TRIGGER_RISING, "RTC alarm", wm831x_rtc); @@ -459,11 +441,9 @@ err: static int __devexit wm831x_rtc_remove(struct platform_device *pdev) { struct wm831x_rtc *wm831x_rtc = platform_get_drvdata(pdev); - int per_irq = platform_get_irq_byname(pdev, "PER"); int alm_irq = platform_get_irq_byname(pdev, "ALM"); free_irq(alm_irq, wm831x_rtc); - free_irq(per_irq, wm831x_rtc); rtc_device_unregister(wm831x_rtc->rtc); kfree(wm831x_rtc); -- cgit v0.10.2 From 5f85d20d04cdc4c6ed15022a5ed76907ad88d4ae Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 10 Jan 2012 15:10:44 -0800 Subject: drivers/rtc/rtc-wm831x.c: convert to devm_kzalloc() Marginally less code and eliminate the 
possibility of memory leaks. Signed-off-by: Mark Brown Acked-by: Jean-Christophe PLAGNIOL-VILLARD Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/rtc/rtc-wm831x.c b/drivers/rtc/rtc-wm831x.c index dabbd45..657c6f6 100644 --- a/drivers/rtc/rtc-wm831x.c +++ b/drivers/rtc/rtc-wm831x.c @@ -399,7 +399,7 @@ static int wm831x_rtc_probe(struct platform_device *pdev) int alm_irq = platform_get_irq_byname(pdev, "ALM"); int ret = 0; - wm831x_rtc = kzalloc(sizeof(*wm831x_rtc), GFP_KERNEL); + wm831x_rtc = devm_kzalloc(&pdev->dev, sizeof(*wm831x_rtc), GFP_KERNEL); if (wm831x_rtc == NULL) return -ENOMEM; @@ -434,7 +434,6 @@ static int wm831x_rtc_probe(struct platform_device *pdev) return 0; err: - kfree(wm831x_rtc); return ret; } @@ -445,7 +444,6 @@ static int __devexit wm831x_rtc_remove(struct platform_device *pdev) free_irq(alm_irq, wm831x_rtc); rtc_device_unregister(wm831x_rtc->rtc); - kfree(wm831x_rtc); return 0; } -- cgit v0.10.2 From 0c4eae66591a292fee70051ea363a8d27aa54102 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Tue, 10 Jan 2012 15:10:48 -0800 Subject: rtc: convert drivers/rtc/* to use module_platform_driver() This patch converts the drivers in drivers/rtc/* to use the module_platform_driver() macro which makes the code smaller and a bit simpler. 
Signed-off-by: Axel Lin Acked-by: Mark Brown Acked-by: Mike Frysinger Acked-by: Guan Xuetao Acked-by: Linus Walleij Acked-by: Haojian Zhuang Cc: Alessandro Zummo Cc: Srinidhi Kasagar Cc: Lars-Peter Clausen Cc: Ben Dooks Cc: John Stultz Acked-by: Jean-Christophe PLAGNIOL-VILLARD Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/rtc/rtc-88pm860x.c b/drivers/rtc/rtc-88pm860x.c index 64b847b..f04761e 100644 --- a/drivers/rtc/rtc-88pm860x.c +++ b/drivers/rtc/rtc-88pm860x.c @@ -410,17 +410,7 @@ static struct platform_driver pm860x_rtc_driver = { .remove = __devexit_p(pm860x_rtc_remove), }; -static int __init pm860x_rtc_init(void) -{ - return platform_driver_register(&pm860x_rtc_driver); -} -module_init(pm860x_rtc_init); - -static void __exit pm860x_rtc_exit(void) -{ - platform_driver_unregister(&pm860x_rtc_driver); -} -module_exit(pm860x_rtc_exit); +module_platform_driver(pm860x_rtc_driver); MODULE_DESCRIPTION("Marvell 88PM860x RTC driver"); MODULE_AUTHOR("Haojian Zhuang "); diff --git a/drivers/rtc/rtc-ab8500.c b/drivers/rtc/rtc-ab8500.c index df7bfc3..a0a9810 100644 --- a/drivers/rtc/rtc-ab8500.c +++ b/drivers/rtc/rtc-ab8500.c @@ -463,18 +463,8 @@ static struct platform_driver ab8500_rtc_driver = { .remove = __devexit_p(ab8500_rtc_remove), }; -static int __init ab8500_rtc_init(void) -{ - return platform_driver_register(&ab8500_rtc_driver); -} - -static void __exit ab8500_rtc_exit(void) -{ - platform_driver_unregister(&ab8500_rtc_driver); -} +module_platform_driver(ab8500_rtc_driver); -module_init(ab8500_rtc_init); -module_exit(ab8500_rtc_exit); MODULE_AUTHOR("Virupax Sadashivpetimath "); MODULE_DESCRIPTION("AB8500 RTC Driver"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/rtc/rtc-bfin.c b/drivers/rtc/rtc-bfin.c index 90d8662..abfc1a0 100644 --- a/drivers/rtc/rtc-bfin.c +++ b/drivers/rtc/rtc-bfin.c @@ -456,18 +456,7 @@ static struct platform_driver bfin_rtc_driver = { .resume = bfin_rtc_resume, }; -static int __init bfin_rtc_init(void) -{ 
- return platform_driver_register(&bfin_rtc_driver); -} - -static void __exit bfin_rtc_exit(void) -{ - platform_driver_unregister(&bfin_rtc_driver); -} - -module_init(bfin_rtc_init); -module_exit(bfin_rtc_exit); +module_platform_driver(bfin_rtc_driver); MODULE_DESCRIPTION("Blackfin On-Chip Real Time Clock Driver"); MODULE_AUTHOR("Mike Frysinger "); diff --git a/drivers/rtc/rtc-bq4802.c b/drivers/rtc/rtc-bq4802.c index 128270c..bf612ef 100644 --- a/drivers/rtc/rtc-bq4802.c +++ b/drivers/rtc/rtc-bq4802.c @@ -218,15 +218,4 @@ static struct platform_driver bq4802_driver = { .remove = __devexit_p(bq4802_remove), }; -static int __init bq4802_init(void) -{ - return platform_driver_register(&bq4802_driver); -} - -static void __exit bq4802_exit(void) -{ - platform_driver_unregister(&bq4802_driver); -} - -module_init(bq4802_init); -module_exit(bq4802_exit); +module_platform_driver(bq4802_driver); diff --git a/drivers/rtc/rtc-dm355evm.c b/drivers/rtc/rtc-dm355evm.c index 2322c43..d4457af 100644 --- a/drivers/rtc/rtc-dm355evm.c +++ b/drivers/rtc/rtc-dm355evm.c @@ -161,16 +161,6 @@ static struct platform_driver rtc_dm355evm_driver = { }, }; -static int __init dm355evm_rtc_init(void) -{ - return platform_driver_register(&rtc_dm355evm_driver); -} -module_init(dm355evm_rtc_init); - -static void __exit dm355evm_rtc_exit(void) -{ - platform_driver_unregister(&rtc_dm355evm_driver); -} -module_exit(dm355evm_rtc_exit); +module_platform_driver(rtc_dm355evm_driver); MODULE_LICENSE("GPL"); diff --git a/drivers/rtc/rtc-ds1286.c b/drivers/rtc/rtc-ds1286.c index 68e6caf..990c3ff 100644 --- a/drivers/rtc/rtc-ds1286.c +++ b/drivers/rtc/rtc-ds1286.c @@ -396,21 +396,10 @@ static struct platform_driver ds1286_platform_driver = { .remove = __devexit_p(ds1286_remove), }; -static int __init ds1286_init(void) -{ - return platform_driver_register(&ds1286_platform_driver); -} - -static void __exit ds1286_exit(void) -{ - platform_driver_unregister(&ds1286_platform_driver); -} 
+module_platform_driver(ds1286_platform_driver); MODULE_AUTHOR("Thomas Bogendoerfer "); MODULE_DESCRIPTION("DS1286 RTC driver"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION); MODULE_ALIAS("platform:rtc-ds1286"); - -module_init(ds1286_init); -module_exit(ds1286_exit); diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index 586c244..761f36b 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -580,20 +580,7 @@ static struct platform_driver ds1511_rtc_driver = { }, }; - static int __init -ds1511_rtc_init(void) -{ - return platform_driver_register(&ds1511_rtc_driver); -} - - static void __exit -ds1511_rtc_exit(void) -{ - platform_driver_unregister(&ds1511_rtc_driver); -} - -module_init(ds1511_rtc_init); -module_exit(ds1511_rtc_exit); +module_platform_driver(ds1511_rtc_driver); MODULE_AUTHOR("Andrew Sharp "); MODULE_DESCRIPTION("Dallas DS1511 RTC driver"); diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c index 1350029..6f0a1b5 100644 --- a/drivers/rtc/rtc-ds1553.c +++ b/drivers/rtc/rtc-ds1553.c @@ -361,18 +361,7 @@ static struct platform_driver ds1553_rtc_driver = { }, }; -static __init int ds1553_init(void) -{ - return platform_driver_register(&ds1553_rtc_driver); -} - -static __exit void ds1553_exit(void) -{ - platform_driver_unregister(&ds1553_rtc_driver); -} - -module_init(ds1553_init); -module_exit(ds1553_exit); +module_platform_driver(ds1553_rtc_driver); MODULE_AUTHOR("Atsushi Nemoto "); MODULE_DESCRIPTION("Dallas DS1553 RTC driver"); diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c index e3e0f92..7611266 100644 --- a/drivers/rtc/rtc-ds1742.c +++ b/drivers/rtc/rtc-ds1742.c @@ -240,18 +240,7 @@ static struct platform_driver ds1742_rtc_driver = { }, }; -static __init int ds1742_init(void) -{ - return platform_driver_register(&ds1742_rtc_driver); -} - -static __exit void ds1742_exit(void) -{ - platform_driver_unregister(&ds1742_rtc_driver); -} - -module_init(ds1742_init); 
-module_exit(ds1742_exit); +module_platform_driver(ds1742_rtc_driver); MODULE_AUTHOR("Atsushi Nemoto "); MODULE_DESCRIPTION("Dallas DS1742 RTC driver"); diff --git a/drivers/rtc/rtc-jz4740.c b/drivers/rtc/rtc-jz4740.c index b647363..1481e36 100644 --- a/drivers/rtc/rtc-jz4740.c +++ b/drivers/rtc/rtc-jz4740.c @@ -355,17 +355,7 @@ struct platform_driver jz4740_rtc_driver = { }, }; -static int __init jz4740_rtc_init(void) -{ - return platform_driver_register(&jz4740_rtc_driver); -} -module_init(jz4740_rtc_init); - -static void __exit jz4740_rtc_exit(void) -{ - platform_driver_unregister(&jz4740_rtc_driver); -} -module_exit(jz4740_rtc_exit); +module_platform_driver(jz4740_rtc_driver); MODULE_AUTHOR("Lars-Peter Clausen "); MODULE_LICENSE("GPL"); diff --git a/drivers/rtc/rtc-lpc32xx.c b/drivers/rtc/rtc-lpc32xx.c index ae16250..ecc1713 100644 --- a/drivers/rtc/rtc-lpc32xx.c +++ b/drivers/rtc/rtc-lpc32xx.c @@ -396,17 +396,7 @@ static struct platform_driver lpc32xx_rtc_driver = { }, }; -static int __init lpc32xx_rtc_init(void) -{ - return platform_driver_register(&lpc32xx_rtc_driver); -} -module_init(lpc32xx_rtc_init); - -static void __exit lpc32xx_rtc_exit(void) -{ - platform_driver_unregister(&lpc32xx_rtc_driver); -} -module_exit(lpc32xx_rtc_exit); +module_platform_driver(lpc32xx_rtc_driver); MODULE_AUTHOR("Kevin Wells "); MODULE_DESCRIPTION("M48T35 RTC driver"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION); MODULE_ALIAS("platform:rtc-m48t35"); - -module_init(m48t35_init); -module_exit(m48t35_exit); diff --git a/drivers/rtc/rtc-m48t59.c b/drivers/rtc/rtc-m48t59.c index 2836538..30ebfec 100644 --- a/drivers/rtc/rtc-m48t59.c +++ b/drivers/rtc/rtc-m48t59.c @@ -530,18 +530,7 @@ static struct platform_driver m48t59_rtc_driver = { .remove = __devexit_p(m48t59_rtc_remove), }; -static int __init m48t59_rtc_init(void) -{ - return platform_driver_register(&m48t59_rtc_driver); -} - -static void __exit m48t59_rtc_exit(void) -{ - platform_driver_unregister(&m48t59_rtc_driver); 
-} - -module_init(m48t59_rtc_init); -module_exit(m48t59_rtc_exit); +module_platform_driver(m48t59_rtc_driver); MODULE_AUTHOR("Mark Zhan "); MODULE_DESCRIPTION("M48T59/M48T02/M48T08 RTC driver"); diff --git a/drivers/rtc/rtc-m48t86.c b/drivers/rtc/rtc-m48t86.c index f981287..863fb33 100644 --- a/drivers/rtc/rtc-m48t86.c +++ b/drivers/rtc/rtc-m48t86.c @@ -185,21 +185,10 @@ static struct platform_driver m48t86_rtc_platform_driver = { .remove = __devexit_p(m48t86_rtc_remove), }; -static int __init m48t86_rtc_init(void) -{ - return platform_driver_register(&m48t86_rtc_platform_driver); -} - -static void __exit m48t86_rtc_exit(void) -{ - platform_driver_unregister(&m48t86_rtc_platform_driver); -} +module_platform_driver(m48t86_rtc_platform_driver); MODULE_AUTHOR("Alessandro Zummo "); MODULE_DESCRIPTION("M48T86 RTC driver"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION); MODULE_ALIAS("platform:rtc-m48t86"); - -module_init(m48t86_rtc_init); -module_exit(m48t86_rtc_exit); diff --git a/drivers/rtc/rtc-max8925.c b/drivers/rtc/rtc-max8925.c index 3bc046f..4a55293 100644 --- a/drivers/rtc/rtc-max8925.c +++ b/drivers/rtc/rtc-max8925.c @@ -299,17 +299,7 @@ static struct platform_driver max8925_rtc_driver = { .remove = __devexit_p(max8925_rtc_remove), }; -static int __init max8925_rtc_init(void) -{ - return platform_driver_register(&max8925_rtc_driver); -} -module_init(max8925_rtc_init); - -static void __exit max8925_rtc_exit(void) -{ - platform_driver_unregister(&max8925_rtc_driver); -} -module_exit(max8925_rtc_exit); +module_platform_driver(max8925_rtc_driver); MODULE_DESCRIPTION("Maxim MAX8925 RTC driver"); MODULE_AUTHOR("Haojian Zhuang "); diff --git a/drivers/rtc/rtc-max8998.c b/drivers/rtc/rtc-max8998.c index 2e48aa6..7196f43 100644 --- a/drivers/rtc/rtc-max8998.c +++ b/drivers/rtc/rtc-max8998.c @@ -327,17 +327,7 @@ static struct platform_driver max8998_rtc_driver = { .id_table = max8998_rtc_id, }; -static int __init max8998_rtc_init(void) -{ - return 
platform_driver_register(&max8998_rtc_driver); -} -module_init(max8998_rtc_init); - -static void __exit max8998_rtc_exit(void) -{ - platform_driver_unregister(&max8998_rtc_driver); -} -module_exit(max8998_rtc_exit); +module_platform_driver(max8998_rtc_driver); MODULE_AUTHOR("Minkyu Kang "); MODULE_AUTHOR("Joonyoung Shim "); diff --git a/drivers/rtc/rtc-mpc5121.c b/drivers/rtc/rtc-mpc5121.c index da60915..9d3cacc 100644 --- a/drivers/rtc/rtc-mpc5121.c +++ b/drivers/rtc/rtc-mpc5121.c @@ -418,17 +418,7 @@ static struct platform_driver mpc5121_rtc_driver = { .remove = __devexit_p(mpc5121_rtc_remove), }; -static int __init mpc5121_rtc_init(void) -{ - return platform_driver_register(&mpc5121_rtc_driver); -} -module_init(mpc5121_rtc_init); - -static void __exit mpc5121_rtc_exit(void) -{ - platform_driver_unregister(&mpc5121_rtc_driver); -} -module_exit(mpc5121_rtc_exit); +module_platform_driver(mpc5121_rtc_driver); MODULE_LICENSE("GPL"); MODULE_AUTHOR("John Rigby "); diff --git a/drivers/rtc/rtc-mrst.c b/drivers/rtc/rtc-mrst.c index bb21f44..6cd6c72 100644 --- a/drivers/rtc/rtc-mrst.c +++ b/drivers/rtc/rtc-mrst.c @@ -537,18 +537,7 @@ static struct platform_driver vrtc_mrst_platform_driver = { } }; -static int __init vrtc_mrst_init(void) -{ - return platform_driver_register(&vrtc_mrst_platform_driver); -} - -static void __exit vrtc_mrst_exit(void) -{ - platform_driver_unregister(&vrtc_mrst_platform_driver); -} - -module_init(vrtc_mrst_init); -module_exit(vrtc_mrst_exit); +module_platform_driver(vrtc_mrst_platform_driver); MODULE_AUTHOR("Jacob Pan; Feng Tang"); MODULE_DESCRIPTION("Driver for Moorestown virtual RTC"); diff --git a/drivers/rtc/rtc-pcf50633.c b/drivers/rtc/rtc-pcf50633.c index 0c42389..a20202f 100644 --- a/drivers/rtc/rtc-pcf50633.c +++ b/drivers/rtc/rtc-pcf50633.c @@ -294,17 +294,7 @@ static struct platform_driver pcf50633_rtc_driver = { .remove = __devexit_p(pcf50633_rtc_remove), }; -static int __init pcf50633_rtc_init(void) -{ - return 
platform_driver_register(&pcf50633_rtc_driver); -} -module_init(pcf50633_rtc_init); - -static void __exit pcf50633_rtc_exit(void) -{ - platform_driver_unregister(&pcf50633_rtc_driver); -} -module_exit(pcf50633_rtc_exit); +module_platform_driver(pcf50633_rtc_driver); MODULE_DESCRIPTION("PCF50633 RTC driver"); MODULE_AUTHOR("Balaji Rao "); diff --git a/drivers/rtc/rtc-pm8xxx.c b/drivers/rtc/rtc-pm8xxx.c index d420e9d..9f1d6bc 100644 --- a/drivers/rtc/rtc-pm8xxx.c +++ b/drivers/rtc/rtc-pm8xxx.c @@ -532,17 +532,7 @@ static struct platform_driver pm8xxx_rtc_driver = { }, }; -static int __init pm8xxx_rtc_init(void) -{ - return platform_driver_register(&pm8xxx_rtc_driver); -} -module_init(pm8xxx_rtc_init); - -static void __exit pm8xxx_rtc_exit(void) -{ - platform_driver_unregister(&pm8xxx_rtc_driver); -} -module_exit(pm8xxx_rtc_exit); +module_platform_driver(pm8xxx_rtc_driver); MODULE_ALIAS("platform:rtc-pm8xxx"); MODULE_DESCRIPTION("PMIC8xxx RTC driver"); diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c index 175067a..aef40bd 100644 --- a/drivers/rtc/rtc-s3c.c +++ b/drivers/rtc/rtc-s3c.c @@ -673,21 +673,7 @@ static struct platform_driver s3c_rtc_driver = { }, }; -static char __initdata banner[] = "S3C24XX RTC, (c) 2004,2006 Simtec Electronics\n"; - -static int __init s3c_rtc_init(void) -{ - printk(banner); - return platform_driver_register(&s3c_rtc_driver); -} - -static void __exit s3c_rtc_exit(void) -{ - platform_driver_unregister(&s3c_rtc_driver); -} - -module_init(s3c_rtc_init); -module_exit(s3c_rtc_exit); +module_platform_driver(s3c_rtc_driver); MODULE_DESCRIPTION("Samsung S3C RTC Driver"); MODULE_AUTHOR("Ben Dooks "); diff --git a/drivers/rtc/rtc-sa1100.c b/drivers/rtc/rtc-sa1100.c index fc1ffe9..4595d3e 100644 --- a/drivers/rtc/rtc-sa1100.c +++ b/drivers/rtc/rtc-sa1100.c @@ -435,18 +435,7 @@ static struct platform_driver sa1100_rtc_driver = { }, }; -static int __init sa1100_rtc_init(void) -{ - return platform_driver_register(&sa1100_rtc_driver); -} - 
-static void __exit sa1100_rtc_exit(void) -{ - platform_driver_unregister(&sa1100_rtc_driver); -} - -module_init(sa1100_rtc_init); -module_exit(sa1100_rtc_exit); +module_platform_driver(sa1100_rtc_driver); MODULE_AUTHOR("Richard Purdie "); MODULE_DESCRIPTION("SA11x0/PXA2xx Realtime Clock Driver (RTC)"); diff --git a/drivers/rtc/rtc-spear.c b/drivers/rtc/rtc-spear.c index 893bac2..19a28a6 100644 --- a/drivers/rtc/rtc-spear.c +++ b/drivers/rtc/rtc-spear.c @@ -516,17 +516,7 @@ static struct platform_driver spear_rtc_driver = { }, }; -static int __init rtc_init(void) -{ - return platform_driver_register(&spear_rtc_driver); -} -module_init(rtc_init); - -static void __exit rtc_exit(void) -{ - platform_driver_unregister(&spear_rtc_driver); -} -module_exit(rtc_exit); +module_platform_driver(spear_rtc_driver); MODULE_ALIAS("platform:rtc-spear"); MODULE_AUTHOR("Rajeev Kumar "); diff --git a/drivers/rtc/rtc-stk17ta8.c b/drivers/rtc/rtc-stk17ta8.c index ed3e9b5..7621116 100644 --- a/drivers/rtc/rtc-stk17ta8.c +++ b/drivers/rtc/rtc-stk17ta8.c @@ -370,18 +370,7 @@ static struct platform_driver stk17ta8_rtc_driver = { }, }; -static __init int stk17ta8_init(void) -{ - return platform_driver_register(&stk17ta8_rtc_driver); -} - -static __exit void stk17ta8_exit(void) -{ - platform_driver_unregister(&stk17ta8_rtc_driver); -} - -module_init(stk17ta8_init); -module_exit(stk17ta8_exit); +module_platform_driver(stk17ta8_rtc_driver); MODULE_AUTHOR("Thomas Hommel "); MODULE_DESCRIPTION("Simtek STK17TA8 RTC driver"); diff --git a/drivers/rtc/rtc-stmp3xxx.c b/drivers/rtc/rtc-stmp3xxx.c index 7315068..1028786 100644 --- a/drivers/rtc/rtc-stmp3xxx.c +++ b/drivers/rtc/rtc-stmp3xxx.c @@ -276,18 +276,7 @@ static struct platform_driver stmp3xxx_rtcdrv = { }, }; -static int __init stmp3xxx_rtc_init(void) -{ - return platform_driver_register(&stmp3xxx_rtcdrv); -} - -static void __exit stmp3xxx_rtc_exit(void) -{ - platform_driver_unregister(&stmp3xxx_rtcdrv); -} - -module_init(stmp3xxx_rtc_init); 
-module_exit(stmp3xxx_rtc_exit); +module_platform_driver(stmp3xxx_rtcdrv); MODULE_DESCRIPTION("STMP3xxx RTC Driver"); MODULE_AUTHOR("dmitry pervushin and " diff --git a/drivers/rtc/rtc-v3020.c b/drivers/rtc/rtc-v3020.c index f71c3ce..bca5d67 100644 --- a/drivers/rtc/rtc-v3020.c +++ b/drivers/rtc/rtc-v3020.c @@ -393,18 +393,7 @@ static struct platform_driver rtc_device_driver = { }, }; -static __init int v3020_init(void) -{ - return platform_driver_register(&rtc_device_driver); -} - -static __exit void v3020_exit(void) -{ - platform_driver_unregister(&rtc_device_driver); -} - -module_init(v3020_init); -module_exit(v3020_exit); +module_platform_driver(rtc_device_driver); MODULE_DESCRIPTION("V3020 RTC"); MODULE_AUTHOR("Raphael Assenat"); diff --git a/drivers/rtc/rtc-vr41xx.c b/drivers/rtc/rtc-vr41xx.c index c5698cd..fcbfdda 100644 --- a/drivers/rtc/rtc-vr41xx.c +++ b/drivers/rtc/rtc-vr41xx.c @@ -405,15 +405,4 @@ static struct platform_driver rtc_platform_driver = { }, }; -static int __init vr41xx_rtc_init(void) -{ - return platform_driver_register(&rtc_platform_driver); -} - -static void __exit vr41xx_rtc_exit(void) -{ - platform_driver_unregister(&rtc_platform_driver); -} - -module_init(vr41xx_rtc_init); -module_exit(vr41xx_rtc_exit); +module_platform_driver(rtc_platform_driver); diff --git a/drivers/rtc/rtc-vt8500.c b/drivers/rtc/rtc-vt8500.c index f93f412..9e94fb1 100644 --- a/drivers/rtc/rtc-vt8500.c +++ b/drivers/rtc/rtc-vt8500.c @@ -311,17 +311,7 @@ static struct platform_driver vt8500_rtc_driver = { }, }; -static int __init vt8500_rtc_init(void) -{ - return platform_driver_register(&vt8500_rtc_driver); -} -module_init(vt8500_rtc_init); - -static void __exit vt8500_rtc_exit(void) -{ - platform_driver_unregister(&vt8500_rtc_driver); -} -module_exit(vt8500_rtc_exit); +module_platform_driver(vt8500_rtc_driver); MODULE_AUTHOR("Alexey Charkov "); MODULE_DESCRIPTION("VIA VT8500 SoC Realtime Clock Driver (RTC)"); diff --git a/drivers/rtc/rtc-wm831x.c 
b/drivers/rtc/rtc-wm831x.c index 657c6f6..3b6e6a6 100644 --- a/drivers/rtc/rtc-wm831x.c +++ b/drivers/rtc/rtc-wm831x.c @@ -468,17 +468,7 @@ static struct platform_driver wm831x_rtc_driver = { }, }; -static int __init wm831x_rtc_init(void) -{ - return platform_driver_register(&wm831x_rtc_driver); -} -module_init(wm831x_rtc_init); - -static void __exit wm831x_rtc_exit(void) -{ - platform_driver_unregister(&wm831x_rtc_driver); -} -module_exit(wm831x_rtc_exit); +module_platform_driver(wm831x_rtc_driver); MODULE_AUTHOR("Mark Brown "); MODULE_DESCRIPTION("RTC driver for the WM831x series PMICs"); diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c index 6642142..c2e52d1 100644 --- a/drivers/rtc/rtc-wm8350.c +++ b/drivers/rtc/rtc-wm8350.c @@ -486,17 +486,7 @@ static struct platform_driver wm8350_rtc_driver = { }, }; -static int __init wm8350_rtc_init(void) -{ - return platform_driver_register(&wm8350_rtc_driver); -} -module_init(wm8350_rtc_init); - -static void __exit wm8350_rtc_exit(void) -{ - platform_driver_unregister(&wm8350_rtc_driver); -} -module_exit(wm8350_rtc_exit); +module_platform_driver(wm8350_rtc_driver); MODULE_AUTHOR("Mark Brown "); MODULE_DESCRIPTION("RTC driver for the WM8350"); -- cgit v0.10.2 From a46481d7af1e6c59c03f3ddac400d9054f804952 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Tue, 10 Jan 2012 15:10:52 -0800 Subject: drivers/rtc/rtc-mc13xxx.c: make mc13xxx_rtc_idtable static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Axel Lin Acked-by: Uwe Kleine-König Cc: Alessandro Zummo Acked-by: Jean-Christophe PLAGNIOL-VILLARD Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/rtc/rtc-mc13xxx.c b/drivers/rtc/rtc-mc13xxx.c index 9d0c3b4..546f685 100644 --- a/drivers/rtc/rtc-mc13xxx.c +++ b/drivers/rtc/rtc-mc13xxx.c @@ -399,7 +399,7 @@ static int __exit mc13xxx_rtc_remove(struct platform_device *pdev) return 0; } -const struct platform_device_id 
mc13xxx_rtc_idtable[] = { +static const struct platform_device_id mc13xxx_rtc_idtable[] = { { .name = "mc13783-rtc", }, { -- cgit v0.10.2 From 681d0378a9057a92b9e6e51c2112e53d920a092d Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Tue, 10 Jan 2012 15:10:55 -0800 Subject: drivers/rtc/rtc-jz4740.c: make jz4740_rtc_driver static Signed-off-by: Axel Lin Cc: Lars-Peter Clausen Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/rtc/rtc-jz4740.c b/drivers/rtc/rtc-jz4740.c index 1481e36..05ab227 100644 --- a/drivers/rtc/rtc-jz4740.c +++ b/drivers/rtc/rtc-jz4740.c @@ -345,7 +345,7 @@ static const struct dev_pm_ops jz4740_pm_ops = { #define JZ4740_RTC_PM_OPS NULL #endif /* CONFIG_PM */ -struct platform_driver jz4740_rtc_driver = { +static struct platform_driver jz4740_rtc_driver = { .probe = jz4740_rtc_probe, .remove = __devexit_p(jz4740_rtc_remove), .driver = { -- cgit v0.10.2 From 6c3fb55793f79bc975df0494c4d56ea6f0b0cc45 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Tue, 10 Jan 2012 15:10:58 -0800 Subject: drivers/rtc/: remove redundant spi driver bus initialization In ancient times it was necessary to manually initialize the bus field of an spi_driver to spi_bus_type. These days this is done in spi_driver_register(), so we can drop the manual assignment. 
The patch was generated using the following coccinelle semantic patch: // @@ identifier _driver; @@ struct spi_driver _driver = { .driver = { - .bus = &spi_bus_type, }, }; // Signed-off-by: Lars-Peter Clausen Cc: John Stultz Cc: Alessandro Zummo Cc: Grant Likely Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/rtc/rtc-m41t93.c b/drivers/rtc/rtc-m41t93.c index 7317d3b..ef71132 100644 --- a/drivers/rtc/rtc-m41t93.c +++ b/drivers/rtc/rtc-m41t93.c @@ -200,7 +200,6 @@ static int __devexit m41t93_remove(struct spi_device *spi) static struct spi_driver m41t93_driver = { .driver = { .name = "rtc-m41t93", - .bus = &spi_bus_type, .owner = THIS_MODULE, }, .probe = m41t93_probe, diff --git a/drivers/rtc/rtc-m41t94.c b/drivers/rtc/rtc-m41t94.c index e259ed7..2a4721f 100644 --- a/drivers/rtc/rtc-m41t94.c +++ b/drivers/rtc/rtc-m41t94.c @@ -147,7 +147,6 @@ static int __devexit m41t94_remove(struct spi_device *spi) static struct spi_driver m41t94_driver = { .driver = { .name = "rtc-m41t94", - .bus = &spi_bus_type, .owner = THIS_MODULE, }, .probe = m41t94_probe, diff --git a/drivers/rtc/rtc-max6902.c b/drivers/rtc/rtc-max6902.c index 0ec3f58..1f6b3cc 100644 --- a/drivers/rtc/rtc-max6902.c +++ b/drivers/rtc/rtc-max6902.c @@ -154,7 +154,6 @@ static int __devexit max6902_remove(struct spi_device *spi) static struct spi_driver max6902_driver = { .driver = { .name = "rtc-max6902", - .bus = &spi_bus_type, .owner = THIS_MODULE, }, .probe = max6902_probe, diff --git a/drivers/rtc/rtc-pcf2123.c b/drivers/rtc/rtc-pcf2123.c index 2ee3bbf..b46c400 100644 --- a/drivers/rtc/rtc-pcf2123.c +++ b/drivers/rtc/rtc-pcf2123.c @@ -340,7 +340,6 @@ static int __devexit pcf2123_remove(struct spi_device *spi) static struct spi_driver pcf2123_driver = { .driver = { .name = "rtc-pcf2123", - .bus = &spi_bus_type, .owner = THIS_MODULE, }, .probe = pcf2123_probe, diff --git a/drivers/rtc/rtc-rs5c348.c b/drivers/rtc/rtc-rs5c348.c index 971bc8e..ce2ca85 100644 --- 
a/drivers/rtc/rtc-rs5c348.c +++ b/drivers/rtc/rtc-rs5c348.c @@ -229,7 +229,6 @@ static int __devexit rs5c348_remove(struct spi_device *spi) static struct spi_driver rs5c348_driver = { .driver = { .name = "rtc-rs5c348", - .bus = &spi_bus_type, .owner = THIS_MODULE, }, .probe = rs5c348_probe, -- cgit v0.10.2 From 948170f8944dfd29d13612fff48110a9814daeb1 Mon Sep 17 00:00:00 2001 From: Benoit Cousson Date: Tue, 10 Jan 2012 15:10:59 -0800 Subject: drivers/rtc/rtc-twl.c: add DT support for RTC inside twl4030/twl6030 Add the DT support for the TI rtc-twl present in the twl4030 and twl6030 devices. Signed-off-by: Benoit Cousson Acked-by: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/devicetree/bindings/rtc/twl-rtc.txt b/Documentation/devicetree/bindings/rtc/twl-rtc.txt new file mode 100644 index 0000000..596e0c9 --- /dev/null +++ b/Documentation/devicetree/bindings/rtc/twl-rtc.txt @@ -0,0 +1,12 @@ +* TI twl RTC + +The TWL family (twl4030/6030) contains a RTC. 
+ +Required properties: +- compatible : Should be twl4030-rtc + +Examples: + +rtc@0 { + compatible = "ti,twl4030-rtc"; +}; diff --git a/drivers/rtc/rtc-twl.c b/drivers/rtc/rtc-twl.c index 20687d5..d43b4f6 100644 --- a/drivers/rtc/rtc-twl.c +++ b/drivers/rtc/rtc-twl.c @@ -550,6 +550,11 @@ static int twl_rtc_resume(struct platform_device *pdev) #define twl_rtc_resume NULL #endif +static const struct of_device_id twl_rtc_of_match[] = { + {.compatible = "ti,twl4030-rtc", }, + { }, +}; +MODULE_DEVICE_TABLE(of, twl_rtc_of_match); MODULE_ALIAS("platform:twl_rtc"); static struct platform_driver twl4030rtc_driver = { @@ -559,8 +564,9 @@ static struct platform_driver twl4030rtc_driver = { .suspend = twl_rtc_suspend, .resume = twl_rtc_resume, .driver = { - .owner = THIS_MODULE, - .name = "twl_rtc", + .owner = THIS_MODULE, + .name = "twl_rtc", + .of_match_table = twl_rtc_of_match, }, }; -- cgit v0.10.2 From e74a8f2edb92cb690b467cea0ab652c509e9f624 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 10 Jan 2012 15:11:02 -0800 Subject: drivers/rtc/interface.c: fix alarm rollover when day or month is out-of-range Commit f44f7f96a20a ("RTC: Initialize kernel state from RTC") introduced a potential infinite loop. If an alarm time contains a wildcard month and an invalid day (> 31), or a wildcard year and an invalid month (>= 12), the loop searching for the next matching date will never terminate. Treat the invalid values as wildcards. Fixes , Reported-by: leo weppelman Reported-by: "P. 
van Gaans" Signed-off-by: Ben Hutchings Signed-off-by: Jonathan Nieder Cc: Mark Brown Cc: Marcelo Roberto Jimenez Cc: Thomas Gleixner Cc: John Stultz Acked-by: Alessandro Zummo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c index 8e28625..8a1c031 100644 --- a/drivers/rtc/interface.c +++ b/drivers/rtc/interface.c @@ -228,11 +228,11 @@ int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) alarm->time.tm_hour = now.tm_hour; /* For simplicity, only support date rollover for now */ - if (alarm->time.tm_mday == -1) { + if (alarm->time.tm_mday < 1 || alarm->time.tm_mday > 31) { alarm->time.tm_mday = now.tm_mday; missing = day; } - if (alarm->time.tm_mon == -1) { + if ((unsigned)alarm->time.tm_mon >= 12) { alarm->time.tm_mon = now.tm_mon; if (missing == none) missing = month; -- cgit v0.10.2 From b18c1c6e0c90cbcd38ba879bd63a44c94e4f7301 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 10 Jan 2012 15:11:05 -0800 Subject: reiserfs: delete comments referring to the BKL Signed-off-by: Davidlohr Bueso Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index eb71106..cce8e87 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2896,14 +2896,13 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th, journal->j_cnode_free < (journal->j_trans_max * 3)) { return 1; } - /* protected by the BKL here */ + journal->j_len_alloc += new_alloc; th->t_blocks_allocated += new_alloc ; return 0; } -/* this must be called inside a transaction, and requires the -** kernel_lock to be held +/* this must be called inside a transaction */ void reiserfs_block_writes(struct reiserfs_transaction_handle *th) { @@ -2914,8 +2913,7 @@ void reiserfs_block_writes(struct reiserfs_transaction_handle *th) return; } -/* this must be called without a transaction started, and does not -** require 
BKL +/* this must be called without a transaction started */ void reiserfs_allow_writes(struct super_block *s) { @@ -2924,8 +2922,7 @@ void reiserfs_allow_writes(struct super_block *s) wake_up(&journal->j_join_wait); } -/* this must be called without a transaction started, and does not -** require BKL +/* this must be called without a transaction started */ void reiserfs_wait_on_write_block(struct super_block *s) { -- cgit v0.10.2 From f32485be8397ad811312bc055d2e2a5906bc7576 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 10 Jan 2012 15:11:07 -0800 Subject: reiserfs: delay reiserfs lock until journal initialization In the mount path, transactions that are made before journal initialization don't involve the filesystem. We can delay the reiserfs lock until we play with the journal. Signed-off-by: Frederic Weisbecker Cc: Al Viro Cc: Christoph Hellwig Cc: Jeff Mahoney Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index a945cd2..70de42f 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -1364,10 +1364,7 @@ int reiserfs_init_bitmap_cache(struct super_block *sb) struct reiserfs_bitmap_info *bitmap; unsigned int bmap_nr = reiserfs_bmap_count(sb); - /* Avoid lock recursion in fault case */ - reiserfs_write_unlock(sb); bitmap = vmalloc(sizeof(*bitmap) * bmap_nr); - reiserfs_write_lock(sb); if (bitmap == NULL) return -ENOMEM; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 1d42e70..620dd5d 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1746,22 +1746,11 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) mutex_init(&REISERFS_SB(s)->lock); REISERFS_SB(s)->lock_depth = -1; - /* - * This function is called with the bkl, which also was the old - * locking used here. - * do_journal_begin() will soon check if we hold the lock (ie: was the - * bkl). 
This is likely because do_journal_begin() has several another - * callers because at this time, it doesn't seem to be necessary to - * protect against anything. - * Anyway, let's be conservative and lock for now. - */ - reiserfs_write_lock(s); - jdev_name = NULL; if (reiserfs_parse_options (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name, &commit_max_age, qf_names, &qfmt) == 0) { - goto error; + goto error_unlocked; } if (jdev_name && jdev_name[0]) { REISERFS_SB(s)->s_jdev = kstrdup(jdev_name, GFP_KERNEL); @@ -1777,7 +1766,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if (blocks) { SWARN(silent, s, "jmacd-7", "resize option for remount only"); - goto error; + goto error_unlocked; } /* try old format (undistributed bitmap, super block in 8-th 1k block of a device) */ @@ -1787,7 +1776,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) { SWARN(silent, s, "sh-2021", "can not find reiserfs on %s", reiserfs_bdevname(s)); - goto error; + goto error_unlocked; } rs = SB_DISK_SUPER_BLOCK(s); @@ -1803,7 +1792,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) "or increase size of your LVM partition"); SWARN(silent, s, "", "Or may be you forgot to " "reboot after fdisk when it told you to"); - goto error; + goto error_unlocked; } sbi->s_mount_state = SB_REISERFS_STATE(s); @@ -1811,8 +1800,9 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if ((errval = reiserfs_init_bitmap_cache(s))) { SWARN(silent, s, "jmacd-8", "unable to read bitmap"); - goto error; + goto error_unlocked; } + errval = -EINVAL; #ifdef CONFIG_REISERFS_CHECK SWARN(silent, s, "", "CONFIG_REISERFS_CHECK is set ON"); @@ -1835,6 +1825,17 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if (reiserfs_barrier_flush(s)) { printk("reiserfs: using flush barriers\n"); } + + /* + * This 
path assumed to be called with the BKL in the old times. + * Now we have inherited the big reiserfs lock from it and many + * reiserfs helpers called in the mount path and elsewhere require + * this lock to be held even if it's not always necessary. Let's be + * conservative and hold it early. The window can be reduced after + * careful review of the code. + */ + reiserfs_write_lock(s); + // set_device_ro(s->s_dev, 1) ; if (journal_init(s, jdev_name, old_format, commit_max_age)) { SWARN(silent, s, "sh-2022", @@ -1995,12 +1996,16 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) return (0); error: - if (jinit_done) { /* kill the commit thread, free journal ram */ + reiserfs_write_unlock(s); + +error_unlocked: + /* kill the commit thread, free journal ram */ + if (jinit_done) { + reiserfs_write_lock(s); journal_release_error(NULL, s); + reiserfs_write_unlock(s); } - reiserfs_write_unlock(s); - reiserfs_free_bitmap_cache(s); if (SB_BUFFER_WITH_SB(s)) brelse(SB_BUFFER_WITH_SB(s)); -- cgit v0.10.2 From 37c69b98d0dca54d9eb72226bbf2e211aaaf126e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 10 Jan 2012 15:11:09 -0800 Subject: reiserfs: don't lock journal_init() journal_init() doesn't need the lock since no operation on the filesystem is involved there. journal_read() and get_list_bitmap() have yet to be reviewed carefully though before removing the lock there. Just keep it around these two calls for safety.
Signed-off-by: Frederic Weisbecker Cc: Al Viro Cc: Christoph Hellwig Cc: Jeff Mahoney Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index cce8e87..c3cf54f 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2678,16 +2678,10 @@ int journal_init(struct super_block *sb, const char *j_dev_name, char b[BDEVNAME_SIZE]; int ret; - /* - * Unlock here to avoid various RECLAIM-FS-ON <-> IN-RECLAIM-FS - * dependency inversion warnings. - */ - reiserfs_write_unlock(sb); journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal)); if (!journal) { reiserfs_warning(sb, "journal-1256", "unable to get memory for journal structure"); - reiserfs_write_lock(sb); return 1; } INIT_LIST_HEAD(&journal->j_bitmap_nodes); @@ -2695,10 +2689,8 @@ int journal_init(struct super_block *sb, const char *j_dev_name, INIT_LIST_HEAD(&journal->j_working_list); INIT_LIST_HEAD(&journal->j_journal_list); journal->j_persistent_trans = 0; - ret = reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap, - reiserfs_bmap_count(sb)); - reiserfs_write_lock(sb); - if (ret) + if (reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap, + reiserfs_bmap_count(sb))) goto free_and_return; allocate_bitmap_nodes(sb); @@ -2727,27 +2719,11 @@ int journal_init(struct super_block *sb, const char *j_dev_name, goto free_and_return; } - /* - * We need to unlock here to avoid creating the following - * dependency: - * reiserfs_lock -> sysfs_mutex - * Because the reiserfs mmap path creates the following dependency: - * mm->mmap -> reiserfs_lock, hence we have - * mm->mmap -> reiserfs_lock ->sysfs_mutex - * This would ends up in a circular dependency with sysfs readdir path - * which does sysfs_mutex -> mm->mmap_sem - * This is fine because the reiserfs lock is useless in mount path, - * at least until we call journal_begin. We keep it for paranoid - * reasons. 
- */ - reiserfs_write_unlock(sb); if (journal_init_dev(sb, journal, j_dev_name) != 0) { - reiserfs_write_lock(sb); reiserfs_warning(sb, "sh-462", "unable to initialize jornal device"); goto free_and_return; } - reiserfs_write_lock(sb); rs = SB_DISK_SUPER_BLOCK(sb); @@ -2829,9 +2805,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name, journal->j_mount_id = 10; journal->j_state = 0; atomic_set(&(journal->j_jlock), 0); - reiserfs_write_unlock(sb); journal->j_cnode_free_list = allocate_cnodes(num_cnodes); - reiserfs_write_lock(sb); journal->j_cnode_free_orig = journal->j_cnode_free_list; journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0; journal->j_cnode_used = 0; @@ -2848,24 +2822,37 @@ int journal_init(struct super_block *sb, const char *j_dev_name, init_journal_hash(sb); jl = journal->j_current_jl; + + /* + * get_list_bitmap() may call flush_commit_list() which + * requires the lock. Calling flush_commit_list() shouldn't happen + * this early but I like to be paranoid. + */ + reiserfs_write_lock(sb); jl->j_list_bitmap = get_list_bitmap(sb, jl); + reiserfs_write_unlock(sb); if (!jl->j_list_bitmap) { reiserfs_warning(sb, "journal-2005", "get_list_bitmap failed for journal list 0"); goto free_and_return; } - if (journal_read(sb) < 0) { + + /* + * Journal_read needs to be inspected in order to push down + * the lock further inside (or even remove it). 
+ */ + reiserfs_write_lock(sb); + ret = journal_read(sb); + reiserfs_write_unlock(sb); + if (ret < 0) { reiserfs_warning(sb, "reiserfs-2006", "Replay Failure, unable to mount"); goto free_and_return; } reiserfs_mounted_fs_count++; - if (reiserfs_mounted_fs_count <= 1) { - reiserfs_write_unlock(sb); + if (reiserfs_mounted_fs_count <= 1) commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0); - reiserfs_write_lock(sb); - } INIT_DELAYED_WORK(&journal->j_work, flush_async_commits); journal->j_work_sb = sb; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 620dd5d..61b6038 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1826,6 +1826,17 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) printk("reiserfs: using flush barriers\n"); } + // set_device_ro(s->s_dev, 1) ; + if (journal_init(s, jdev_name, old_format, commit_max_age)) { + SWARN(silent, s, "sh-2022", + "unable to initialize journal space"); + goto error_unlocked; + } else { + jinit_done = 1; /* once this is set, journal_release must be called + ** if we error out of the mount + */ + } + /* * This path assumed to be called with the BKL in the old times. 
* Now we have inherited the big reiserfs lock from it and many @@ -1836,16 +1847,6 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) */ reiserfs_write_lock(s); - // set_device_ro(s->s_dev, 1) ; - if (journal_init(s, jdev_name, old_format, commit_max_age)) { - SWARN(silent, s, "sh-2022", - "unable to initialize journal space"); - goto error; - } else { - jinit_done = 1; /* once this is set, journal_release must be called - ** if we error out of the mount - */ - } if (reread_meta_blocks(s)) { SWARN(silent, s, "jmacd-9", "unable to reread meta blocks after journal init"); -- cgit v0.10.2 From 9b467e6ebebbe75288aeb7e816ffbb5d35d6eaa3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 10 Jan 2012 15:11:11 -0800 Subject: reiserfs: don't lock root inode searching Nothing requires that we lock the filesystem until the root inode is provided. Also iget5_locked() triggers a warning because we are holding the filesystem lock while allocating the inode, which results in a lockdep suspicion that we have a lock inversion against the reclaim path: [ 1986.896979] ================================= [ 1986.896990] [ INFO: inconsistent lock state ] [ 1986.896997] 3.1.1-main #8 [ 1986.897001] --------------------------------- [ 1986.897007] inconsistent {RECLAIM_FS-ON-W} -> {IN-RECLAIM_FS-W} usage.
[ 1986.897016] kswapd0/16 [HC0[0]:SC0[0]:HE1:SE1] takes: [ 1986.897023] (&REISERFS_SB(s)->lock){+.+.?.}, at: [] reiserfs_write_lock+0x20/0x2a [ 1986.897044] {RECLAIM_FS-ON-W} state was registered at: [ 1986.897050] [] mark_held_locks+0xae/0xd0 [ 1986.897060] [] lockdep_trace_alloc+0x7d/0x91 [ 1986.897068] [] kmem_cache_alloc+0x1a/0x93 [ 1986.897078] [] reiserfs_alloc_inode+0x13/0x3d [ 1986.897088] [] alloc_inode+0x14/0x5f [ 1986.897097] [] iget5_locked+0x62/0x13a [ 1986.897106] [] reiserfs_fill_super+0x410/0x8b9 [ 1986.897114] [] mount_bdev+0x10b/0x159 [ 1986.897123] [] get_super_block+0x10/0x12 [ 1986.897131] [] mount_fs+0x59/0x12d [ 1986.897138] [] vfs_kern_mount+0x45/0x7a [ 1986.897147] [] do_kern_mount+0x2f/0xb0 [ 1986.897155] [] do_mount+0x5c2/0x612 [ 1986.897163] [] sys_mount+0x61/0x8f [ 1986.897170] [] sysenter_do_call+0x12/0x32 [ 1986.897181] irq event stamp: 7509691 [ 1986.897186] hardirqs last enabled at (7509691): [] kmem_cache_alloc+0x6e/0x93 [ 1986.897197] hardirqs last disabled at (7509690): [] kmem_cache_alloc+0x24/0x93 [ 1986.897209] softirqs last enabled at (7508896): [] __do_softirq+0xee/0xfd [ 1986.897222] softirqs last disabled at (7508859): [] do_softirq+0x50/0x9d [ 1986.897234] [ 1986.897235] other info that might help us debug this: [ 1986.897242] Possible unsafe locking scenario: [ 1986.897244] [ 1986.897250] CPU0 [ 1986.897254] ---- [ 1986.897257] lock(&REISERFS_SB(s)->lock); [ 1986.897265] [ 1986.897269] lock(&REISERFS_SB(s)->lock); [ 1986.897276] [ 1986.897277] *** DEADLOCK *** [ 1986.897278] [ 1986.897286] no locks held by kswapd0/16. [ 1986.897291] [ 1986.897292] stack backtrace: [ 1986.897299] Pid: 16, comm: kswapd0 Not tainted 3.1.1-main #8 [ 1986.897306] Call Trace: [ 1986.897314] [] ? printk+0xf/0x11 [ 1986.897324] [] print_usage_bug+0x20e/0x21a [ 1986.897332] [] ? 
print_irq_inversion_bug+0x172/0x172 [ 1986.897341] [] mark_lock+0x27f/0x483 [ 1986.897349] [] __lock_acquire+0x628/0x1472 [ 1986.897358] [] lock_acquire+0x47/0x5e [ 1986.897366] [] ? reiserfs_write_lock+0x20/0x2a [ 1986.897384] [] ? reiserfs_write_lock+0x20/0x2a [ 1986.897397] [] mutex_lock_nested+0x35/0x26f [ 1986.897409] [] ? reiserfs_write_lock+0x20/0x2a [ 1986.897421] [] reiserfs_write_lock+0x20/0x2a [ 1986.897433] [] map_block_for_writepage+0xc9/0x590 [ 1986.897448] [] ? create_empty_buffers+0x33/0x8f [ 1986.897461] [] ? get_parent_ip+0xb/0x31 [ 1986.897472] [] ? sub_preempt_count+0x81/0x8e [ 1986.897485] [] ? _raw_spin_unlock+0x27/0x3d [ 1986.897496] [] ? get_parent_ip+0xb/0x31 [ 1986.897508] [] reiserfs_writepage+0x1b9/0x3e7 [ 1986.897521] [] ? clear_page_dirty_for_io+0xcb/0xde [ 1986.897533] [] ? trace_hardirqs_on_caller+0x108/0x138 [ 1986.897546] [] ? trace_hardirqs_on+0xb/0xd [ 1986.897559] [] shrink_page_list+0x34f/0x5e2 [ 1986.897572] [] shrink_inactive_list+0x172/0x22c [ 1986.897585] [] shrink_zone+0x303/0x3b1 [ 1986.897597] [] ? _raw_spin_unlock+0x27/0x3d [ 1986.897611] [] kswapd+0x3b7/0x5f2 The deadlock shouldn't happen since we are doing that allocation in the mount path, the filesystem is not available for any reclaim. Still the warning is annoying. To solve this, acquire the lock later only where we need it, right before calling reiserfs_read_locked_inode() that wants to lock to walk the tree. 
Reported-by: Knut Petersen Signed-off-by: Frederic Weisbecker Cc: Al Viro Cc: Christoph Hellwig Cc: Jeff Mahoney Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 61b6038..e12d8b9 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1519,9 +1519,7 @@ static int read_super_block(struct super_block *s, int offset) static int reread_meta_blocks(struct super_block *s) { ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s))); - reiserfs_write_unlock(s); wait_on_buffer(SB_BUFFER_WITH_SB(s)); - reiserfs_write_lock(s); if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { reiserfs_warning(s, "reiserfs-2504", "error reading the super"); return 1; @@ -1837,24 +1835,14 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) */ } - /* - * This path assumed to be called with the BKL in the old times. - * Now we have inherited the big reiserfs lock from it and many - * reiserfs helpers called in the mount path and elsewhere require - * this lock to be held even if it's not always necessary. Let's be - * conservative and hold it early. The window can be reduced after - * careful review of the code. - */ - reiserfs_write_lock(s); - if (reread_meta_blocks(s)) { SWARN(silent, s, "jmacd-9", "unable to reread meta blocks after journal init"); - goto error; + goto error_unlocked; } if (replay_only(s)) - goto error; + goto error_unlocked; if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) { SWARN(silent, s, "clm-7000", @@ -1868,9 +1856,19 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) reiserfs_init_locked_inode, (void *)(&args)); if (!root_inode) { SWARN(silent, s, "jmacd-10", "get root inode failed"); - goto error; + goto error_unlocked; } + /* + * This path assumed to be called with the BKL in the old times. 
+ * Now we have inherited the big reiserfs lock from it and many + * reiserfs helpers called in the mount path and elsewhere require + * this lock to be held even if it's not always necessary. Let's be + * conservative and hold it early. The window can be reduced after + * careful review of the code. + */ + reiserfs_write_lock(s); + if (root_inode->i_state & I_NEW) { reiserfs_read_locked_inode(root_inode, &args); unlock_new_inode(root_inode); -- cgit v0.10.2 From f350b1778f1b7713ef54fbc7e079e09e2fe098b9 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 10 Jan 2012 15:11:14 -0800 Subject: sparc: make SA_NOMASK a synonym of SA_NODEFER Unlike other architectures, sparc currently has no SA_NODEFER definition but only the older SA_NOMASK. Since SA_NOMASK is the historical name for SA_NODEFER, add SA_NODEFER and copy what other architectures do by making SA_NOMASK a synonym for SA_NODEFER. Signed-off-by: Matt Fleming Acked-by: Oleg Nesterov Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/sparc/include/asm/signal.h b/arch/sparc/include/asm/signal.h index e49b828..aa42fe3 100644 --- a/arch/sparc/include/asm/signal.h +++ b/arch/sparc/include/asm/signal.h @@ -143,10 +143,11 @@ struct sigstack { #define SA_ONSTACK _SV_SSTACK #define SA_RESTART _SV_INTR #define SA_ONESHOT _SV_RESET -#define SA_NOMASK 0x20u +#define SA_NODEFER 0x20u #define SA_NOCLDWAIT 0x100u #define SA_SIGINFO 0x200u +#define SA_NOMASK SA_NODEFER #define SIG_BLOCK 0x01 /* for blocking signals */ #define SIG_UNBLOCK 0x02 /* for unblocking signals */ -- cgit v0.10.2 From 5e6292c0f28f03dfdb8ea3d685f0b838a23bfba4 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 10 Jan 2012 15:11:17 -0800 Subject: signal: add block_sigmask() for adding sigmask to current->blocked Abstract the code sequence for adding a signal handler's sa_mask to current->blocked because the sequence is identical for all architectures. 
Furthermore, in the past some architectures actually got this code wrong, so introduce a wrapper that all architectures can use. Signed-off-by: Matt Fleming Signed-off-by: Oleg Nesterov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Tejun Heo Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 54ddaeb2..46a01bdc 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -682,7 +682,6 @@ static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs) { - sigset_t blocked; int ret; /* Are we from a system call? */ @@ -733,10 +732,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, */ regs->flags &= ~X86_EFLAGS_TF; - sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask); - if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(&blocked, sig); - set_current_blocked(&blocked); + block_sigmask(ka, sig); tracehook_signal_handler(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP)); diff --git a/include/linux/signal.h b/include/linux/signal.h index a822300..7987ce74 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -254,6 +254,7 @@ extern void set_current_blocked(const sigset_t *); extern int show_unhandled_signals; extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie); +extern void block_sigmask(struct k_sigaction *ka, int signr); extern void exit_signals(struct task_struct *tsk); extern struct kmem_cache *sighand_cachep; diff --git a/kernel/signal.c b/kernel/signal.c index bb0efa5..d532f17 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2318,6 +2318,27 @@ relock: return signr; } +/** + * block_sigmask - add @ka's signal mask to current->blocked + * @ka: action for @signr + * @signr: signal that has been successfully delivered + * + * This function should be called when a signal has succesfully been +
* delivered. It adds the mask of signals for @ka to current->blocked + * so that they are blocked during the execution of the signal + * handler. In addition, @signr will be blocked unless %SA_NODEFER is + * set in @ka->sa.sa_flags. + */ +void block_sigmask(struct k_sigaction *ka, int signr) +{ + sigset_t blocked; + + sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask); + if (!(ka->sa.sa_flags & SA_NODEFER)) + sigaddset(&blocked, signr); + set_current_blocked(&blocked); +} + /* * It could be that complete_signal() picked us to notify about the * group-wide signal. Other threads should be notified now to take -- cgit v0.10.2 From 7773fbc54182a90cd248656619c7d33859e5f91d Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 10 Jan 2012 15:11:20 -0800 Subject: procfs: make proc_get_link to use dentry instead of inode Prepare the ground for the next "map_files" patch which needs a name of a link file to analyse. Signed-off-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Tejun Heo Cc: Vasiliy Kulikov Cc: "Kirill A.
Shutemov" Cc: Alexey Dobriyan Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/proc/base.c b/fs/proc/base.c index 1aab5fe..e31d950 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -166,9 +166,9 @@ static int get_task_root(struct task_struct *task, struct path *root) return result; } -static int proc_cwd_link(struct inode *inode, struct path *path) +static int proc_cwd_link(struct dentry *dentry, struct path *path) { - struct task_struct *task = get_proc_task(inode); + struct task_struct *task = get_proc_task(dentry->d_inode); int result = -ENOENT; if (task) { @@ -183,9 +183,9 @@ static int proc_cwd_link(struct inode *inode, struct path *path) return result; } -static int proc_root_link(struct inode *inode, struct path *path) +static int proc_root_link(struct dentry *dentry, struct path *path) { - struct task_struct *task = get_proc_task(inode); + struct task_struct *task = get_proc_task(dentry->d_inode); int result = -ENOENT; if (task) { @@ -1456,13 +1456,13 @@ static const struct file_operations proc_pid_set_comm_operations = { .release = single_release, }; -static int proc_exe_link(struct inode *inode, struct path *exe_path) +static int proc_exe_link(struct dentry *dentry, struct path *exe_path) { struct task_struct *task; struct mm_struct *mm; struct file *exe_file; - task = get_proc_task(inode); + task = get_proc_task(dentry->d_inode); if (!task) return -ENOENT; mm = get_task_mm(task); @@ -1492,7 +1492,7 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) if (!proc_fd_access_allowed(inode)) goto out; - error = PROC_I(inode)->op.proc_get_link(inode, &nd->path); + error = PROC_I(inode)->op.proc_get_link(dentry, &nd->path); out: return ERR_PTR(error); } @@ -1531,7 +1531,7 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b if (!proc_fd_access_allowed(inode)) goto out; - error = PROC_I(inode)->op.proc_get_link(inode, &path); + error = 
PROC_I(inode)->op.proc_get_link(dentry, &path); if (error) goto out; @@ -1823,9 +1823,9 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info) return -ENOENT; } -static int proc_fd_link(struct inode *inode, struct path *path) +static int proc_fd_link(struct dentry *dentry, struct path *path) { - return proc_fd_info(inode, path, NULL); + return proc_fd_info(dentry->d_inode, path, NULL); } static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 6d9e575..85c5073 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -253,7 +253,7 @@ extern const struct proc_ns_operations utsns_operations; extern const struct proc_ns_operations ipcns_operations; union proc_op { - int (*proc_get_link)(struct inode *, struct path *); + int (*proc_get_link)(struct dentry *, struct path *); int (*proc_read)(struct task_struct *task, char *page); int (*proc_show)(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, -- cgit v0.10.2 From 640708a2cff7f81e246243b0073c66e6ece7e53e Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Tue, 10 Jan 2012 15:11:23 -0800 Subject: procfs: introduce the /proc/<pid>/map_files/ directory This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks, one for each mapping with a file; the name of a symlink is "vma->vm_start-vma->vm_end", the target is the file. Opening a symlink results in a file that points to exactly the same inode as the vma's one.
For example the ls -l of some arbitrary /proc/<pid>/map_files/ | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so This *helps* the checkpointing process in three ways: 1. When dumping a task's mappings we know the exact file that is mapped by a particular region. We do this by opening the /proc/$pid/map_files/$address symlink the way we do with file descriptors. 2. This also helps in determining which anonymous shared mappings are shared with each other by comparing their inodes. 3. When restoring a set of processes, in case two of them have a shared mapping, we map the memory by the 1st one and then open its /proc/$pid/map_files/$address file and map it by the 2nd task. Using /proc/$pid/maps for this is quite inconvenient since it requires repeated re-reading and reparsing of this text file, which slows down the restore procedure significantly. Also, as pointed out in (3), it is much easier to use the top level shared mapping in children as /proc/$pid/map_files/$address when needed. [akpm@linux-foundation.org: coding-style fixes] [gorcunov@openvz.org: make map_files depend on CHECKPOINT_RESTORE] Signed-off-by: Pavel Emelyanov Signed-off-by: Cyrill Gorcunov Reviewed-by: Vasiliy Kulikov Reviewed-by: "Kirill A. 
Shutemov" Cc: Tejun Heo Cc: Alexey Dobriyan Cc: Al Viro Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/proc/base.c b/fs/proc/base.c index e31d950..4d755fe 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -83,6 +83,7 @@ #include #include #include +#include #ifdef CONFIG_HARDWALL #include #endif @@ -134,6 +135,8 @@ struct pid_entry { NULL, &proc_single_file_operations, \ { .proc_show = show } ) +static int proc_fd_permission(struct inode *inode, int mask); + /* * Count the number of hardlinks for the pid_entry table, excluding the . * and .. links. @@ -2046,6 +2049,355 @@ static const struct file_operations proc_fd_operations = { .llseek = default_llseek, }; +#ifdef CONFIG_CHECKPOINT_RESTORE + +/* + * dname_to_vma_addr - maps a dentry name into two unsigned longs + * which represent vma start and end addresses. + */ +static int dname_to_vma_addr(struct dentry *dentry, + unsigned long *start, unsigned long *end) +{ + if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2) + return -EINVAL; + + return 0; +} + +static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + unsigned long vm_start, vm_end; + bool exact_vma_exists = false; + struct mm_struct *mm = NULL; + struct task_struct *task; + const struct cred *cred; + struct inode *inode; + int status = 0; + + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + if (!capable(CAP_SYS_ADMIN)) { + status = -EACCES; + goto out_notask; + } + + inode = dentry->d_inode; + task = get_proc_task(inode); + if (!task) + goto out_notask; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out; + + mm = get_task_mm(task); + if (!mm) + goto out; + + if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) { + down_read(&mm->mmap_sem); + exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end); + up_read(&mm->mmap_sem); + } + + mmput(mm); + + if (exact_vma_exists) { + if (task_dumpable(task)) { + rcu_read_lock(); + cred = __task_cred(task); + 
inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); + } else { + inode->i_uid = 0; + inode->i_gid = 0; + } + security_task_to_inode(task, inode); + status = 1; + } + +out: + put_task_struct(task); + +out_notask: + if (status <= 0) + d_drop(dentry); + + return status; +} + +static const struct dentry_operations tid_map_files_dentry_operations = { + .d_revalidate = map_files_d_revalidate, + .d_delete = pid_delete_dentry, +}; + +static int proc_map_files_get_link(struct dentry *dentry, struct path *path) +{ + unsigned long vm_start, vm_end; + struct vm_area_struct *vma; + struct task_struct *task; + struct mm_struct *mm; + int rc; + + rc = -ENOENT; + task = get_proc_task(dentry->d_inode); + if (!task) + goto out; + + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + goto out; + + rc = dname_to_vma_addr(dentry, &vm_start, &vm_end); + if (rc) + goto out_mmput; + + down_read(&mm->mmap_sem); + vma = find_exact_vma(mm, vm_start, vm_end); + if (vma && vma->vm_file) { + *path = vma->vm_file->f_path; + path_get(path); + rc = 0; + } + up_read(&mm->mmap_sem); + +out_mmput: + mmput(mm); +out: + return rc; +} + +struct map_files_info { + struct file *file; + unsigned long len; + unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ +}; + +static struct dentry * +proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, + struct task_struct *task, const void *ptr) +{ + const struct file *file = ptr; + struct proc_inode *ei; + struct inode *inode; + + if (!file) + return ERR_PTR(-ENOENT); + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + return ERR_PTR(-ENOENT); + + ei = PROC_I(inode); + ei->op.proc_get_link = proc_map_files_get_link; + + inode->i_op = &proc_pid_link_inode_operations; + inode->i_size = 64; + inode->i_mode = S_IFLNK; + + if (file->f_mode & FMODE_READ) + inode->i_mode |= S_IRUSR; + if (file->f_mode & FMODE_WRITE) + inode->i_mode |= S_IWUSR; + + d_set_d_op(dentry, &tid_map_files_dentry_operations); + 
d_add(dentry, inode); + + return NULL; +} + +static struct dentry *proc_map_files_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) +{ + unsigned long vm_start, vm_end; + struct vm_area_struct *vma; + struct task_struct *task; + struct dentry *result; + struct mm_struct *mm; + + result = ERR_PTR(-EACCES); + if (!capable(CAP_SYS_ADMIN)) + goto out; + + result = ERR_PTR(-ENOENT); + task = get_proc_task(dir); + if (!task) + goto out; + + result = ERR_PTR(-EACCES); + if (lock_trace(task)) + goto out_put_task; + + result = ERR_PTR(-ENOENT); + if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) + goto out_unlock; + + mm = get_task_mm(task); + if (!mm) + goto out_unlock; + + down_read(&mm->mmap_sem); + vma = find_exact_vma(mm, vm_start, vm_end); + if (!vma) + goto out_no_vma; + + result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file); + +out_no_vma: + up_read(&mm->mmap_sem); + mmput(mm); +out_unlock: + unlock_trace(task); +out_put_task: + put_task_struct(task); +out: + return result; +} + +static const struct inode_operations proc_map_files_inode_operations = { + .lookup = proc_map_files_lookup, + .permission = proc_fd_permission, + .setattr = proc_setattr, +}; + +static int +proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct vm_area_struct *vma; + struct task_struct *task; + struct mm_struct *mm; + ino_t ino; + int ret; + + ret = -EACCES; + if (!capable(CAP_SYS_ADMIN)) + goto out; + + ret = -ENOENT; + task = get_proc_task(inode); + if (!task) + goto out; + + ret = -EACCES; + if (lock_trace(task)) + goto out_put_task; + + ret = 0; + switch (filp->f_pos) { + case 0: + ino = inode->i_ino; + if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0) + goto out_unlock; + filp->f_pos++; + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) + goto out_unlock; + filp->f_pos++; + default: + { + 
unsigned long nr_files, pos, i; + struct flex_array *fa = NULL; + struct map_files_info info; + struct map_files_info *p; + + mm = get_task_mm(task); + if (!mm) + goto out_unlock; + down_read(&mm->mmap_sem); + + nr_files = 0; + + /* + * We need two passes here: + * + * 1) Collect vmas of mapped files with mmap_sem taken + * 2) Release mmap_sem and instantiate entries + * + * otherwise we get lockdep complained, since filldir() + * routine might require mmap_sem taken in might_fault(). + */ + + for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { + if (vma->vm_file && ++pos > filp->f_pos) + nr_files++; + } + + if (nr_files) { + fa = flex_array_alloc(sizeof(info), nr_files, + GFP_KERNEL); + if (!fa || flex_array_prealloc(fa, 0, nr_files, + GFP_KERNEL)) { + ret = -ENOMEM; + if (fa) + flex_array_free(fa); + up_read(&mm->mmap_sem); + mmput(mm); + goto out_unlock; + } + for (i = 0, vma = mm->mmap, pos = 2; vma; + vma = vma->vm_next) { + if (!vma->vm_file) + continue; + if (++pos <= filp->f_pos) + continue; + + get_file(vma->vm_file); + info.file = vma->vm_file; + info.len = snprintf(info.name, + sizeof(info.name), "%lx-%lx", + vma->vm_start, vma->vm_end); + if (flex_array_put(fa, i++, &info, GFP_KERNEL)) + BUG(); + } + } + up_read(&mm->mmap_sem); + + for (i = 0; i < nr_files; i++) { + p = flex_array_get(fa, i); + ret = proc_fill_cache(filp, dirent, filldir, + p->name, p->len, + proc_map_files_instantiate, + task, p->file); + if (ret) + break; + filp->f_pos++; + fput(p->file); + } + for (; i < nr_files; i++) { + /* + * In case of error don't forget + * to put rest of file refs. 
+ */ + p = flex_array_get(fa, i); + fput(p->file); + } + if (fa) + flex_array_free(fa); + mmput(mm); + } + } + +out_unlock: + unlock_trace(task); +out_put_task: + put_task_struct(task); +out: + return ret; +} + +static const struct file_operations proc_map_files_operations = { + .read = generic_read_dir, + .readdir = proc_map_files_readdir, + .llseek = default_llseek, +}; + +#endif /* CONFIG_CHECKPOINT_RESTORE */ + /* * /proc/pid/fd needs a special permission handler so that a process can still * access /proc/self/fd after it has executed a setuid(). @@ -2661,6 +3013,9 @@ static const struct inode_operations proc_task_inode_operations; static const struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), +#ifdef CONFIG_CHECKPOINT_RESTORE + DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), +#endif DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET diff --git a/include/linux/mm.h b/include/linux/mm.h index 5568553..6eba2cc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1482,6 +1482,18 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma) return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } +/* Look up the first VMA which exactly match the interval vm_start ... 
vm_end */ +static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, + unsigned long vm_start, unsigned long vm_end) +{ + struct vm_area_struct *vma = find_vma(mm, vm_start); + + if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end)) + vma = NULL; + + return vma; +} + #ifdef CONFIG_MMU pgprot_t vm_get_page_prot(unsigned long vm_flags); #else -- cgit v0.10.2 From 97412950b10e64f347aec4a9b759395c2465adf6 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Tue, 10 Jan 2012 15:11:27 -0800 Subject: procfs: parse mount options Add support for procfs mount options. Actual mount options are coming in the next patches. Signed-off-by: Vasiliy Kulikov Cc: Alexey Dobriyan Cc: Al Viro Cc: Randy Dunlap Cc: "H. Peter Anvin" Cc: Greg KH Cc: Theodore Tso Cc: Alan Cox Cc: James Morris Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 51a1766..27c762f 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -17,7 +18,9 @@ #include #include #include +#include #include +#include #include #include @@ -101,12 +104,19 @@ void __init proc_init_inodecache(void) init_once); } +static int proc_show_options(struct seq_file *seq, struct dentry *root) +{ + return 0; +} + static const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .destroy_inode = proc_destroy_inode, .drop_inode = generic_delete_inode, .evict_inode = proc_evict_inode, .statfs = simple_statfs, + .remount_fs = proc_remount, + .show_options = proc_show_options, }; static void __pde_users_dec(struct proc_dir_entry *pde) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 7838e5c..2925775 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -117,6 +117,7 @@ void pde_put(struct proc_dir_entry *pde); int proc_fill_super(struct super_block *); struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); 
+int proc_remount(struct super_block *sb, int *flags, char *data); /* * These are generic /proc routines that use the internal diff --git a/fs/proc/root.c b/fs/proc/root.c index 03102d9..6a8ac1d 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "internal.h" @@ -36,6 +37,48 @@ static int proc_set_super(struct super_block *sb, void *data) return err; } +enum { + Opt_err, +}; + +static const match_table_t tokens = { + {Opt_err, NULL}, +}; + +static int proc_parse_options(char *options, struct pid_namespace *pid) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + + pr_debug("proc: options = %s\n", options); + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + args[0].to = args[0].from = 0; + token = match_token(p, tokens, args); + switch (token) { + default: + pr_err("proc: unrecognized mount option \"%s\" " + "or missing value\n", p); + return 0; + } + } + + return 1; +} + +int proc_remount(struct super_block *sb, int *flags, char *data) +{ + struct pid_namespace *pid = sb->s_fs_info; + return !proc_parse_options(data, pid); +} + static struct dentry *proc_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { @@ -43,11 +86,15 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, struct super_block *sb; struct pid_namespace *ns; struct proc_inode *ei; + char *options; - if (flags & MS_KERNMOUNT) + if (flags & MS_KERNMOUNT) { ns = (struct pid_namespace *)data; - else + options = NULL; + } else { ns = current->nsproxy->pid_ns; + options = data; + } sb = sget(fs_type, proc_test_super, proc_set_super, ns); if (IS_ERR(sb)) @@ -55,6 +102,10 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, if (!sb->s_root) { sb->s_flags = flags; + if (!proc_parse_options(options, ns)) { + deactivate_locked_super(sb); + return ERR_PTR(-EINVAL); + } err = proc_fill_super(sb); if (err) { 
deactivate_locked_super(sb); -- cgit v0.10.2 From 0499680a42141d86417a8fbaa8c8db806bea1201 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Tue, 10 Jan 2012 15:11:31 -0800 Subject: procfs: add hidepid= and gid= mount options Add support for mount options to restrict access to /proc/PID/ directories. The default backward-compatible "relaxed" behaviour is left untouched. The first mount option is called "hidepid" and its value defines how much info about processes we want to be available for non-owners: hidepid=0 (default) means the old behavior - anybody may read all world-readable /proc/PID/* files. hidepid=1 means users may not access any /proc/<pid>/ directories, but their own. Sensitive files like cmdline, sched*, status are now protected against other users. As permission checking is done in proc_pid_permission() and files' permissions are left untouched, programs expecting specific files' modes are not confused. hidepid=2 means hidepid=1 plus all /proc/PID/ will be invisible to other users. It doesn't mean that it hides whether a process exists (it can be learned by other means, e.g. by kill -0 $PID), but it hides process' euid and egid. It complicates an intruder's task of gathering info about running processes, whether some daemon runs with elevated privileges, whether another user runs some sensitive program, whether other users run any program at all, etc. gid=XXX defines a group that will be able to gather all processes' info (as in hidepid=0 mode). This group should be used instead of putting a nonroot user in the sudoers file or something. However, untrusted users (like daemons, etc.) which are not supposed to monitor the tasks in the whole system should not be added to the group. hidepid=1 or higher is designed to restrict access to procfs files, which might reveal some sensitive private information like precise keystroke timings: http://www.openwall.com/lists/oss-security/2011/11/05/3 hidepid=1/2 doesn't break monitoring userspace tools. 
ps, top, pgrep, and conky gracefully handle EPERM/ENOENT and behave as if the current user is the only user running processes. pstree shows the process subtree which contains the "pstree" process. Note: the patch doesn't deal with setuid/setgid issues of keeping preopened descriptors of procfs files (like https://lkml.org/lkml/2011/2/7/368). We rely on the fact that leaked information like the scheduling counters of setuid apps doesn't threaten anybody's privacy - only the user who started the setuid program may read the counters. Signed-off-by: Vasiliy Kulikov Cc: Alexey Dobriyan Cc: Al Viro Cc: Randy Dunlap Cc: "H. Peter Anvin" Cc: Greg KH Cc: Theodore Tso Cc: Alan Cox Cc: James Morris Cc: Oleg Nesterov Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 0ec91f0..12fee13 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -41,6 +41,8 @@ Table of Contents 3.5 /proc/<pid>/mountinfo - Information about mounts 3.6 /proc/<pid>/comm & /proc/<pid>/task/<tid>/comm + 4 Configuring procfs + 4.1 Mount options ------------------------------------------------------------------------------ Preface @@ -1542,3 +1544,40 @@ a task to set its own or one of its thread siblings comm value. The comm value is limited in size compared to the cmdline value, so writing anything longer then the kernel's TASK_COMM_LEN (currently 16 chars) will result in a truncated comm value. + + +------------------------------------------------------------------------------ +Configuring procfs +------------------------------------------------------------------------------ + +4.1 Mount options +--------------------- + +The following mount options are supported: + + hidepid= Set /proc/<pid>/ access mode. + gid= Set the group authorized to learn processes information. + +hidepid=0 means classic mode - everybody may access all /proc/<pid>/ directories +(default). 
+ +hidepid=1 means users may not access any /proc/<pid>/ directories but their +own. Sensitive files like cmdline, sched*, status are now protected against +other users. This makes it impossible to learn whether any user runs a +specific program (given the program doesn't reveal itself by its behaviour). +As an additional bonus, as /proc/<pid>/cmdline is inaccessible to other users, +poorly written programs passing sensitive information via program arguments are +now protected against local eavesdroppers. + +hidepid=2 means hidepid=1 plus all /proc/<pid>/ will be fully invisible to other +users. It doesn't mean that it hides the fact of whether a process with a specific +pid value exists (it can be learned by other means, e.g. by "kill -0 $PID"), +but it hides process' uid and gid, which may be learned by stat()'ing +/proc/<pid>/ otherwise. It greatly complicates an intruder's task of gathering +information about running processes, whether some daemon runs with elevated +privileges, whether another user runs some sensitive program, whether other users +run any program at all, etc. + +gid= defines a group authorized to learn processes information otherwise +prohibited by hidepid=. If you use some daemon like identd which needs to learn +information about processes, just add identd to this group. diff --git a/fs/proc/base.c b/fs/proc/base.c index 4d755fe..8173dfd 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -631,6 +631,50 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr) return 0; } +/* + * May current process learn task's sched/cmdline info (for hide_pid_min=1) + * or euid/egid (for hide_pid_min=2)? 
+ */ +static bool has_pid_permissions(struct pid_namespace *pid, + struct task_struct *task, + int hide_pid_min) +{ + if (pid->hide_pid < hide_pid_min) + return true; + if (in_group_p(pid->pid_gid)) + return true; + return ptrace_may_access(task, PTRACE_MODE_READ); +} + + +static int proc_pid_permission(struct inode *inode, int mask) +{ + struct pid_namespace *pid = inode->i_sb->s_fs_info; + struct task_struct *task; + bool has_perms; + + task = get_proc_task(inode); + has_perms = has_pid_permissions(pid, task, 1); + put_task_struct(task); + + if (!has_perms) { + if (pid->hide_pid == 2) { + /* + * Let's make getdents(), stat(), and open() + * consistent with each other. If a process + * may not stat() a file, it shouldn't be seen + * in procfs at all. + */ + return -ENOENT; + } + + return -EPERM; + } + return generic_permission(inode, mask); +} + + + static const struct inode_operations proc_def_inode_operations = { .setattr = proc_setattr, }; @@ -1615,6 +1659,7 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) struct inode *inode = dentry->d_inode; struct task_struct *task; const struct cred *cred; + struct pid_namespace *pid = dentry->d_sb->s_fs_info; generic_fillattr(inode, stat); @@ -1623,6 +1668,14 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) stat->gid = 0; task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { + if (!has_pid_permissions(pid, task, 2)) { + rcu_read_unlock(); + /* + * This doesn't prevent learning whether PID exists, + * it only makes getattr() consistent with readdir(). 
+ */ + return -ENOENT; + } if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || task_dumpable(task)) { cred = __task_cred(task); @@ -3119,6 +3172,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = { .lookup = proc_tgid_base_lookup, .getattr = pid_getattr, .setattr = proc_setattr, + .permission = proc_pid_permission, }; static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) @@ -3322,6 +3376,12 @@ static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldi proc_pid_instantiate, iter.task, NULL); } +static int fake_filldir(void *buf, const char *name, int namelen, + loff_t offset, u64 ino, unsigned d_type) +{ + return 0; +} + /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { @@ -3329,6 +3389,7 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) struct task_struct *reaper; struct tgid_iter iter; struct pid_namespace *ns; + filldir_t __filldir; if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) goto out_no_task; @@ -3350,8 +3411,13 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) for (iter = next_tgid(ns, iter); iter.task; iter.tgid += 1, iter = next_tgid(ns, iter)) { + if (has_pid_permissions(ns, iter.task, 2)) + __filldir = filldir; + else + __filldir = fake_filldir; + filp->f_pos = iter.tgid + TGID_OFFSET; - if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) { + if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) { put_task_struct(iter.task); goto out; } @@ -3686,6 +3752,7 @@ static const struct inode_operations proc_task_inode_operations = { .lookup = proc_task_lookup, .getattr = proc_task_getattr, .setattr = proc_setattr, + .permission = proc_pid_permission, }; static const struct file_operations proc_task_operations = { diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 27c762f..84fd323 100644 --- a/fs/proc/inode.c +++ 
b/fs/proc/inode.c @@ -106,6 +106,14 @@ void __init proc_init_inodecache(void) static int proc_show_options(struct seq_file *seq, struct dentry *root) { + struct super_block *sb = root->d_sb; + struct pid_namespace *pid = sb->s_fs_info; + + if (pid->pid_gid) + seq_printf(seq, ",gid=%lu", (unsigned long)pid->pid_gid); + if (pid->hide_pid != 0) + seq_printf(seq, ",hidepid=%u", pid->hide_pid); + return 0; } diff --git a/fs/proc/root.c b/fs/proc/root.c index 6a8ac1d..46a15d8 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -38,10 +38,12 @@ static int proc_set_super(struct super_block *sb, void *data) } enum { - Opt_err, + Opt_gid, Opt_hidepid, Opt_err, }; static const match_table_t tokens = { + {Opt_hidepid, "hidepid=%u"}, + {Opt_gid, "gid=%u"}, {Opt_err, NULL}, }; @@ -49,8 +51,7 @@ static int proc_parse_options(char *options, struct pid_namespace *pid) { char *p; substring_t args[MAX_OPT_ARGS]; - - pr_debug("proc: options = %s\n", options); + int option; if (!options) return 1; @@ -63,6 +64,20 @@ static int proc_parse_options(char *options, struct pid_namespace *pid) args[0].to = args[0].from = 0; token = match_token(p, tokens, args); switch (token) { + case Opt_gid: + if (match_int(&args[0], &option)) + return 0; + pid->pid_gid = option; + break; + case Opt_hidepid: + if (match_int(&args[0], &option)) + return 0; + if (option < 0 || option > 2) { + pr_err("proc: hidepid value must be between 0 and 2.\n"); + return 0; + } + pid->hide_pid = option; + break; default: pr_err("proc: unrecognized mount option \"%s\" " "or missing value\n", p); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 38d1032..e7cf666 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -30,6 +30,8 @@ struct pid_namespace { #ifdef CONFIG_BSD_PROCESS_ACCT struct bsd_acct_struct *bacct; #endif + gid_t pid_gid; + int hide_pid; }; extern struct pid_namespace init_pid_ns; -- cgit v0.10.2 From b196be89cdc14a88cc637cdad845a75c5886c82d Mon Sep 
17 00:00:00 2001 From: Tejun Heo Date: Tue, 10 Jan 2012 15:11:35 -0800 Subject: workqueue: make alloc_workqueue() take printf fmt and args for name alloc_workqueue() currently expects the passed in @name pointer to remain accessible. This is inconvenient and a bit silly given that the whole wq is being dynamically allocated. This patch updates alloc_workqueue() and friends to take printf format string instead of opaque string and matching varargs at the end. The name is allocated together with the wq and formatted. alloc_ordered_workqueue() is converted to a macro to unify varargs handling with alloc_workqueue(), and, while at it, add comment to alloc_workqueue(). None of the current in-kernel users pass in string with '%' as constant name and this change shouldn't cause any problem. [akpm@linux-foundation.org: use __printf] Signed-off-by: Tejun Heo Suggested-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 0d556de..eb8b9f1 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -297,32 +297,50 @@ extern struct workqueue_struct *system_unbound_wq; extern struct workqueue_struct *system_freezable_wq; extern struct workqueue_struct * -__alloc_workqueue_key(const char *name, unsigned int flags, int max_active, - struct lock_class_key *key, const char *lock_name); +__alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, + struct lock_class_key *key, const char *lock_name, ...) __printf(1, 6); +/** + * alloc_workqueue - allocate a workqueue + * @fmt: printf format for the name of the workqueue + * @flags: WQ_* flags + * @max_active: max in-flight work items, 0 for default + * @args: args for @fmt + * + * Allocate a workqueue with the specified parameters. For detailed + * information on WQ_* flags, please refer to Documentation/workqueue.txt. 
+ * + * The __lock_name macro dance is to guarantee that single lock_class_key + * doesn't end up with different namesm, which isn't allowed by lockdep. + * + * RETURNS: + * Pointer to the allocated workqueue on success, %NULL on failure. + */ #ifdef CONFIG_LOCKDEP -#define alloc_workqueue(name, flags, max_active) \ +#define alloc_workqueue(fmt, flags, max_active, args...) \ ({ \ static struct lock_class_key __key; \ const char *__lock_name; \ \ - if (__builtin_constant_p(name)) \ - __lock_name = (name); \ + if (__builtin_constant_p(fmt)) \ + __lock_name = (fmt); \ else \ - __lock_name = #name; \ + __lock_name = #fmt; \ \ - __alloc_workqueue_key((name), (flags), (max_active), \ - &__key, __lock_name); \ + __alloc_workqueue_key((fmt), (flags), (max_active), \ + &__key, __lock_name, ##args); \ }) #else -#define alloc_workqueue(name, flags, max_active) \ - __alloc_workqueue_key((name), (flags), (max_active), NULL, NULL) +#define alloc_workqueue(fmt, flags, max_active, args...) \ + __alloc_workqueue_key((fmt), (flags), (max_active), \ + NULL, NULL, ##args) #endif /** * alloc_ordered_workqueue - allocate an ordered workqueue - * @name: name of the workqueue + * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful) + * @args: args for @fmt * * Allocate an ordered workqueue. An ordered workqueue executes at * most one work item at any given time in the queued order. They are @@ -331,11 +349,8 @@ __alloc_workqueue_key(const char *name, unsigned int flags, int max_active, * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ -static inline struct workqueue_struct * -alloc_ordered_workqueue(const char *name, unsigned int flags) -{ - return alloc_workqueue(name, WQ_UNBOUND | flags, 1); -} +#define alloc_ordered_workqueue(fmt, flags, args...) 
\ + alloc_workqueue(fmt, WQ_UNBOUND | (flags), 1, ##args) #define create_workqueue(name) \ alloc_workqueue((name), WQ_MEM_RECLAIM, 1) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 42fa9ad..bec7b5b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -242,10 +242,10 @@ struct workqueue_struct { int nr_drainers; /* W: drain in progress */ int saved_max_active; /* W: saved cwq max_active */ - const char *name; /* I: workqueue name */ #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif + char name[]; /* I: workqueue name */ }; struct workqueue_struct *system_wq __read_mostly; @@ -2954,14 +2954,29 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, return clamp_val(max_active, 1, lim); } -struct workqueue_struct *__alloc_workqueue_key(const char *name, +struct workqueue_struct *__alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, struct lock_class_key *key, - const char *lock_name) + const char *lock_name, ...) { + va_list args, args1; struct workqueue_struct *wq; unsigned int cpu; + size_t namelen; + + /* determine namelen, allocate wq and format name */ + va_start(args, lock_name); + va_copy(args1, args); + namelen = vsnprintf(NULL, 0, fmt, args) + 1; + + wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL); + if (!wq) + goto err; + + vsnprintf(wq->name, namelen, fmt, args1); + va_end(args); + va_end(args1); /* * Workqueues which may be used during memory reclaim should @@ -2978,12 +2993,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, flags |= WQ_HIGHPRI; max_active = max_active ?: WQ_DFL_ACTIVE; - max_active = wq_clamp_max_active(max_active, flags, name); - - wq = kzalloc(sizeof(*wq), GFP_KERNEL); - if (!wq) - goto err; + max_active = wq_clamp_max_active(max_active, flags, wq->name); + /* init wq */ wq->flags = flags; wq->saved_max_active = max_active; mutex_init(&wq->flush_mutex); @@ -2991,7 +3003,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, 
INIT_LIST_HEAD(&wq->flusher_queue); INIT_LIST_HEAD(&wq->flusher_overflow); - wq->name = name; lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); INIT_LIST_HEAD(&wq->list); @@ -3020,7 +3031,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, if (!rescuer) goto err; - rescuer->task = kthread_create(rescuer_thread, wq, "%s", name); + rescuer->task = kthread_create(rescuer_thread, wq, "%s", + wq->name); if (IS_ERR(rescuer->task)) goto err; -- cgit v0.10.2 From 6b550f9495947fc279d12c38feaf98500e8d0646 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Tue, 10 Jan 2012 15:11:37 -0800 Subject: user namespace: make signal.c respect user namespaces ipc/mqueue.c: for __SI_MESQ, convert the uid being sent to recipient's user namespace. (new, thanks Oleg) __send_signal: convert current's uid to the recipient's user namespace for any siginfo which is not SI_FROMKERNEL (patch from Oleg, thanks again :) do_notify_parent and do_notify_parent_cldstop: map task's uid to parent's user namespace ptrace_signal maps parent's uid into current's user namespace before including in signal to current. IIUC Oleg has argued that this shouldn't matter as the debugger will play with it, but it seems like not converting the value currently being set is misleading. Changelog: Sep 20: Inspired by Oleg's suggestion, define map_cred_ns() helper to simplify callers and help make clear what we are translating (which uid into which namespace). Passing the target task would make callers even easier to read, but we pass in user_ns because current_user_ns() != task_cred_xxx(current, user_ns). Sep 20: As recommended by Oleg, also put task_pid_vnr() under rcu_read_lock in ptrace_signal(). Sep 23: In send_signal(), detect when (user) signal is coming from an ancestor or unrelated user namespace. Pass that on to __send_signal, which sets si_uid to 0 or overflowuid if needed. Oct 12: Base on Oleg's fixup_uid() patch. 
On top of that, handle all SI_FROMKERNEL cases at callers, because we can't assume sender is current in those cases. Nov 10: (mhelsley) rename fixup_uid to more meaningful userns_fixup_signal_uid Nov 10: (akpm) make the !CONFIG_USER_NS case clearer Signed-off-by: Serge Hallyn Cc: Oleg Nesterov Cc: Matt Helsley Cc: "Eric W. Biederman" From: Serge Hallyn Subject: __send_signal: pass q->info, not info, to userns_fixup_signal_uid (v2) Eric Biederman pointed out that passing info is a bug and could lead to a NULL pointer deref to boot. A collection of signal, securebits, filecaps, cap_bounds, and a few other ltp tests passed with this kernel. Changelog: Nov 18: previous patch missed a leading '&' Signed-off-by: Serge Hallyn Cc: "Eric W. Biederman" From: Dan Carpenter Subject: ipc/mqueue: lock() => unlock() typo There was a double lock typo introduced in b085f4bd6b21 "user namespace: make signal.c respect user namespaces" Signed-off-by: Dan Carpenter Cc: Oleg Nesterov Cc: Matt Helsley Cc: "Eric W.
Biederman" Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 9a142a2..9b7c8ab 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -542,9 +543,13 @@ static void __do_notify(struct mqueue_inode_info *info) sig_i.si_errno = 0; sig_i.si_code = SI_MESGQ; sig_i.si_value = info->notify.sigev_value; + /* map current pid/uid into info->owner's namespaces */ + rcu_read_lock(); sig_i.si_pid = task_tgid_nr_ns(current, ns_of_pid(info->notify_owner)); - sig_i.si_uid = current_uid(); + sig_i.si_uid = user_ns_map_uid(info->user->user_ns, + current_cred(), current_uid()); + rcu_read_unlock(); kill_pid_info(info->notify.sigev_signo, &sig_i, info->notify_owner); diff --git a/kernel/signal.c b/kernel/signal.c index d532f17..c73c428 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -28,6 +28,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -1019,6 +1020,34 @@ static inline int legacy_queue(struct sigpending *signals, int sig) return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); } +/* + * map the uid in struct cred into user namespace *ns + */ +static inline uid_t map_cred_ns(const struct cred *cred, + struct user_namespace *ns) +{ + return user_ns_map_uid(ns, cred, cred->uid); +} + +#ifdef CONFIG_USER_NS +static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) +{ + if (current_user_ns() == task_cred_xxx(t, user_ns)) + return; + + if (SI_FROMKERNEL(info)) + return; + + info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns), + current_cred(), info->si_uid); +} +#else +static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) +{ + return; +} +#endif + static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, int group, int from_ancestor_ns) { @@ -1088,6 +1117,9 @@ static int __send_signal(int sig, struct siginfo 
*info, struct task_struct *t, q->info.si_pid = 0; break; } + + userns_fixup_signal_uid(&q->info, t); + } else if (!is_si_special(info)) { if (sig >= SIGRTMIN && info->si_code != SI_USER) { /* @@ -1626,7 +1658,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig) */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); - info.si_uid = __task_cred(tsk)->uid; + info.si_uid = map_cred_ns(__task_cred(tsk), + task_cred_xxx(tsk->parent, user_ns)); rcu_read_unlock(); info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); @@ -1709,7 +1742,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); - info.si_uid = __task_cred(tsk)->uid; + info.si_uid = map_cred_ns(__task_cred(tsk), + task_cred_xxx(parent, user_ns)); rcu_read_unlock(); info.si_utime = cputime_to_clock_t(tsk->utime); @@ -2125,8 +2159,11 @@ static int ptrace_signal(int signr, siginfo_t *info, info->si_signo = signr; info->si_errno = 0; info->si_code = SI_USER; + rcu_read_lock(); info->si_pid = task_pid_vnr(current->parent); - info->si_uid = task_uid(current->parent); + info->si_uid = map_cred_ns(__task_cred(current->parent), + current_user_ns()); + rcu_read_unlock(); } /* If the (new) signal is now blocked, requeue it. */ -- cgit v0.10.2