From 290e0e0f2b54b2eed5018f921c585bb694f9e68a Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Wed, 20 Jan 2016 14:58:06 -0800 Subject: lib/libcrc32c.c: fix build warning Fix the following build warning: lib/libcrc32c.c:42:5: warning: no previous prototype for "crc32c" [-Wmissing-prototypes] u32 crc32c(u32 crc, const void *address, unsigned int length) ^ Signed-off-by: Jean Delvare Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c index 6a08ce7..31ce853 100644 --- a/lib/libcrc32c.c +++ b/lib/libcrc32c.c @@ -36,6 +36,7 @@ #include #include #include +#include static struct crypto_shash *tfm; -- cgit v0.10.2 From 0b9b6fff7b4caf5838550151d15b389aaa217707 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 20 Jan 2016 14:58:09 -0800 Subject: thp: fix interrupt unsafe locking in split_huge_page() split_queue_lock can be taken from interrupt context in some cases, but I forgot to convert locking in split_huge_page() to interrupt-safe primitives. Let's fix this. lockdep output: ====================================================== [ INFO: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected ] 4.4.0+ #259 Tainted: G W ------------------------------------------------------ syz-executor/18183 [HC0[0]:SC0[2]:HE0:SE0] is trying to acquire: (split_queue_lock){+.+...}, at: free_transhuge_page+0x24/0x90 mm/huge_memory.c:3436 and this task is already holding: (slock-AF_INET){+.-...}, at: spin_lock_bh include/linux/spinlock.h:307 (slock-AF_INET){+.-...}, at: lock_sock_fast+0x45/0x120 net/core/sock.c:2462 which would create a new lock dependency: (slock-AF_INET){+.-...} -> (split_queue_lock){+.+...} but this new dependency connects a SOFTIRQ-irq-safe lock: (slock-AF_INET){+.-...} ... 
which became SOFTIRQ-irq-safe at: mark_irqflags kernel/locking/lockdep.c:2799 __lock_acquire+0xfd8/0x4700 kernel/locking/lockdep.c:3162 lock_acquire+0x1dc/0x430 kernel/locking/lockdep.c:3585 __raw_spin_lock include/linux/spinlock_api_smp.h:144 _raw_spin_lock+0x33/0x50 kernel/locking/spinlock.c:151 spin_lock include/linux/spinlock.h:302 udp_queue_rcv_skb+0x781/0x1550 net/ipv4/udp.c:1680 flush_stack+0x50/0x330 net/ipv6/udp.c:799 __udp4_lib_mcast_deliver+0x694/0x7f0 net/ipv4/udp.c:1798 __udp4_lib_rcv+0x17dc/0x23e0 net/ipv4/udp.c:1888 udp_rcv+0x21/0x30 net/ipv4/udp.c:2108 ip_local_deliver_finish+0x2b3/0xa50 net/ipv4/ip_input.c:216 NF_HOOK_THRESH include/linux/netfilter.h:226 NF_HOOK include/linux/netfilter.h:249 ip_local_deliver+0x1c4/0x2f0 net/ipv4/ip_input.c:257 dst_input include/net/dst.h:498 ip_rcv_finish+0x5ec/0x1730 net/ipv4/ip_input.c:365 NF_HOOK_THRESH include/linux/netfilter.h:226 NF_HOOK include/linux/netfilter.h:249 ip_rcv+0x963/0x1080 net/ipv4/ip_input.c:455 __netif_receive_skb_core+0x1620/0x2f80 net/core/dev.c:4154 __netif_receive_skb+0x2a/0x160 net/core/dev.c:4189 netif_receive_skb_internal+0x1b5/0x390 net/core/dev.c:4217 napi_skb_finish net/core/dev.c:4542 napi_gro_receive+0x2bd/0x3c0 net/core/dev.c:4572 e1000_clean_rx_irq+0x4e2/0x1100 drivers/net/ethernet/intel/e1000e/netdev.c:1038 e1000_clean+0xa08/0x24a0 drivers/net/ethernet/intel/e1000/e1000_main.c:3819 napi_poll net/core/dev.c:5074 net_rx_action+0x7eb/0xdf0 net/core/dev.c:5139 __do_softirq+0x26a/0x920 kernel/softirq.c:273 invoke_softirq kernel/softirq.c:350 irq_exit+0x18f/0x1d0 kernel/softirq.c:391 exiting_irq ./arch/x86/include/asm/apic.h:659 do_IRQ+0x86/0x1a0 arch/x86/kernel/irq.c:252 ret_from_intr+0x0/0x20 arch/x86/entry/entry_64.S:520 arch_safe_halt ./arch/x86/include/asm/paravirt.h:117 default_idle+0x52/0x2e0 arch/x86/kernel/process.c:304 arch_cpu_idle+0xa/0x10 arch/x86/kernel/process.c:295 default_idle_call+0x48/0xa0 kernel/sched/idle.c:92 cpuidle_idle_call kernel/sched/idle.c:156 
cpu_idle_loop kernel/sched/idle.c:252 cpu_startup_entry+0x554/0x710 kernel/sched/idle.c:300 rest_init+0x192/0x1a0 init/main.c:412 start_kernel+0x678/0x69e init/main.c:683 x86_64_start_reservations+0x2a/0x2c arch/x86/kernel/head64.c:195 x86_64_start_kernel+0x158/0x167 arch/x86/kernel/head64.c:184 to a SOFTIRQ-irq-unsafe lock: (split_queue_lock){+.+...} which became SOFTIRQ-irq-unsafe at: mark_irqflags kernel/locking/lockdep.c:2817 __lock_acquire+0x146e/0x4700 kernel/locking/lockdep.c:3162 lock_acquire+0x1dc/0x430 kernel/locking/lockdep.c:3585 __raw_spin_lock include/linux/spinlock_api_smp.h:144 _raw_spin_lock+0x33/0x50 kernel/locking/spinlock.c:151 spin_lock include/linux/spinlock.h:302 split_huge_page_to_list+0xcc0/0x1c50 mm/huge_memory.c:3399 split_huge_page include/linux/huge_mm.h:99 queue_pages_pte_range+0xa38/0xef0 mm/mempolicy.c:507 walk_pmd_range mm/pagewalk.c:50 walk_pud_range mm/pagewalk.c:90 walk_pgd_range mm/pagewalk.c:116 __walk_page_range+0x653/0xcd0 mm/pagewalk.c:204 walk_page_range+0xfe/0x2b0 mm/pagewalk.c:281 queue_pages_range+0xfb/0x130 mm/mempolicy.c:687 migrate_to_node mm/mempolicy.c:1004 do_migrate_pages+0x370/0x4e0 mm/mempolicy.c:1109 SYSC_migrate_pages mm/mempolicy.c:1453 SyS_migrate_pages+0x640/0x730 mm/mempolicy.c:1374 entry_SYSCALL_64_fastpath+0x16/0x7a arch/x86/entry/entry_64.S:185 other info that might help us debug this: Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(split_queue_lock); local_irq_disable(); lock(slock-AF_INET); lock(split_queue_lock); lock(slock-AF_INET); Signed-off-by: Kirill A. 
Shutemov Reported-by: Dmitry Vyukov Acked-by: David Rientjes Reviewed-by: Aneesh Kumar K.V Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b1cf73b..8ad5802 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3357,6 +3357,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) struct anon_vma *anon_vma; int count, mapcount, ret; bool mlocked; + unsigned long flags; VM_BUG_ON_PAGE(is_huge_zero_page(page), page); VM_BUG_ON_PAGE(!PageAnon(page), page); @@ -3396,7 +3397,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) lru_add_drain(); /* Prevent deferred_split_scan() touching ->_count */ - spin_lock(&split_queue_lock); + spin_lock_irqsave(&split_queue_lock, flags); count = page_count(head); mapcount = total_mapcount(head); if (!mapcount && count == 1) { @@ -3404,11 +3405,11 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) split_queue_len--; list_del(page_deferred_list(head)); } - spin_unlock(&split_queue_lock); + spin_unlock_irqrestore(&split_queue_lock, flags); __split_huge_page(page, list); ret = 0; } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { - spin_unlock(&split_queue_lock); + spin_unlock_irqrestore(&split_queue_lock, flags); pr_alert("total_mapcount: %u, page_count(): %u\n", mapcount, count); if (PageTail(page)) @@ -3416,7 +3417,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) dump_page(page, "total_mapcount(head) > 0"); BUG(); } else { - spin_unlock(&split_queue_lock); + spin_unlock_irqrestore(&split_queue_lock, flags); unfreeze_page(anon_vma, head); ret = -EBUSY; } -- cgit v0.10.2 From f4be6153cca6c88eaf1e52931d9a010ad4ad940e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 20 Jan 2016 14:58:12 -0800 Subject: fs/proc/task_mmu.c: add workaround for old compilers For THP=n, HPAGE_PMD_NR in smaps_account() expands to BUILD_BUG(). 
That's fine since this codepath is eliminated by modern compilers. But older compilers do not have such efficient dead code elimination. This causes problems, at least with gcc 4.1.2 on m68k: fs/built-in.o: In function `smaps_account': task_mmu.c:(.text+0x4f8fa): undefined reference to `__compiletime_assert_471' Let's replace HPAGE_PMD_NR with 1 << compound_order(page). Signed-off-by: Kirill A. Shutemov Reported-by: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 65a1b6c..71ffc91 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -468,7 +468,7 @@ struct mem_size_stats { static void smaps_account(struct mem_size_stats *mss, struct page *page, bool compound, bool young, bool dirty) { - int i, nr = compound ? HPAGE_PMD_NR : 1; + int i, nr = compound ? 1 << compound_order(page) : 1; unsigned long size = nr * PAGE_SIZE; if (PageAnon(page)) -- cgit v0.10.2 From dcd6c87cc59af1b4fe7664b35c6344bbe1c9928f Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 20 Jan 2016 14:58:15 -0800 Subject: mm: arch: remove duplicate definitions of MADV_FREE Commits 21f55b018ba5 ("arch/*/include/uapi/asm/mman.h: : let MADV_FREE have same value for all architectures") and ef58978f1eaa ("mm: define MADV_FREE for some arches") both defined MADV_FREE, but did not use the same values. This results in build errors such as ./arch/alpha/include/uapi/asm/mman.h:53:0: error: "MADV_FREE" redefined ./arch/alpha/include/uapi/asm/mman.h:50:0: note: this is the location of the previous definition for the affected architectures. 
Fixes: 21f55b018ba5 ("arch/*/include/uapi/asm/mman.h: : let MADV_FREE have same value for all architectures") Fixes: ef58978f1eaa ("mm: define MADV_FREE for some arches") Signed-off-by: Guenter Roeck Cc: Chen Gang Cc: Minchan Kim Acked-by: Helge Deller [parisc] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index ab336c0..fec1947 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -47,7 +47,6 @@ #define MADV_WILLNEED 3 /* will need these pages */ #define MADV_SPACEAVAIL 5 /* ensure resources are available */ #define MADV_DONTNEED 6 /* don't need these pages */ -#define MADV_FREE 7 /* free pages only if memory pressure */ /* common/generic parameters */ #define MADV_FREE 8 /* free pages only if memory pressure */ diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index b0ebe59..ccdcfcb 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -73,7 +73,6 @@ #define MADV_SEQUENTIAL 2 /* expect sequential page references */ #define MADV_WILLNEED 3 /* will need these pages */ #define MADV_DONTNEED 4 /* don't need these pages */ -#define MADV_FREE 5 /* free pages only if memory pressure */ /* common parameters: try to keep these consistent across architectures */ #define MADV_FREE 8 /* free pages only if memory pressure */ diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index cf830d4..f3db7d8 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -43,7 +43,6 @@ #define MADV_SPACEAVAIL 5 /* insure that resources are reserved */ #define MADV_VPS_PURGE 6 /* Purge pages from VM page cache */ #define MADV_VPS_INHERIT 7 /* Inherit parents page size */ -#define MADV_FREE 8 /* free pages only if memory pressure */ /* common/generic parameters */ #define MADV_FREE 8 /* free pages only if memory pressure 
*/ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index d030594..9e079d4 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -86,7 +86,6 @@ #define MADV_SEQUENTIAL 2 /* expect sequential page references */ #define MADV_WILLNEED 3 /* will need these pages */ #define MADV_DONTNEED 4 /* don't need these pages */ -#define MADV_FREE 5 /* free pages only if memory pressure */ /* common parameters: try to keep these consistent across architectures */ #define MADV_FREE 8 /* free pages only if memory pressure */ -- cgit v0.10.2 From c102f07ca0b04f2cb49cfc161c83f6239d17f491 Mon Sep 17 00:00:00 2001 From: Junil Lee Date: Wed, 20 Jan 2016 14:58:18 -0800 Subject: zsmalloc: fix migrate_zspage-zs_free race condition record_obj() in migrate_zspage() does not preserve handle's HANDLE_PIN_BIT, set by find_alloced_obj()->trypin_tag(), and implicitly (accidentally) un-pins the handle, while migrate_zspage() still performs an explicit unpin_tag() on that handle. This additional explicit unpin_tag() introduces a race condition with zs_free(), which can pin that handle by this time, so the handle becomes un-pinned. 
Schematically, it goes like this: CPU0 CPU1 migrate_zspage find_alloced_obj trypin_tag set HANDLE_PIN_BIT zs_free() pin_tag() obj_malloc() -- new object, no tag record_obj() -- remove HANDLE_PIN_BIT set HANDLE_PIN_BIT unpin_tag() -- remove zs_free's HANDLE_PIN_BIT The race condition may result in a NULL pointer dereference: Unable to handle kernel NULL pointer dereference at virtual address 00000000 CPU: 0 PID: 19001 Comm: CookieMonsterCl Tainted: PC is at get_zspage_mapping+0x0/0x24 LR is at obj_free.isra.22+0x64/0x128 Call trace: get_zspage_mapping+0x0/0x24 zs_free+0x88/0x114 zram_free_page+0x64/0xcc zram_slot_free_notify+0x90/0x108 swap_entry_free+0x278/0x294 free_swap_and_cache+0x38/0x11c unmap_single_vma+0x480/0x5c8 unmap_vmas+0x44/0x60 exit_mmap+0x50/0x110 mmput+0x58/0xe0 do_exit+0x320/0x8dc do_group_exit+0x44/0xa8 get_signal+0x538/0x580 do_signal+0x98/0x4b8 do_notify_resume+0x14/0x5c This patch keeps the lock bit in migration path and update value atomically. Signed-off-by: Junil Lee Signed-off-by: Minchan Kim Acked-by: Vlastimil Babka Cc: Sergey Senozhatsky Cc: [4.1+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index e7414ce..2d7c4c1 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -309,7 +309,12 @@ static void free_handle(struct zs_pool *pool, unsigned long handle) static void record_obj(unsigned long handle, unsigned long obj) { - *(unsigned long *)handle = obj; + /* + * lsb of @obj represents handle lock while other bits + * represent object value the handle is pointing so + * updating shouldn't do store tearing. 
+ */ + WRITE_ONCE(*(unsigned long *)handle, obj); } /* zpool driver */ @@ -1635,6 +1640,13 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, free_obj = obj_malloc(d_page, class, handle); zs_object_copy(free_obj, used_obj, class); index++; + /* + * record_obj updates handle's value to free_obj and it will + * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which + * breaks synchronization using pin_tag(e,g, zs_free) so + * let's keep the lock bit. + */ + free_obj |= BIT(HANDLE_PIN_BIT); record_obj(handle, free_obj); unpin_tag(handle); obj_free(pool, class, used_obj); -- cgit v0.10.2 From 75339d825ae5b1cd5bd6d4b1db3b4b7c12d7b3e9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 20 Jan 2016 14:58:21 -0800 Subject: misc: ibmasm: fix build errors Fix build when CONFIG_SERIAL_8250=m and CONFIG_IBM_ASM=y. Fixes these build errors: drivers/built-in.o: In function `ibmasm_remove_one': module.c:(.text+0xf6874): undefined reference to `ibmasm_unregister_uart' drivers/built-in.o: In function `ibmasm_init_one': module.c:(.text+0xf6c37): undefined reference to `ibmasm_register_uart' Signed-off-by: Randy Dunlap Cc: Max Asbock Cc: Vernon Mauery Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 22892c7..054fc10 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -95,6 +95,7 @@ config DUMMY_IRQ config IBM_ASM tristate "Device driver for IBM RSA service processor" depends on X86 && PCI && INPUT + depends on SERIAL_8250 || SERIAL_8250=n ---help--- This option enables device driver support for in-band access to the IBM RSA (Condor) service processor in eServer xSeries systems. 
-- cgit v0.10.2 From be17bddc6907f65b0a9f0ff77200a6d403ac41d1 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 20 Jan 2016 14:58:24 -0800 Subject: scripts/get_maintainer.pl: handle file names beginning with ./ The problem is that get_maintainer.pl doesn't work if you have a ./ prefix on the filename. For example, if you type: ./scripts/get_maintainer.pl -f ./drivers/usb/usb-skeleton.c then the current code only includes LKML and people from the git log, it doesn't include Greg or the linux-usb list. Reported-by: Dan Carpenter Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index cab641a..1873421 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -16,7 +16,9 @@ my $P = $0; my $V = '0.26'; use Getopt::Long qw(:config no_auto_abbrev); +use Cwd; +my $cur_path = fastgetcwd() . '/'; my $lk_path = "./"; my $email = 1; my $email_usename = 1; @@ -429,6 +431,8 @@ foreach my $file (@ARGV) { } } if ($from_filename) { + $file =~ s/^\Q${cur_path}\E//; #strip any absolute path + $file =~ s/^\Q${lk_path}\E//; #or the path to the lk tree push(@files, $file); if ($file ne "MAINTAINERS" && -f $file && ($keywords || $file_emails)) { open(my $f, '<', $file) -- cgit v0.10.2 From a4cc3c3c7356ded3eba5905f94279382a05d9c96 Mon Sep 17 00:00:00 2001 From: Martin Kepplinger Date: Wed, 20 Jan 2016 14:58:26 -0800 Subject: ./CREDITS: add credit information for Martin Kepplinger Signed-off-by: Martin Kepplinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/CREDITS b/CREDITS index 25133c5..a3887b5 100644 --- a/CREDITS +++ b/CREDITS @@ -1856,6 +1856,16 @@ S: Korte Heul 95 S: 1403 ND BUSSUM S: The Netherlands +N: Martin Kepplinger +E: martink@posteo.de +E: martin.kepplinger@theobroma-systems.com +W: http://www.martinkepplinger.com +D: mma8452 accelerators iio driver +D: Kernel cleanups +S: Garnisonstraße 26 +S: 4020 Linz +S: Austria + N: Karl Keyte E: 
karl@koft.com D: Disk usage statistics and modifications to line printer driver -- cgit v0.10.2 From 564b026fbd0d28e9f70fb3831293d2922bb7855b Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Wed, 20 Jan 2016 14:58:29 -0800 Subject: string_helpers: fix precision loss for some inputs It was noticed that we lose precision in the final calculation for some inputs. The most egregious example is size=3000 blk_size=1900 in units of 10 should yield 5.70 MB but in fact yields 3.00 MB (oops). This is because the current algorithm doesn't correctly account for all the remainders in the logarithms. Fix this by doing a correct calculation in the remainders based on napier's algorithm. Additionally, now we have the correct result, we have to account for arithmetic rounding because we're printing 3 digits of precision. This means that if the fourth digit is five or greater, we have to round up, so add a section to ensure correct rounding. Finally account for all possible inputs correctly, including zero for block size. Fixes: b9f28d863594c429e1df35a0474d2663ca28b307 Signed-off-by: James Bottomley Reported-by: Vitaly Kuznetsov Cc: [delay until after 4.4 release] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/string_helpers.c b/lib/string_helpers.c index 5939f63..5c88204 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -43,50 +43,73 @@ void string_get_size(u64 size, u64 blk_size, const enum string_size_units units, [STRING_UNITS_10] = 1000, [STRING_UNITS_2] = 1024, }; - int i, j; - u32 remainder = 0, sf_cap, exp; + static const unsigned int rounding[] = { 500, 50, 5 }; + int i = 0, j; + u32 remainder = 0, sf_cap; char tmp[8]; const char *unit; tmp[0] = '\0'; - i = 0; - if (!size) + + if (blk_size == 0) + size = 0; + if (size == 0) goto out; - while (blk_size >= divisor[units]) { - remainder = do_div(blk_size, divisor[units]); + /* This is Napier's algorithm. 
Reduce the original block size to + * + * coefficient * divisor[units]^i + * + * we do the reduction so both coefficients are just under 32 bits so + * that multiplying them together won't overflow 64 bits and we keep + * as much precision as possible in the numbers. + * + * Note: it's safe to throw away the remainders here because all the + * precision is in the coefficients. + */ + while (blk_size >> 32) { + do_div(blk_size, divisor[units]); i++; } - exp = divisor[units] / (u32)blk_size; - /* - * size must be strictly greater than exp here to ensure that remainder - * is greater than divisor[units] coming out of the if below. - */ - if (size > exp) { - remainder = do_div(size, divisor[units]); - remainder *= blk_size; + while (size >> 32) { + do_div(size, divisor[units]); i++; - } else { - remainder *= size; } + /* now perform the actual multiplication keeping i as the sum of the + * two logarithms */ size *= blk_size; - size += remainder / divisor[units]; - remainder %= divisor[units]; + /* and logarithmically reduce it until it's just under the divisor */ while (size >= divisor[units]) { remainder = do_div(size, divisor[units]); i++; } + /* work out in j how many digits of precision we need from the + * remainder */ sf_cap = size; for (j = 0; sf_cap*10 < 1000; j++) sf_cap *= 10; - if (j) { + if (units == STRING_UNITS_2) { + /* express the remainder as a decimal. 
It's currently the + * numerator of a fraction whose denominator is + * divisor[units], which is 1 << 10 for STRING_UNITS_2 */ remainder *= 1000; - remainder /= divisor[units]; + remainder >>= 10; + } + + /* add a 5 to the digit below what will be printed to ensure + * an arithmetical round up and carry it through to size */ + remainder += rounding[j]; + if (remainder >= 1000) { + remainder -= 1000; + size += 1; + } + + if (j) { snprintf(tmp, sizeof(tmp), ".%03u", remainder); tmp[j+1] = '\0'; } -- cgit v0.10.2 From ef5c16b85b2a6d55ae12a8d7125f3908c8d271d4 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 20 Jan 2016 14:58:32 -0800 Subject: arch/frv/include/asm/io.h: accept const void pointers for read{b,w,l}() The SMD driver is reading and writing chunks of data to iomem, and there's an __iowrite32_copy() function for the writing part, but no __ioread32_copy() function for the reading part. This series adds __ioread32_copy() and uses it in two places. This patch (of 4): The frv port uses compiler builtins, __builtin_read*(), for the I/O read routines. Unfortunately, these don't accept const void pointers although the generic ASM implementations do, so generic code passing const pointers to these APIs cause compilers to emit warnings. Add wrapper functions that cast away the const to avoid the warnings. 
Signed-off-by: Stephen Boyd Cc: David Howells Cc: Cc: Bjorn Andersson Cc: Hauke Mehrtens Cc: Paul Walmsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/frv/include/asm/io.h b/arch/frv/include/asm/io.h index 70dfbea..8062fc7 100644 --- a/arch/frv/include/asm/io.h +++ b/arch/frv/include/asm/io.h @@ -43,9 +43,20 @@ static inline unsigned long _swapl(unsigned long v) //#define __iormb() asm volatile("membar") //#define __iowmb() asm volatile("membar") -#define __raw_readb __builtin_read8 -#define __raw_readw __builtin_read16 -#define __raw_readl __builtin_read32 +static inline u8 __raw_readb(const volatile void __iomem *addr) +{ + return __builtin_read8((volatile void __iomem *)addr); +} + +static inline u16 __raw_readw(const volatile void __iomem *addr) +{ + return __builtin_read16((volatile void __iomem *)addr); +} + +static inline u32 __raw_readl(const volatile void __iomem *addr) +{ + return __builtin_read32((volatile void __iomem *)addr); +} #define __raw_writeb(datum, addr) __builtin_write8(addr, datum) #define __raw_writew(datum, addr) __builtin_write16(addr, datum) -- cgit v0.10.2 From a9aec5881b9d4aca184b29d33484a6a58d23f7f2 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 20 Jan 2016 14:58:35 -0800 Subject: lib/iomap_copy.c: add __ioread32_copy() Some drivers need to read data out of iomem areas 32-bits at a time. Add an API to do this. 
Signed-off-by: Stephen Boyd Cc: Bjorn Andersson Cc: Cc: David Howells Cc: Hauke Mehrtens Cc: Paul Walmsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/io.h b/include/linux/io.h index fffd88d..32403b5 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -29,6 +29,7 @@ struct device; struct resource; __visible void __iowrite32_copy(void __iomem *to, const void *from, size_t count); +void __ioread32_copy(void *to, const void __iomem *from, size_t count); void __iowrite64_copy(void __iomem *to, const void *from, size_t count); #ifdef CONFIG_MMU diff --git a/lib/iomap_copy.c b/lib/iomap_copy.c index 4527e75..b8f1d6c 100644 --- a/lib/iomap_copy.c +++ b/lib/iomap_copy.c @@ -42,6 +42,27 @@ void __attribute__((weak)) __iowrite32_copy(void __iomem *to, EXPORT_SYMBOL_GPL(__iowrite32_copy); /** + * __ioread32_copy - copy data from MMIO space, in 32-bit units + * @to: destination (must be 32-bit aligned) + * @from: source, in MMIO space (must be 32-bit aligned) + * @count: number of 32-bit quantities to copy + * + * Copy data from MMIO space to kernel space, in units of 32 bits at a + * time. Order of access is not guaranteed, nor is a memory barrier + * performed afterwards. + */ +void __ioread32_copy(void *to, const void __iomem *from, size_t count) +{ + u32 *dst = to; + const u32 __iomem *src = from; + const u32 __iomem *end = src + count; + + while (src < end) + *dst++ = __raw_readl(src++); +} +EXPORT_SYMBOL_GPL(__ioread32_copy); + +/** * __iowrite64_copy - copy data to MMIO space, in 64-bit or 32-bit units * @to: destination, in MMIO space (must be 64-bit aligned) * @from: source (must be 64-bit aligned) -- cgit v0.10.2 From c431e678127578c1b86fe976556d79dd669ad953 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 20 Jan 2016 14:58:38 -0800 Subject: drivers/soc/qcom/smd.c: use __ioread32_copy() instead of open-coding it Now that we have a generic library function for this, replace the open-coded instance. 
Signed-off-by: Stephen Boyd Reviewed-by: Bjorn Andersson Cc: Cc: David Howells Cc: Hauke Mehrtens Cc: Paul Walmsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/soc/qcom/smd.c b/drivers/soc/qcom/smd.c index 86b598c..498fd05 100644 --- a/drivers/soc/qcom/smd.c +++ b/drivers/soc/qcom/smd.c @@ -434,20 +434,15 @@ static void smd_copy_to_fifo(void __iomem *dst, /* * Copy count bytes of data using 32bit accesses, if that is required. */ -static void smd_copy_from_fifo(void *_dst, - const void __iomem *_src, +static void smd_copy_from_fifo(void *dst, + const void __iomem *src, size_t count, bool word_aligned) { - u32 *dst = (u32 *)_dst; - u32 *src = (u32 *)_src; - if (word_aligned) { - count /= sizeof(u32); - while (count--) - *dst++ = __raw_readl(src++); + __ioread32_copy(dst, src, count / sizeof(u32)); } else { - memcpy_fromio(_dst, _src, count); + memcpy_fromio(dst, src, count); } } -- cgit v0.10.2 From 1f330c3279004dd83fcbac5669a858366c51e058 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 20 Jan 2016 14:58:41 -0800 Subject: drivers/firmware/broadcom/bcm47xx_nvram.c: use __ioread32_copy() instead of open-coding Now that we have a generic library function for this, replace the open-coded instance. 
Signed-off-by: Stephen Boyd Cc: Hauke Mehrtens Cc: Cc: Paul Walmsley Cc: Bjorn Andersson Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/firmware/broadcom/bcm47xx_nvram.c b/drivers/firmware/broadcom/bcm47xx_nvram.c index e415945..0c2f0a6 100644 --- a/drivers/firmware/broadcom/bcm47xx_nvram.c +++ b/drivers/firmware/broadcom/bcm47xx_nvram.c @@ -56,9 +56,7 @@ static u32 find_nvram_size(void __iomem *end) static int nvram_find_and_copy(void __iomem *iobase, u32 lim) { struct nvram_header __iomem *header; - int i; u32 off; - u32 *src, *dst; u32 size; if (nvram_len) { @@ -95,10 +93,7 @@ static int nvram_find_and_copy(void __iomem *iobase, u32 lim) return -ENXIO; found: - src = (u32 *)header; - dst = (u32 *)nvram_buf; - for (i = 0; i < sizeof(struct nvram_header); i += 4) - *dst++ = __raw_readl(src++); + __ioread32_copy(nvram_buf, header, sizeof(*header) / 4); header = (struct nvram_header *)nvram_buf; nvram_len = header->len; if (nvram_len > size) { @@ -111,8 +106,8 @@ found: nvram_len = NVRAM_SPACE - 1; } /* proceed reading data after header */ - for (; i < nvram_len; i += 4) - *dst++ = readl(src++); + __ioread32_copy(nvram_buf + sizeof(*header), header + 1, + DIV_ROUND_UP(nvram_len, 4)); nvram_buf[NVRAM_SPACE - 1] = '\0'; return 0; -- cgit v0.10.2 From 60b2e8f4f71a21b96306a8a3ea4dd345ea3bfb46 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 20 Jan 2016 14:58:44 -0800 Subject: test_hexdump: rename to test_hexdump The test suite currently doesn't cover many corner cases when hex_dump_to_buffer() runs into overflow. Refactor and amend test suite to cover most of the cases. This patch (of 9): Just to follow the scheme that most of the test modules are using. There is no functional change. 
Signed-off-by: Andy Shevchenko Acked-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/Makefile b/lib/Makefile index 180dd4d..fdeb7e3 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -31,7 +31,7 @@ obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ obj-y += string_helpers.o obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o obj-y += hexdump.o -obj-$(CONFIG_TEST_HEXDUMP) += test-hexdump.o +obj-$(CONFIG_TEST_HEXDUMP) += test_hexdump.o obj-y += kstrtox.o obj-$(CONFIG_TEST_BPF) += test_bpf.o obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o diff --git a/lib/test-hexdump.c b/lib/test-hexdump.c deleted file mode 100644 index 5241df3..0000000 --- a/lib/test-hexdump.c +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Test cases for lib/hexdump.c module. - */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include - -static const unsigned char data_b[] = { - '\xbe', '\x32', '\xdb', '\x7b', '\x0a', '\x18', '\x93', '\xb2', /* 00 - 07 */ - '\x70', '\xba', '\xc4', '\x24', '\x7d', '\x83', '\x34', '\x9b', /* 08 - 0f */ - '\xa6', '\x9c', '\x31', '\xad', '\x9c', '\x0f', '\xac', '\xe9', /* 10 - 17 */ - '\x4c', '\xd1', '\x19', '\x99', '\x43', '\xb1', '\xaf', '\x0c', /* 18 - 1f */ -}; - -static const unsigned char data_a[] = ".2.{....p..$}.4...1.....L...C..."; - -static const char * const test_data_1_le[] __initconst = { - "be", "32", "db", "7b", "0a", "18", "93", "b2", - "70", "ba", "c4", "24", "7d", "83", "34", "9b", - "a6", "9c", "31", "ad", "9c", "0f", "ac", "e9", - "4c", "d1", "19", "99", "43", "b1", "af", "0c", -}; - -static const char * const test_data_2_le[] __initconst = { - "32be", "7bdb", "180a", "b293", - "ba70", "24c4", "837d", "9b34", - "9ca6", "ad31", "0f9c", "e9ac", - "d14c", "9919", "b143", "0caf", -}; - -static const char * const test_data_4_le[] __initconst = { - "7bdb32be", "b293180a", "24c4ba70", "9b34837d", - "ad319ca6", "e9ac0f9c", "9919d14c", "0cafb143", 
-}; - -static const char * const test_data_8_le[] __initconst = { - "b293180a7bdb32be", "9b34837d24c4ba70", - "e9ac0f9cad319ca6", "0cafb1439919d14c", -}; - -static void __init test_hexdump(size_t len, int rowsize, int groupsize, - bool ascii) -{ - char test[32 * 3 + 2 + 32 + 1]; - char real[32 * 3 + 2 + 32 + 1]; - char *p; - const char * const *result; - size_t l = len; - int gs = groupsize, rs = rowsize; - unsigned int i; - - hex_dump_to_buffer(data_b, l, rs, gs, real, sizeof(real), ascii); - - if (rs != 16 && rs != 32) - rs = 16; - - if (l > rs) - l = rs; - - if (!is_power_of_2(gs) || gs > 8 || (len % gs != 0)) - gs = 1; - - if (gs == 8) - result = test_data_8_le; - else if (gs == 4) - result = test_data_4_le; - else if (gs == 2) - result = test_data_2_le; - else - result = test_data_1_le; - - memset(test, ' ', sizeof(test)); - - /* hex dump */ - p = test; - for (i = 0; i < l / gs; i++) { - const char *q = *result++; - size_t amount = strlen(q); - - strncpy(p, q, amount); - p += amount + 1; - } - if (i) - p--; - - /* ASCII part */ - if (ascii) { - p = test + rs * 2 + rs / gs + 1; - strncpy(p, data_a, l); - p += l; - } - - *p = '\0'; - - if (strcmp(test, real)) { - pr_err("Len: %zu row: %d group: %d\n", len, rowsize, groupsize); - pr_err("Result: '%s'\n", real); - pr_err("Expect: '%s'\n", test); - } -} - -static void __init test_hexdump_set(int rowsize, bool ascii) -{ - size_t d = min_t(size_t, sizeof(data_b), rowsize); - size_t len = get_random_int() % d + 1; - - test_hexdump(len, rowsize, 4, ascii); - test_hexdump(len, rowsize, 2, ascii); - test_hexdump(len, rowsize, 8, ascii); - test_hexdump(len, rowsize, 1, ascii); -} - -static void __init test_hexdump_overflow(bool ascii) -{ - char buf[56]; - const char *t = test_data_1_le[0]; - size_t l = get_random_int() % sizeof(buf); - bool a; - int e, r; - - memset(buf, ' ', sizeof(buf)); - - r = hex_dump_to_buffer(data_b, 1, 16, 1, buf, l, ascii); - - if (ascii) - e = 50; - else - e = 2; - buf[e + 2] = '\0'; - - if (!l) 
{ - a = r == e && buf[0] == ' '; - } else if (l < 3) { - a = r == e && buf[0] == '\0'; - } else if (l < 4) { - a = r == e && !strcmp(buf, t); - } else if (ascii) { - if (l < 51) - a = r == e && buf[l - 1] == '\0' && buf[l - 2] == ' '; - else - a = r == e && buf[50] == '\0' && buf[49] == '.'; - } else { - a = r == e && buf[e] == '\0'; - } - - if (!a) { - pr_err("Len: %zu rc: %u strlen: %zu\n", l, r, strlen(buf)); - pr_err("Result: '%s'\n", buf); - } -} - -static int __init test_hexdump_init(void) -{ - unsigned int i; - int rowsize; - - pr_info("Running tests...\n"); - - rowsize = (get_random_int() % 2 + 1) * 16; - for (i = 0; i < 16; i++) - test_hexdump_set(rowsize, false); - - rowsize = (get_random_int() % 2 + 1) * 16; - for (i = 0; i < 16; i++) - test_hexdump_set(rowsize, true); - - for (i = 0; i < 16; i++) - test_hexdump_overflow(false); - - for (i = 0; i < 16; i++) - test_hexdump_overflow(true); - - return -EINVAL; -} -module_init(test_hexdump_init); -MODULE_LICENSE("Dual BSD/GPL"); diff --git a/lib/test_hexdump.c b/lib/test_hexdump.c new file mode 100644 index 0000000..5241df3 --- /dev/null +++ b/lib/test_hexdump.c @@ -0,0 +1,180 @@ +/* + * Test cases for lib/hexdump.c module. 
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include + +static const unsigned char data_b[] = { + '\xbe', '\x32', '\xdb', '\x7b', '\x0a', '\x18', '\x93', '\xb2', /* 00 - 07 */ + '\x70', '\xba', '\xc4', '\x24', '\x7d', '\x83', '\x34', '\x9b', /* 08 - 0f */ + '\xa6', '\x9c', '\x31', '\xad', '\x9c', '\x0f', '\xac', '\xe9', /* 10 - 17 */ + '\x4c', '\xd1', '\x19', '\x99', '\x43', '\xb1', '\xaf', '\x0c', /* 18 - 1f */ +}; + +static const unsigned char data_a[] = ".2.{....p..$}.4...1.....L...C..."; + +static const char * const test_data_1_le[] __initconst = { + "be", "32", "db", "7b", "0a", "18", "93", "b2", + "70", "ba", "c4", "24", "7d", "83", "34", "9b", + "a6", "9c", "31", "ad", "9c", "0f", "ac", "e9", + "4c", "d1", "19", "99", "43", "b1", "af", "0c", +}; + +static const char * const test_data_2_le[] __initconst = { + "32be", "7bdb", "180a", "b293", + "ba70", "24c4", "837d", "9b34", + "9ca6", "ad31", "0f9c", "e9ac", + "d14c", "9919", "b143", "0caf", +}; + +static const char * const test_data_4_le[] __initconst = { + "7bdb32be", "b293180a", "24c4ba70", "9b34837d", + "ad319ca6", "e9ac0f9c", "9919d14c", "0cafb143", +}; + +static const char * const test_data_8_le[] __initconst = { + "b293180a7bdb32be", "9b34837d24c4ba70", + "e9ac0f9cad319ca6", "0cafb1439919d14c", +}; + +static void __init test_hexdump(size_t len, int rowsize, int groupsize, + bool ascii) +{ + char test[32 * 3 + 2 + 32 + 1]; + char real[32 * 3 + 2 + 32 + 1]; + char *p; + const char * const *result; + size_t l = len; + int gs = groupsize, rs = rowsize; + unsigned int i; + + hex_dump_to_buffer(data_b, l, rs, gs, real, sizeof(real), ascii); + + if (rs != 16 && rs != 32) + rs = 16; + + if (l > rs) + l = rs; + + if (!is_power_of_2(gs) || gs > 8 || (len % gs != 0)) + gs = 1; + + if (gs == 8) + result = test_data_8_le; + else if (gs == 4) + result = test_data_4_le; + else if (gs == 2) + result = test_data_2_le; + else + result = test_data_1_le; + + memset(test, ' 
', sizeof(test)); + + /* hex dump */ + p = test; + for (i = 0; i < l / gs; i++) { + const char *q = *result++; + size_t amount = strlen(q); + + strncpy(p, q, amount); + p += amount + 1; + } + if (i) + p--; + + /* ASCII part */ + if (ascii) { + p = test + rs * 2 + rs / gs + 1; + strncpy(p, data_a, l); + p += l; + } + + *p = '\0'; + + if (strcmp(test, real)) { + pr_err("Len: %zu row: %d group: %d\n", len, rowsize, groupsize); + pr_err("Result: '%s'\n", real); + pr_err("Expect: '%s'\n", test); + } +} + +static void __init test_hexdump_set(int rowsize, bool ascii) +{ + size_t d = min_t(size_t, sizeof(data_b), rowsize); + size_t len = get_random_int() % d + 1; + + test_hexdump(len, rowsize, 4, ascii); + test_hexdump(len, rowsize, 2, ascii); + test_hexdump(len, rowsize, 8, ascii); + test_hexdump(len, rowsize, 1, ascii); +} + +static void __init test_hexdump_overflow(bool ascii) +{ + char buf[56]; + const char *t = test_data_1_le[0]; + size_t l = get_random_int() % sizeof(buf); + bool a; + int e, r; + + memset(buf, ' ', sizeof(buf)); + + r = hex_dump_to_buffer(data_b, 1, 16, 1, buf, l, ascii); + + if (ascii) + e = 50; + else + e = 2; + buf[e + 2] = '\0'; + + if (!l) { + a = r == e && buf[0] == ' '; + } else if (l < 3) { + a = r == e && buf[0] == '\0'; + } else if (l < 4) { + a = r == e && !strcmp(buf, t); + } else if (ascii) { + if (l < 51) + a = r == e && buf[l - 1] == '\0' && buf[l - 2] == ' '; + else + a = r == e && buf[50] == '\0' && buf[49] == '.'; + } else { + a = r == e && buf[e] == '\0'; + } + + if (!a) { + pr_err("Len: %zu rc: %u strlen: %zu\n", l, r, strlen(buf)); + pr_err("Result: '%s'\n", buf); + } +} + +static int __init test_hexdump_init(void) +{ + unsigned int i; + int rowsize; + + pr_info("Running tests...\n"); + + rowsize = (get_random_int() % 2 + 1) * 16; + for (i = 0; i < 16; i++) + test_hexdump_set(rowsize, false); + + rowsize = (get_random_int() % 2 + 1) * 16; + for (i = 0; i < 16; i++) + test_hexdump_set(rowsize, true); + + for (i = 0; i < 16; i++) + 
test_hexdump_overflow(false); + + for (i = 0; i < 16; i++) + test_hexdump_overflow(true); + + return -EINVAL; +} +module_init(test_hexdump_init); +MODULE_LICENSE("Dual BSD/GPL"); -- cgit v0.10.2 From 87977ca6bcd051b8bd20adff0a023548ff25902c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 20 Jan 2016 14:58:47 -0800 Subject: test_hexdump: introduce test_hexdump_prepare_test() helper The function prepares the expected result in the provided buffer. Signed-off-by: Andy Shevchenko Acked-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/test_hexdump.c b/lib/test_hexdump.c index 5241df3..ed7c6a7 100644 --- a/lib/test_hexdump.c +++ b/lib/test_hexdump.c @@ -42,19 +42,16 @@ static const char * const test_data_8_le[] __initconst = { "e9ac0f9cad319ca6", "0cafb1439919d14c", }; -static void __init test_hexdump(size_t len, int rowsize, int groupsize, - bool ascii) +static void __init test_hexdump_prepare_test(size_t len, int rowsize, + int groupsize, char *test, + size_t testlen, bool ascii) { - char test[32 * 3 + 2 + 32 + 1]; - char real[32 * 3 + 2 + 32 + 1]; char *p; const char * const *result; size_t l = len; int gs = groupsize, rs = rowsize; unsigned int i; - hex_dump_to_buffer(data_b, l, rs, gs, real, sizeof(real), ascii); - if (rs != 16 && rs != 32) rs = 16; @@ -73,7 +70,7 @@ static void __init test_hexdump(size_t len, int rowsize, int groupsize, else result = test_data_1_le; - memset(test, ' ', sizeof(test)); + memset(test, ' ', testlen); /* hex dump */ p = test; @@ -95,6 +92,21 @@ static void __init test_hexdump(size_t len, int rowsize, int groupsize, } *p = '\0'; +} + +#define TEST_HEXDUMP_BUF_SIZE (32 * 3 + 2 + 32 + 1) + +static void __init test_hexdump(size_t len, int rowsize, int groupsize, + bool ascii) +{ + char test[TEST_HEXDUMP_BUF_SIZE]; + char real[TEST_HEXDUMP_BUF_SIZE]; + + hex_dump_to_buffer(data_b, len, rowsize, groupsize, real, sizeof(real), + ascii); + + test_hexdump_prepare_test(len, rowsize, 
groupsize, test, sizeof(test), + ascii); if (strcmp(test, real)) { pr_err("Len: %zu row: %d group: %d\n", len, rowsize, groupsize); -- cgit v0.10.2 From 3db4a987180acfba3bc117575bfedb81e055778c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 20 Jan 2016 14:58:50 -0800 Subject: test_hexdump: define FILL_CHAR constant Define a character to fill the test buffers. Though the character should be printable since it's used when errors are reported. It should neither be from hex digit [a-fA-F0-9] dictionary nor space. It is recommended not to use one which is present in ASCII part of the test data. Later on we might switch to unprintable character to make test case more robust. Signed-off-by: Andy Shevchenko Suggested-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/test_hexdump.c b/lib/test_hexdump.c index ed7c6a7..1ecdb97 100644 --- a/lib/test_hexdump.c +++ b/lib/test_hexdump.c @@ -42,6 +42,8 @@ static const char * const test_data_8_le[] __initconst = { "e9ac0f9cad319ca6", "0cafb1439919d14c", }; +#define FILL_CHAR '#' + static void __init test_hexdump_prepare_test(size_t len, int rowsize, int groupsize, char *test, size_t testlen, bool ascii) @@ -70,7 +72,7 @@ static void __init test_hexdump_prepare_test(size_t len, int rowsize, else result = test_data_1_le; - memset(test, ' ', testlen); + memset(test, FILL_CHAR, testlen); /* hex dump */ p = test; @@ -79,14 +81,19 @@ static void __init test_hexdump_prepare_test(size_t len, int rowsize, size_t amount = strlen(q); strncpy(p, q, amount); - p += amount + 1; + p += amount; + + *p++ = ' '; } if (i) p--; /* ASCII part */ if (ascii) { - p = test + rs * 2 + rs / gs + 1; + do { + *p++ = ' '; + } while (p < test + rs * 2 + rs / gs + 1); + strncpy(p, data_a, l); p += l; } @@ -134,7 +141,7 @@ static void __init test_hexdump_overflow(bool ascii) bool a; int e, r; - memset(buf, ' ', sizeof(buf)); + memset(buf, FILL_CHAR, sizeof(buf)); r = hex_dump_to_buffer(data_b, 1, 16, 1, 
buf, l, ascii); @@ -145,14 +152,14 @@ static void __init test_hexdump_overflow(bool ascii) buf[e + 2] = '\0'; if (!l) { - a = r == e && buf[0] == ' '; + a = r == e && buf[0] == FILL_CHAR; } else if (l < 3) { a = r == e && buf[0] == '\0'; } else if (l < 4) { a = r == e && !strcmp(buf, t); } else if (ascii) { if (l < 51) - a = r == e && buf[l - 1] == '\0' && buf[l - 2] == ' '; + a = r == e && buf[l - 1] == '\0' && buf[l - 2] == FILL_CHAR; else a = r == e && buf[50] == '\0' && buf[49] == '.'; } else { -- cgit v0.10.2 From a3d601fcc2f94fd1583053a1b1aea5de66ffc79c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 20 Jan 2016 14:58:53 -0800 Subject: test_hexdump: go through all possible lengths of buffer When test for overflow do iterate the buffer length in a range 0 .. BUF_SIZE. Signed-off-by: Andy Shevchenko Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/test_hexdump.c b/lib/test_hexdump.c index 1ecdb97..940b1d3 100644 --- a/lib/test_hexdump.c +++ b/lib/test_hexdump.c @@ -133,17 +133,16 @@ static void __init test_hexdump_set(int rowsize, bool ascii) test_hexdump(len, rowsize, 1, ascii); } -static void __init test_hexdump_overflow(bool ascii) +static void __init test_hexdump_overflow(size_t buflen, bool ascii) { - char buf[56]; + char buf[TEST_HEXDUMP_BUF_SIZE]; const char *t = test_data_1_le[0]; - size_t l = get_random_int() % sizeof(buf); bool a; int e, r; memset(buf, FILL_CHAR, sizeof(buf)); - r = hex_dump_to_buffer(data_b, 1, 16, 1, buf, l, ascii); + r = hex_dump_to_buffer(data_b, 1, 16, 1, buf, buflen, ascii); if (ascii) e = 50; @@ -151,15 +150,15 @@ static void __init test_hexdump_overflow(bool ascii) e = 2; buf[e + 2] = '\0'; - if (!l) { + if (!buflen) { a = r == e && buf[0] == FILL_CHAR; - } else if (l < 3) { + } else if (buflen < 3) { a = r == e && buf[0] == '\0'; - } else if (l < 4) { + } else if (buflen < 4) { a = r == e && !strcmp(buf, t); } else if (ascii) { - if (l < 51) - a = r == e && buf[l - 1] 
== '\0' && buf[l - 2] == FILL_CHAR; + if (buflen < 51) + a = r == e && buf[buflen - 1] == '\0' && buf[buflen - 2] == FILL_CHAR; else a = r == e && buf[50] == '\0' && buf[49] == '.'; } else { @@ -167,7 +166,7 @@ static void __init test_hexdump_overflow(bool ascii) } if (!a) { - pr_err("Len: %zu rc: %u strlen: %zu\n", l, r, strlen(buf)); + pr_err("Len: %zu rc: %u strlen: %zu\n", buflen, r, strlen(buf)); pr_err("Result: '%s'\n", buf); } } @@ -187,11 +186,11 @@ static int __init test_hexdump_init(void) for (i = 0; i < 16; i++) test_hexdump_set(rowsize, true); - for (i = 0; i < 16; i++) - test_hexdump_overflow(false); + for (i = 0; i <= TEST_HEXDUMP_BUF_SIZE; i++) + test_hexdump_overflow(i, false); - for (i = 0; i < 16; i++) - test_hexdump_overflow(true); + for (i = 0; i <= TEST_HEXDUMP_BUF_SIZE; i++) + test_hexdump_overflow(i, true); return -EINVAL; } -- cgit v0.10.2 From ad27a7559a85309a4775389d012f3728c92f5eb0 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 20 Jan 2016 14:58:56 -0800 Subject: test_hexdump: replace magic numbers by their meaning The magic numbers of the length are converted to their actual meaning, such as end of the buffer with and without ASCII part. We don't touch the rest of the magic constants that will be removed in the following commits. 
Signed-off-by: Andy Shevchenko Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/test_hexdump.c b/lib/test_hexdump.c index 940b1d3..141d031 100644 --- a/lib/test_hexdump.c +++ b/lib/test_hexdump.c @@ -137,17 +137,26 @@ static void __init test_hexdump_overflow(size_t buflen, bool ascii) { char buf[TEST_HEXDUMP_BUF_SIZE]; const char *t = test_data_1_le[0]; + size_t len = 1; + int rs = 16, gs = 1; + int ae, he, e, r; bool a; - int e, r; memset(buf, FILL_CHAR, sizeof(buf)); - r = hex_dump_to_buffer(data_b, 1, 16, 1, buf, buflen, ascii); + r = hex_dump_to_buffer(data_b, len, rs, gs, buf, buflen, ascii); + + /* + * Caller must provide the data length multiple of groupsize. The + * calculations below are made with that assumption in mind. + */ + ae = rs * 2 /* hex */ + rs / gs /* spaces */ + 1 /* space */ + len /* ascii */; + he = (gs * 2 /* hex */ + 1 /* space */) * len / gs - 1 /* no trailing space */; if (ascii) - e = 50; + e = ae; else - e = 2; + e = he; buf[e + 2] = '\0'; if (!buflen) { -- cgit v0.10.2 From 7047d813718c8e40929b7267a8d20cbf212f8565 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 20 Jan 2016 14:58:58 -0800 Subject: test_hexdump: switch to memcmp() Better to use memcmp() against the entire buffer to check that nothing has happened to the data in the tail. 
Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/test_hexdump.c b/lib/test_hexdump.c index 141d031..4b949aa 100644 --- a/lib/test_hexdump.c +++ b/lib/test_hexdump.c @@ -72,8 +72,6 @@ static void __init test_hexdump_prepare_test(size_t len, int rowsize, else result = test_data_1_le; - memset(test, FILL_CHAR, testlen); - /* hex dump */ p = test; for (i = 0; i < l / gs; i++) { @@ -109,13 +107,15 @@ static void __init test_hexdump(size_t len, int rowsize, int groupsize, char test[TEST_HEXDUMP_BUF_SIZE]; char real[TEST_HEXDUMP_BUF_SIZE]; + memset(real, FILL_CHAR, sizeof(real)); hex_dump_to_buffer(data_b, len, rowsize, groupsize, real, sizeof(real), ascii); + memset(test, FILL_CHAR, sizeof(test)); test_hexdump_prepare_test(len, rowsize, groupsize, test, sizeof(test), ascii); - if (strcmp(test, real)) { + if (memcmp(test, real, TEST_HEXDUMP_BUF_SIZE)) { pr_err("Len: %zu row: %d group: %d\n", len, rowsize, groupsize); pr_err("Result: '%s'\n", real); pr_err("Expect: '%s'\n", test); -- cgit v0.10.2 From cc77a719a5cfd419d057277fd0fdfca568bcfdd2 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 20 Jan 2016 14:59:01 -0800 Subject: test_hexdump: check all bytes in real buffer After processing by hex_dump_to_buffer() check all the parts to be expected. Part 1. The actual expected hex dump with or without ASCII part. Part 2. Check if the buffer is dirty beyond needed. Part 3. Return code should be as expected. This is done by using comparison of the return code and memcmp() against the test buffer. We fill the buffer by FILL_CHAR ('#') characters, so, we expect to have a tail of the buffer will be left untouched. The terminating NUL is also checked by memcmp(). 
Signed-off-by: Andy Shevchenko Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/test_hexdump.c b/lib/test_hexdump.c index 4b949aa..16a7593 100644 --- a/lib/test_hexdump.c +++ b/lib/test_hexdump.c @@ -135,11 +135,10 @@ static void __init test_hexdump_set(int rowsize, bool ascii) static void __init test_hexdump_overflow(size_t buflen, bool ascii) { + char test[TEST_HEXDUMP_BUF_SIZE]; char buf[TEST_HEXDUMP_BUF_SIZE]; - const char *t = test_data_1_le[0]; - size_t len = 1; - int rs = 16, gs = 1; - int ae, he, e, r; + int rs = rowsize, gs = groupsize; + int ae, he, e, f, r; bool a; memset(buf, FILL_CHAR, sizeof(buf)); @@ -157,26 +156,23 @@ static void __init test_hexdump_overflow(size_t buflen, bool ascii) e = ae; else e = he; - buf[e + 2] = '\0'; - - if (!buflen) { - a = r == e && buf[0] == FILL_CHAR; - } else if (buflen < 3) { - a = r == e && buf[0] == '\0'; - } else if (buflen < 4) { - a = r == e && !strcmp(buf, t); - } else if (ascii) { - if (buflen < 51) - a = r == e && buf[buflen - 1] == '\0' && buf[buflen - 2] == FILL_CHAR; - else - a = r == e && buf[50] == '\0' && buf[49] == '.'; - } else { - a = r == e && buf[e] == '\0'; + + f = min_t(int, e + 1, buflen); + if (buflen) { + test_hexdump_prepare_test(len, rs, gs, test, sizeof(test), ascii); + test[f - 1] = '\0'; } + memset(test + f, FILL_CHAR, sizeof(test) - f); + + a = r == e && !memcmp(test, buf, TEST_HEXDUMP_BUF_SIZE); + + buf[sizeof(buf) - 1] = '\0'; if (!a) { - pr_err("Len: %zu rc: %u strlen: %zu\n", buflen, r, strlen(buf)); - pr_err("Result: '%s'\n", buf); + pr_err("Len: %zu buflen: %zu strlen: %zu\n", + len, buflen, strnlen(buf, sizeof(buf))); + pr_err("Result: %d '%s'\n", r, buf); + pr_err("Expect: %d '%s'\n", e, test); } } -- cgit v0.10.2 From 1dacd9ddd359eed63b210bd9b5000c2cfae287ff Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 20 Jan 2016 14:59:04 -0800 Subject: test_hexdump: test all possible group sizes for overflow Currently the only one 
combination is tested for overflow, i.e. rowsize = 16, groupsize = 1, len = 1. Do various test to go through all possible branches. Signed-off-by: Andy Shevchenko Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/test_hexdump.c b/lib/test_hexdump.c index 16a7593..11d45f5 100644 --- a/lib/test_hexdump.c +++ b/lib/test_hexdump.c @@ -133,7 +133,9 @@ static void __init test_hexdump_set(int rowsize, bool ascii) test_hexdump(len, rowsize, 1, ascii); } -static void __init test_hexdump_overflow(size_t buflen, bool ascii) +static void __init test_hexdump_overflow(size_t buflen, size_t len, + int rowsize, int groupsize, + bool ascii) { char test[TEST_HEXDUMP_BUF_SIZE]; char buf[TEST_HEXDUMP_BUF_SIZE]; @@ -176,6 +178,19 @@ static void __init test_hexdump_overflow(size_t buflen, bool ascii) } } +static void __init test_hexdump_overflow_set(size_t buflen, bool ascii) +{ + unsigned int i = 0; + int rs = (get_random_int() % 2 + 1) * 16; + + do { + int gs = 1 << i; + size_t len = get_random_int() % rs + gs; + + test_hexdump_overflow(buflen, rounddown(len, gs), rs, gs, ascii); + } while (i++ < 3); +} + static int __init test_hexdump_init(void) { unsigned int i; @@ -192,10 +207,10 @@ static int __init test_hexdump_init(void) test_hexdump_set(rowsize, true); for (i = 0; i <= TEST_HEXDUMP_BUF_SIZE; i++) - test_hexdump_overflow(i, false); + test_hexdump_overflow_set(i, false); for (i = 0; i <= TEST_HEXDUMP_BUF_SIZE; i++) - test_hexdump_overflow(i, true); + test_hexdump_overflow_set(i, true); return -EINVAL; } -- cgit v0.10.2 From 7aaf4c3e1235cca77dcc1c5a0848687e7d26a42f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 20 Jan 2016 14:59:07 -0800 Subject: test_hexdump: print statistics at the end Like others test are doing print the gathered statistics after test module is finished. Return from the module based on the result. 
Signed-off-by: Andy Shevchenko Acked-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/test_hexdump.c b/lib/test_hexdump.c index 11d45f5..3f415d8 100644 --- a/lib/test_hexdump.c +++ b/lib/test_hexdump.c @@ -44,6 +44,9 @@ static const char * const test_data_8_le[] __initconst = { #define FILL_CHAR '#' +static unsigned total_tests __initdata; +static unsigned failed_tests __initdata; + static void __init test_hexdump_prepare_test(size_t len, int rowsize, int groupsize, char *test, size_t testlen, bool ascii) @@ -107,6 +110,8 @@ static void __init test_hexdump(size_t len, int rowsize, int groupsize, char test[TEST_HEXDUMP_BUF_SIZE]; char real[TEST_HEXDUMP_BUF_SIZE]; + total_tests++; + memset(real, FILL_CHAR, sizeof(real)); hex_dump_to_buffer(data_b, len, rowsize, groupsize, real, sizeof(real), ascii); @@ -119,6 +124,7 @@ static void __init test_hexdump(size_t len, int rowsize, int groupsize, pr_err("Len: %zu row: %d group: %d\n", len, rowsize, groupsize); pr_err("Result: '%s'\n", real); pr_err("Expect: '%s'\n", test); + failed_tests++; } } @@ -143,6 +149,8 @@ static void __init test_hexdump_overflow(size_t buflen, size_t len, int ae, he, e, f, r; bool a; + total_tests++; + memset(buf, FILL_CHAR, sizeof(buf)); r = hex_dump_to_buffer(data_b, len, rs, gs, buf, buflen, ascii); @@ -175,6 +183,7 @@ static void __init test_hexdump_overflow(size_t buflen, size_t len, len, buflen, strnlen(buf, sizeof(buf))); pr_err("Result: %d '%s'\n", r, buf); pr_err("Expect: %d '%s'\n", e, test); + failed_tests++; } } @@ -196,8 +205,6 @@ static int __init test_hexdump_init(void) unsigned int i; int rowsize; - pr_info("Running tests...\n"); - rowsize = (get_random_int() % 2 + 1) * 16; for (i = 0; i < 16; i++) test_hexdump_set(rowsize, false); @@ -212,7 +219,20 @@ static int __init test_hexdump_init(void) for (i = 0; i <= TEST_HEXDUMP_BUF_SIZE; i++) test_hexdump_overflow_set(i, true); - return -EINVAL; + if (failed_tests == 0) + pr_info("all %u 
tests passed\n", total_tests); + else + pr_err("failed %u out of %u tests\n", failed_tests, total_tests); + + return failed_tests ? -EINVAL : 0; } module_init(test_hexdump_init); + +static void __exit test_hexdump_exit(void) +{ + /* do nothing */ +} +module_exit(test_hexdump_exit); + +MODULE_AUTHOR("Andy Shevchenko "); MODULE_LICENSE("Dual BSD/GPL"); -- cgit v0.10.2 From 243c2137cda52599f6112f52b6be5e61fa6536ae Mon Sep 17 00:00:00 2001 From: Adam Barth Date: Wed, 20 Jan 2016 14:59:09 -0800 Subject: include/linux/radix-tree.h: fix error in docs about locks This text refers to the "first 7 functions", which was correct when written but became incorrect when Johannes Weiner added another function to the list in 139e561660fe ("lib: radix_tree: tree node interface"). Change the text to correctly refer to the first 8 functions. Signed-off-by: Adam Barth Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 33170db..57e7d87 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -154,7 +154,7 @@ do { \ * radix_tree_gang_lookup_tag_slot * radix_tree_tagged * - * The first 7 functions are able to be called locklessly, using RCU. The + * The first 8 functions are able to be called locklessly, using RCU. The * caller must ensure calls to these functions are made within rcu_read_lock() * regions. Other readers (lock-free or otherwise) and modifications may be * running concurrently. -- cgit v0.10.2 From f5948701891322770ad6ede317da5fc9cf33d2f0 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Wed, 20 Jan 2016 14:59:12 -0800 Subject: lib/clz_tab.c: put in lib-y rather than obj-y The clz table (__clz_tab) in lib/clz_tab.c is also provided as part of libgcc.a, and many architectures link against libgcc. To allow the linker to avoid a multiple-definition link failure, clz_tab.o has to be in lib/lib.a rather than lib/builtin.o. 
The specific issue is that libgcc.a comes before lib/builtin.o on vmlinux.o's link command line, so its _clz.o is pulled to satisfy __clz_tab, and then when the remainder of lib/builtin.o is pulled in to satisfy all the other dependencies, the __clz_tab symbols conflict. By putting clz_tab.o in lib.a, the linker can simply avoid pulling it into vmlinux.o when this situation arises. The definitions of __clz_tab are the same in libgcc.a and in the kernel; arguably we could also simply rename the kernel version, but it's unlikely the libgcc version will ever change to become incompatible, so just using it seems reasonably safe. Signed-off-by: Chris Metcalf Acked-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/Makefile b/lib/Makefile index fdeb7e3..b2a82e6 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -154,7 +154,7 @@ obj-$(CONFIG_GLOB) += glob.o obj-$(CONFIG_MPILIB) += mpi/ obj-$(CONFIG_SIGNATURE) += digsig.o -obj-$(CONFIG_CLZ_TAB) += clz_tab.o +lib-$(CONFIG_CLZ_TAB) += clz_tab.o obj-$(CONFIG_DDR) += jedec_ddr_data.o -- cgit v0.10.2 From 938224b5e596c1c30d968ffd927a578ea7c4f45b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 20 Jan 2016 14:59:15 -0800 Subject: checkpatch: warn when casting constants to c90 int or longer types Linus Torvalds wrote: > I can't but help to react that this: > #define IOMMU_ERROR_CODE (~(unsigned long) 0) > Not that this *matters*, but it's a bit odd to have to cast constants > to perfectly regular C types. So add a test that looks for constants that are cast to standard C90 int or longer types and suggest using C90 "6.4.4.1 Integer constants" integer-suffixes instead. 
Miscellanea: o Add a --fix option too Signed-off-by: Joe Perches Suggested-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index c7bf1aa..8645706 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -433,6 +433,28 @@ our @typeList = ( qr{${Ident}_handler_fn}, @typeListMisordered, ); + +our $C90_int_types = qr{(?x: + long\s+long\s+int\s+(?:un)?signed| + long\s+long\s+(?:un)?signed\s+int| + long\s+long\s+(?:un)?signed| + (?:(?:un)?signed\s+)?long\s+long\s+int| + (?:(?:un)?signed\s+)?long\s+long| + int\s+long\s+long\s+(?:un)?signed| + int\s+(?:(?:un)?signed\s+)?long\s+long| + + long\s+int\s+(?:un)?signed| + long\s+(?:un)?signed\s+int| + long\s+(?:un)?signed| + (?:(?:un)?signed\s+)?long\s+int| + (?:(?:un)?signed\s+)?long| + int\s+long\s+(?:un)?signed| + int\s+(?:(?:un)?signed\s+)?long| + + int\s+(?:un)?signed| + (?:(?:un)?signed\s+)?int +)}; + our @typeListFile = (); our @typeListWithAttr = ( @typeList, @@ -5272,6 +5294,26 @@ sub process { } } +# check for cast of C90 native int or longer types constants + if ($line =~ /(\(\s*$C90_int_types\s*\)\s*)($Constant)\b/) { + my $cast = $1; + my $const = $2; + if (WARN("TYPECAST_INT_CONSTANT", + "Unnecessary typecast of c90 int constant\n" . 
$herecurr) && + $fix) { + my $suffix = ""; + my $newconst = $const; + $newconst =~ s/${Int_type}$//; + $suffix .= 'U' if ($cast =~ /\bunsigned\b/); + if ($cast =~ /\blong\s+long\b/) { + $suffix .= 'LL'; + } elsif ($cast =~ /\blong\b/) { + $suffix .= 'L'; + } + $fixed[$fixlinenr] =~ s/\Q$cast\E$const\b/$newconst$suffix/; + } + } + # check for sizeof(&) if ($line =~ /\bsizeof\s*\(\s*\&/) { WARN("SIZEOF_ADDRESS", -- cgit v0.10.2 From 62e15a6daab0f6b3e8233e976201bce9faecaaf7 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 20 Jan 2016 14:59:18 -0800 Subject: checkpatch: improve macros with flow control test The current test excludes any macro with ## concatenation from being reported with hidden flow control. Some macros are used with return or goto statements along with ##args or ##__VA_ARGS__. A somewhat common case is a logging macro like pr_info(fmt, ...) then a return or goto statement. Check the concatenated variable for args or __VA_ARGS__ and allow those macros to also be reported when they contain a return or goto. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 8645706..77b293d 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -4539,7 +4539,7 @@ sub process { #print "LINE<$lines[$ln-1]> len<" . length($lines[$ln-1]) . 
"\n"; $has_flow_statement = 1 if ($ctx =~ /\b(goto|return)\b/); - $has_arg_concat = 1 if ($ctx =~ /\#\#/); + $has_arg_concat = 1 if ($ctx =~ /\#\#/ && $ctx !~ /\#\#\s*(?:__VA_ARGS__|args)\b/); $dstat =~ s/^.\s*\#\s*define\s+$Ident(?:\([^\)]*\))?\s*//; $dstat =~ s/$;//g; -- cgit v0.10.2 From 6b10df4257367dd0ead49f88df473972c00a8b5c Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Wed, 20 Jan 2016 14:59:21 -0800 Subject: checkpatch: fix a number of COMPLEX_MACRO false positives A simple search over the kernel source displays a number of correctly defined multiline macros, which generally are used as an array element initializer: % find ../linux -type f | xargs grep -B1 -H '^[:space]*\[.*\\$' However checkpatch.pl unexpectedly complains about all these macro definitions: % ./scripts/checkpatch.pl --types COMPLEX_MACRO -f include/linux/perf/arm_pmu.h ERROR: Macros with complex values should be enclosed in parentheses +#define PERF_MAP_ALL_UNSUPPORTED \ + [0 ... PERF_COUNT_HW_MAX - 1] = HW_OP_UNSUPPORTED The change intends to fix this type of false positives by flattening only array members and skipping array element designators. 
Signed-off-by: Vladimir Zapolskiy Acked-by: Joe Perches Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 77b293d..0147c91 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -4550,7 +4550,7 @@ sub process { # Flatten any parentheses and braces while ($dstat =~ s/\([^\(\)]*\)/1/ || $dstat =~ s/\{[^\{\}]*\}/1/ || - $dstat =~ s/\[[^\[\]]*\]/1/) + $dstat =~ s/.\[[^\[\]]*\]/1/) { } @@ -4570,7 +4570,8 @@ sub process { union| struct| \.$Ident\s*=\s*| - ^\"|\"$ + ^\"|\"$| + ^\[ }x; #print "REST<$rest> dstat<$dstat> ctx<$ctx>\n"; if ($dstat ne '' && -- cgit v0.10.2 From df0108c5da561c66c333bb46bfe3c1fc65905898 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 20 Jan 2016 14:59:24 -0800 Subject: epoll: add EPOLLEXCLUSIVE flag Currently, epoll file descriptors or epfds (the fd returned from epoll_create[1]()) that are added to a shared wakeup source are always added in a non-exclusive manner. This means that when we have multiple epfds attached to a shared fd source they are all woken up. This creates thundering herd type behavior. Introduce a new 'EPOLLEXCLUSIVE' flag that can be passed as part of the 'event' argument during an epoll_ctl() EPOLL_CTL_ADD operation. This new flag allows for exclusive wakeups when there are multiple epfds attached to a shared fd event source. The implementation walks the list of exclusive waiters, and queues an event to each epfd, until it finds the first waiter that has threads blocked on it via epoll_wait(). The idea is to search for threads which are idle and ready to process the wakeup events. Thus, we queue an event to at least 1 epfd, but may still potentially queue an event to all epfds that are attached to the shared fd source. Performance testing was done by Madars Vitolins using a modified version of Enduro/X. The use of the 'EPOLLEXCLUSIVE' flag reduced the length of this particular workload from 860s down to 24s. 
Sample epoll_ctl text: EPOLLEXCLUSIVE Sets an exclusive wakeup mode for the epfd file descriptor that is being attached to the target file descriptor, fd. Thus, when an event occurs and multiple epfd file descriptors are attached to the same target file using EPOLLEXCLUSIVE, one or more epfds will receive an event with epoll_wait(2). The default in this scenario (when EPOLLEXCLUSIVE is not set) is for all epfds to receive an event. EPOLLEXCLUSIVE may only be specified with the op EPOLL_CTL_ADD. Signed-off-by: Jason Baron Tested-by: Madars Vitolins Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Al Viro Cc: Michael Kerrisk Cc: Eric Wong Cc: Jonathan Corbet Cc: Andy Lutomirski Cc: Hagen Paul Pfeifer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 1e009ca..ae1dbcf 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -92,7 +92,7 @@ */ /* Epoll private bits inside the event mask */ -#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET) +#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE) /* Maximum number of nesting allowed inside epoll sets */ #define EP_MAX_NESTS 4 @@ -1002,6 +1002,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k unsigned long flags; struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; + int ewake = 0; if ((unsigned long)key & POLLFREE) { ep_pwq_from_wait(wait)->whead = NULL; @@ -1066,8 +1067,10 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k * Wake up ( if active ) both the eventpoll wait list and the ->poll() * wait list. 
*/ - if (waitqueue_active(&ep->wq)) + if (waitqueue_active(&ep->wq)) { + ewake = 1; wake_up_locked(&ep->wq); + } if (waitqueue_active(&ep->poll_wait)) pwake++; @@ -1078,6 +1081,9 @@ out_unlock: if (pwake) ep_poll_safewake(&ep->poll_wait); + if (epi->event.events & EPOLLEXCLUSIVE) + return ewake; + return 1; } @@ -1095,7 +1101,10 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); pwq->whead = whead; pwq->base = epi; - add_wait_queue(whead, &pwq->wait); + if (epi->event.events & EPOLLEXCLUSIVE) + add_wait_queue_exclusive(whead, &pwq->wait); + else + add_wait_queue(whead, &pwq->wait); list_add_tail(&pwq->llink, &epi->pwqlist); epi->nwait++; } else { @@ -1862,6 +1871,15 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, goto error_tgt_fput; /* + * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only, + * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation. + * Also, we do not currently supported nested exclusive wakeups. + */ + if ((epds.events & EPOLLEXCLUSIVE) && (op == EPOLL_CTL_MOD || + (op == EPOLL_CTL_ADD && is_file_epoll(tf.file)))) + goto error_tgt_fput; + + /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h index bc81fb2..1c31549 100644 --- a/include/uapi/linux/eventpoll.h +++ b/include/uapi/linux/eventpoll.h @@ -26,6 +26,9 @@ #define EPOLL_CTL_DEL 2 #define EPOLL_CTL_MOD 3 +/* Set exclusive wakeup mode for the target file descriptor */ +#define EPOLLEXCLUSIVE (1 << 28) + /* * Request the handling of system wakeup events so as to prevent system suspends * from happening while those events are being processed. 
-- cgit v0.10.2 From 31c025b5fece8d0fdc88920065fbc1ff7e4a78b1 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Wed, 20 Jan 2016 14:59:27 -0800 Subject: init/main.c: obsolete_checksetup can be boolean Make obsolete_checksetup() return bool due to this particular function only using either one or zero as its return value. No functional change. Signed-off-by: Yaowei Bai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/init/main.c b/init/main.c index c6ebefa..58c9e37 100644 --- a/init/main.c +++ b/init/main.c @@ -164,10 +164,10 @@ static const char *panic_later, *panic_param; extern const struct obs_kernel_param __setup_start[], __setup_end[]; -static int __init obsolete_checksetup(char *line) +static bool __init obsolete_checksetup(char *line) { const struct obs_kernel_param *p; - int had_early_param = 0; + bool had_early_param = false; p = __setup_start; do { @@ -179,13 +179,13 @@ static int __init obsolete_checksetup(char *line) * Keep iterating, as we can have early * params and __setups of same names 8( */ if (line[n] == '\0' || line[n] == '=') - had_early_param = 1; + had_early_param = true; } else if (!p->setup_func) { pr_warn("Parameter %s is obsolete, ignored\n", p->str); - return 1; + return true; } else if (p->setup_func(line + n)) - return 1; + return true; } p++; } while (p < __setup_end); -- cgit v0.10.2 From f057f3b226a5c513aafaa5ece94f3a7f363215c5 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Wed, 20 Jan 2016 14:59:29 -0800 Subject: init/do_mounts: initrd_load() can be boolean Make initrd_load() return bool due to this particular function only using either one or zero as its return value. No functional change. 
Signed-off-by: Yaowei Bai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/init/do_mounts.h b/init/do_mounts.h index f5b978a..067af1d 100644 --- a/init/do_mounts.h +++ b/init/do_mounts.h @@ -57,11 +57,11 @@ static inline int rd_load_image(char *from) { return 0; } #ifdef CONFIG_BLK_DEV_INITRD -int __init initrd_load(void); +bool __init initrd_load(void); #else -static inline int initrd_load(void) { return 0; } +static inline bool initrd_load(void) { return false; } #endif diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index 3e0878e..a1000ca 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -116,7 +116,7 @@ static void __init handle_initrd(void) } } -int __init initrd_load(void) +bool __init initrd_load(void) { if (mount_initrd) { create_dev("/dev/ram", Root_RAM0); @@ -129,9 +129,9 @@ int __init initrd_load(void) if (rd_load_image("/initrd.image") && ROOT_DEV != Root_RAM0) { sys_unlink("/initrd.image"); handle_initrd(); - return 1; + return true; } } sys_unlink("/initrd.image"); - return 0; + return false; } -- cgit v0.10.2 From 2c35dea279351f2ad74a99126b29f9f5394ccc04 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 20 Jan 2016 14:59:32 -0800 Subject: fs/hfs/catalog.c: use list_for_each_entry in hfs_cat_delete Use list_for_each_entry() instead of list_for_each() to simplify the code. Signed-off-by: Geliang Tang Reviewed-by: Vyacheslav Dubeyko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index db458ee..1eb5d41 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -214,7 +214,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str) { struct super_block *sb; struct hfs_find_data fd; - struct list_head *pos; + struct hfs_readdir_data *rd; int res, type; hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? 
str->name : NULL, cnid); @@ -240,9 +240,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str) } } - list_for_each(pos, &HFS_I(dir)->open_dir_list) { - struct hfs_readdir_data *rd = - list_entry(pos, struct hfs_readdir_data, list); + list_for_each_entry(rd, &HFS_I(dir)->open_dir_list, list) { if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0) rd->file->f_pos--; } -- cgit v0.10.2 From a513d86983164a1f74a226ab7006deffbf63907e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 20 Jan 2016 14:59:35 -0800 Subject: fat: allow time_offset to be up to 24 hours Currently we limit values of time_offset mount option to be between -12 and 12 hours. However e.g. zone GMT+12 can have a DST correction on top which makes the total time difference 13 hours. Update the checks in mount option parsing to allow an offset of up to 24 hours to allow for unusual cases. Signed-off-by: Jan Kara Reported-by: Volker Kuhlmann Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 6aece96..3ac9078 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1146,7 +1146,12 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, case Opt_time_offset: if (match_int(&args[0], &option)) return -EINVAL; - if (option < -12 * 60 || option > 12 * 60) + /* + * GMT+-12 zones may have DST corrections so at least + * 13 hours difference is needed. Make the limit 24 + * just in case someone invents something unusual. + */ + if (option < -24 * 60 || option > 24 * 60) return -EINVAL; opts->tz_set = 1; opts->time_offset = option; -- cgit v0.10.2 From a3082d526f2d406d4b488e49a508a0062a23314f Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Wed, 20 Jan 2016 14:59:38 -0800 Subject: fat: add simple validation for directory inode This detects simple corruption cases of a directory, and tries to avoid further damage to user data.
And performance impact of this validation should be very low, or not measurable. Signed-off-by: OGAWA Hirofumi Reported-by: Vegard Nossum Tested-by: Vegard Nossum Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 3ac9078..08ef5fd 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -449,6 +449,24 @@ static int fat_calc_dir_size(struct inode *inode) return 0; } +static int fat_validate_dir(struct inode *dir) +{ + struct super_block *sb = dir->i_sb; + + if (dir->i_nlink < 2) { + /* Directory should have "."/".." entries at least. */ + fat_fs_error(sb, "corrupted directory (invalid entries)"); + return -EIO; + } + if (MSDOS_I(dir)->i_start == 0 || + MSDOS_I(dir)->i_start == MSDOS_SB(sb)->root_cluster) { + /* Directory should point valid cluster. */ + fat_fs_error(sb, "corrupted directory (invalid i_start)"); + return -EIO; + } + return 0; +} + /* doesn't deal with root inode */ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) { @@ -475,6 +493,10 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) MSDOS_I(inode)->mmu_private = inode->i_size; set_nlink(inode, fat_subdirs(inode)); + + error = fat_validate_dir(inode); + if (error < 0) + return error; } else { /* not a directory */ inode->i_generation |= 1; inode->i_mode = fat_make_mode(sbi, de->attr, -- cgit v0.10.2 From b13bb33eacb7266d66a3adf03adaa0886d091789 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 20 Jan 2016 14:59:41 -0800 Subject: fat: add fat_fallocate operation Implement preallocation via the fallocate syscall on VFAT partitions. This patch is based on an earlier patch of the same name which had some issues detailed below and did not get accepted. Refer https://lkml.org/lkml/2007/12/22/130. a) The preallocated space was not persistent when the FALLOC_FL_KEEP_SIZE flag was set. It will deallocate cluster at evict time. 
b) There was no need to zero out the clusters when the flag was set. Instead of doing an expanding truncate, just allocate clusters and add them to the fat chain. This reduces preallocation time. Compatibility with windows: There are no issues when FALLOC_FL_KEEP_SIZE is not set because it just does an expanding truncate. Thus reading from the preallocated area on windows returns null until data is written to it. When a file with preallocated area using the FALLOC_FL_KEEP_SIZE was written to on windows, the windows driver freed-up the preallocated clusters and allocated new clusters for the new data. The freed up clusters get reflected in the free space available for the partition which can be seen from the Volume properties. The windows chkdsk tool also does not report any errors on a disk containing files with preallocated space. And there is also no issue using linux fat fsck, because it discards preallocated clusters at repair time. Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Cc: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/fat/fat.h b/fs/fat/fat.h index be5e153..eed04c0 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -384,6 +384,7 @@ static inline unsigned long fat_dir_hash(int logstart) { return hash_32(logstart, FAT_HASH_BITS); } +extern int fat_add_cluster(struct inode *inode); /* fat/misc.c */ extern __printf(3, 4) __cold diff --git a/fs/fat/file.c b/fs/fat/file.c index a08f103..43d3475 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -14,8 +14,12 @@ #include #include #include +#include #include "fat.h" +static long fat_fallocate(struct file *file, int mode, + loff_t offset, loff_t len); + static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr) { u32 attr; @@ -177,6 +181,7 @@ const struct file_operations fat_file_operations = { #endif .fsync = fat_file_fsync, .splice_read = generic_file_splice_read, + .fallocate = fat_fallocate, }; static int fat_cont_expand(struct inode *inode,
loff_t size) @@ -215,6 +220,62 @@ out: return err; } +/* + * Preallocate space for a file. This implements fat's fallocate file + * operation, which gets called from sys_fallocate system call. User + * space requests len bytes at offset. If FALLOC_FL_KEEP_SIZE is set + * we just allocate clusters without zeroing them out. Otherwise we + * allocate and zero out clusters via an expanding truncate. + */ +static long fat_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + int nr_cluster; /* Number of clusters to be allocated */ + loff_t mm_bytes; /* Number of bytes to be allocated for file */ + loff_t ondisksize; /* block aligned on-disk size in bytes*/ + struct inode *inode = file->f_mapping->host; + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int err = 0; + + /* No support for hole punch or other fallocate flags. */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + /* No support for dir */ + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + mutex_lock(&inode->i_mutex); + if (mode & FALLOC_FL_KEEP_SIZE) { + ondisksize = inode->i_blocks << 9; + if ((offset + len) <= ondisksize) + goto error; + + /* First compute the number of clusters to be allocated */ + mm_bytes = offset + len - ondisksize; + nr_cluster = (mm_bytes + (sbi->cluster_size - 1)) >> + sbi->cluster_bits; + + /* Start the allocation.We are not zeroing out the clusters */ + while (nr_cluster-- > 0) { + err = fat_add_cluster(inode); + if (err) + goto error; + } + } else { + if ((offset + len) <= i_size_read(inode)) + goto error; + + /* This is just an expanding truncate */ + err = fat_cont_expand(inode, (offset + len)); + } + +error: + mutex_unlock(&inode->i_mutex); + return err; +} + /* Free all clusters after the skip'th cluster. 
*/ static int fat_free(struct inode *inode, int skip) { diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 08ef5fd..a6d41fb 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -93,7 +93,7 @@ static struct fat_floppy_defaults { }, }; -static int fat_add_cluster(struct inode *inode) +int fat_add_cluster(struct inode *inode) { int err, cluster; @@ -575,13 +575,43 @@ out: EXPORT_SYMBOL_GPL(fat_build_inode); +static int __fat_write_inode(struct inode *inode, int wait); + +static void fat_free_eofblocks(struct inode *inode) +{ + /* Release unwritten fallocated blocks on inode eviction. */ + if ((inode->i_blocks << 9) > + round_up(MSDOS_I(inode)->mmu_private, + MSDOS_SB(inode->i_sb)->cluster_size)) { + int err; + + fat_truncate_blocks(inode, MSDOS_I(inode)->mmu_private); + /* Fallocate results in updating the i_start/iogstart + * for the zero byte file. So, make it return to + * original state during evict and commit it to avoid + * any corruption on the next access to the cluster + * chain for the file. + */ + err = __fat_write_inode(inode, inode_needs_sync(inode)); + if (err) { + fat_msg(inode->i_sb, KERN_WARNING, "Failed to " + "update on disk inode for unused " + "fallocated blocks, inode could be " + "corrupted. Please run fsck"); + } + + } +} + static void fat_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); if (!inode->i_nlink) { inode->i_size = 0; fat_truncate_blocks(inode, 0); - } + } else + fat_free_eofblocks(inode); + invalidate_inode_buffers(inode); clear_inode(inode); fat_cache_inval_inode(inode); -- cgit v0.10.2 From 7e0f236b5b9cc23aa004eb58ee2201f294d0422a Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 20 Jan 2016 14:59:43 -0800 Subject: fat: skip cluster allocation on fallocated region Skip new cluster allocation after checking i_blocks limit in _fat_get_block, because the blocks are already allocated in fallocated region. 
Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Cc: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/fat/inode.c b/fs/fat/inode.c index a6d41fb..0e5bc19 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -115,7 +115,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock, struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); unsigned long mapped_blocks; - sector_t phys; + sector_t phys, last_block; int err, offset; err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create); @@ -135,8 +135,14 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock, return -EIO; } + last_block = inode->i_blocks >> (sb->s_blocksize_bits - 9); offset = (unsigned long)iblock & (sbi->sec_per_clus - 1); - if (!offset) { + /* + * allocate a cluster according to the following. + * 1) no more available blocks + * 2) not part of fallocate region + */ + if (!offset && !(iblock < last_block)) { /* TODO: multiple cluster allocation would be desirable. */ err = fat_add_cluster(inode); if (err) -- cgit v0.10.2 From 16fab2015099a937e1a771f2785c5dd3445fe483 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 20 Jan 2016 14:59:46 -0800 Subject: fat: permit to return phy block number by fibmap in fallocated region Make the fibmap call return the proper physical block number for any offset request in the fallocated range. 
Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Cc: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 93fc622..5d38492 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -301,15 +301,59 @@ static int fat_bmap_cluster(struct inode *inode, int cluster) return dclus; } -int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, - unsigned long *mapped_blocks, int create) +int fat_get_mapped_cluster(struct inode *inode, sector_t sector, + sector_t last_block, + unsigned long *mapped_blocks, sector_t *bmap) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); + int cluster, offset; + + cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits); + offset = sector & (sbi->sec_per_clus - 1); + cluster = fat_bmap_cluster(inode, cluster); + if (cluster < 0) + return cluster; + else if (cluster) { + *bmap = fat_clus_to_blknr(sbi, cluster) + offset; + *mapped_blocks = sbi->sec_per_clus - offset; + if (*mapped_blocks > last_block - sector) + *mapped_blocks = last_block - sector; + } + + return 0; +} + +static int is_exceed_eof(struct inode *inode, sector_t sector, + sector_t *last_block, int create) +{ + struct super_block *sb = inode->i_sb; const unsigned long blocksize = sb->s_blocksize; const unsigned char blocksize_bits = sb->s_blocksize_bits; + + *last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits; + if (sector >= *last_block) { + if (!create) + return 1; + + /* + * ->mmu_private can access on only allocation path. 
+ * (caller must hold ->i_mutex) + */ + *last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1)) + >> blocksize_bits; + if (sector >= *last_block) + return 1; + } + + return 0; +} + +int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, + unsigned long *mapped_blocks, int create, bool from_bmap) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); sector_t last_block; - int cluster, offset; *phys = 0; *mapped_blocks = 0; @@ -321,31 +365,16 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, return 0; } - last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits; - if (sector >= last_block) { - if (!create) + if (!from_bmap) { + if (is_exceed_eof(inode, sector, &last_block, create)) return 0; - - /* - * ->mmu_private can access on only allocation path. - * (caller must hold ->i_mutex) - */ - last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1)) - >> blocksize_bits; + } else { + last_block = inode->i_blocks >> + (inode->i_sb->s_blocksize_bits - 9); if (sector >= last_block) return 0; } - cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits); - offset = sector & (sbi->sec_per_clus - 1); - cluster = fat_bmap_cluster(inode, cluster); - if (cluster < 0) - return cluster; - else if (cluster) { - *phys = fat_clus_to_blknr(sbi, cluster) + offset; - *mapped_blocks = sbi->sec_per_clus - offset; - if (*mapped_blocks > last_block - sector) - *mapped_blocks = last_block - sector; - } - return 0; + return fat_get_mapped_cluster(inode, sector, last_block, mapped_blocks, + phys); } diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 8b2127f..7def96c 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -91,7 +91,7 @@ next: *bh = NULL; iblock = *pos >> sb->s_blocksize_bits; - err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0); + err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0, false); if (err || !phys) return -1; /* beyond EOF or error */ diff --git a/fs/fat/fat.h b/fs/fat/fat.h index eed04c0..4307cd4 100644 --- 
a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -285,8 +285,11 @@ static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len) extern void fat_cache_inval_inode(struct inode *inode); extern int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus); +extern int fat_get_mapped_cluster(struct inode *inode, sector_t sector, + sector_t last_block, + unsigned long *mapped_blocks, sector_t *bmap); extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, - unsigned long *mapped_blocks, int create); + unsigned long *mapped_blocks, int create, bool from_bmap); /* fat/dir.c */ extern const struct file_operations fat_dir_operations; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 0e5bc19..a559905 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -118,7 +118,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock, sector_t phys, last_block; int err, offset; - err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create); + err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false); if (err) return err; if (phys) { @@ -154,7 +154,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock, *max_blocks = min(mapped_blocks, *max_blocks); MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits; - err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create); + err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false); if (err) return err; @@ -279,13 +279,38 @@ static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter, return ret; } +static int fat_get_block_bmap(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct super_block *sb = inode->i_sb; + unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; + int err; + sector_t bmap; + unsigned long mapped_blocks; + + BUG_ON(create != 0); + + err = fat_bmap(inode, iblock, &bmap, &mapped_blocks, create, true); + if (err) + return err; + + if (bmap) { + 
map_bh(bh_result, sb, bmap); + max_blocks = min(mapped_blocks, max_blocks); + } + + bh_result->b_size = max_blocks << sb->s_blocksize_bits; + + return 0; +} + static sector_t _fat_bmap(struct address_space *mapping, sector_t block) { sector_t blocknr; /* fat_get_cluster() assumes the requested blocknr isn't truncated. */ down_read(&MSDOS_I(mapping->host)->truncate_lock); - blocknr = generic_block_bmap(mapping, block, fat_get_block); + blocknr = generic_block_bmap(mapping, block, fat_get_block_bmap); up_read(&MSDOS_I(mapping->host)->truncate_lock); return blocknr; -- cgit v0.10.2 From 28016128d37a46d89ac5d9a450709284148989d6 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 20 Jan 2016 14:59:49 -0800 Subject: Documentation/filesystems/vfat.txt: update the limitation for fat fallocate Update the limitation for fat fallocate. Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Cc: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt index ce1126a..223c321 100644 --- a/Documentation/filesystems/vfat.txt +++ b/Documentation/filesystems/vfat.txt @@ -180,6 +180,16 @@ dos1xfloppy -- If set, use a fallback default BIOS Parameter Block : 0,1,yes,no,true,false +LIMITATION +--------------------------------------------------------------------- +* The fallocated region of file is discarded at umount/evict time + when using fallocate with FALLOC_FL_KEEP_SIZE. + So, User should assume that fallocated region can be discarded at + last close if there is memory pressure resulting in eviction of + the inode from the memory. As a result, for any dependency on + the fallocated region, user should make sure to recheck fallocate + after reopening the file. + TODO ---------------------------------------------------------------------- * Need to get rid of the raw scanning stuff. 
Instead, always use -- cgit v0.10.2 From 8992de4cec126c6703ece0747239d071dbce725f Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 20 Jan 2016 14:59:52 -0800 Subject: fat: constify fatent_operations structures The fatent_operations structures are never modified, so declare them as const. Done with the help of Coccinelle. Signed-off-by: Julia Lawall Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 4307cd4..e6b764a 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -87,7 +87,7 @@ struct msdos_sb_info { unsigned int vol_id; /*volume ID*/ int fatent_shift; - struct fatent_operations *fatent_ops; + const struct fatent_operations *fatent_ops; struct inode *fat_inode; struct inode *fsinfo_inode; diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 8226557..1d9a8c4 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -99,7 +99,7 @@ err: static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent, int offset, sector_t blocknr) { - struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; + const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); fatent->fat_inode = MSDOS_SB(sb)->fat_inode; @@ -246,7 +246,7 @@ static int fat32_ent_next(struct fat_entry *fatent) return 0; } -static struct fatent_operations fat12_ops = { +static const struct fatent_operations fat12_ops = { .ent_blocknr = fat12_ent_blocknr, .ent_set_ptr = fat12_ent_set_ptr, .ent_bread = fat12_ent_bread, @@ -255,7 +255,7 @@ static struct fatent_operations fat12_ops = { .ent_next = fat12_ent_next, }; -static struct fatent_operations fat16_ops = { +static const struct fatent_operations fat16_ops = { .ent_blocknr = fat_ent_blocknr, .ent_set_ptr = fat16_ent_set_ptr, .ent_bread = fat_ent_bread, @@ -264,7 +264,7 @@ static struct fatent_operations fat16_ops = { .ent_next = fat16_ent_next, }; -static struct fatent_operations fat32_ops = { +static const struct 
fatent_operations fat32_ops = { .ent_blocknr = fat_ent_blocknr, .ent_set_ptr = fat32_ent_set_ptr, .ent_bread = fat_ent_bread, @@ -320,7 +320,7 @@ static inline int fat_ent_update_ptr(struct super_block *sb, int offset, sector_t blocknr) { struct msdos_sb_info *sbi = MSDOS_SB(sb); - struct fatent_operations *ops = sbi->fatent_ops; + const struct fatent_operations *ops = sbi->fatent_ops; struct buffer_head **bhs = fatent->bhs; /* Is this fatent's blocks including this entry? */ @@ -349,7 +349,7 @@ int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); - struct fatent_operations *ops = sbi->fatent_ops; + const struct fatent_operations *ops = sbi->fatent_ops; int err, offset; sector_t blocknr; @@ -407,7 +407,7 @@ int fat_ent_write(struct inode *inode, struct fat_entry *fatent, int new, int wait) { struct super_block *sb = inode->i_sb; - struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; + const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; int err; ops->ent_put(fatent, new); @@ -432,7 +432,7 @@ static inline int fat_ent_next(struct msdos_sb_info *sbi, static inline int fat_ent_read_block(struct super_block *sb, struct fat_entry *fatent) { - struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; + const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; sector_t blocknr; int offset; @@ -463,7 +463,7 @@ int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); - struct fatent_operations *ops = sbi->fatent_ops; + const struct fatent_operations *ops = sbi->fatent_ops; struct fat_entry fatent, prev_ent; struct buffer_head *bhs[MAX_BUF_PER_PAGE]; int i, count, err, nr_bhs, idx_clus; @@ -551,7 +551,7 @@ int fat_free_clusters(struct inode *inode, int cluster) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); - 
struct fatent_operations *ops = sbi->fatent_ops; + const struct fatent_operations *ops = sbi->fatent_ops; struct fat_entry fatent; struct buffer_head *bhs[MAX_BUF_PER_PAGE]; int i, err, nr_bhs; @@ -636,7 +636,7 @@ EXPORT_SYMBOL_GPL(fat_free_clusters); static void fat_ent_reada(struct super_block *sb, struct fat_entry *fatent, unsigned long reada_blocks) { - struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; + const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; sector_t blocknr; int i, offset; @@ -649,7 +649,7 @@ static void fat_ent_reada(struct super_block *sb, struct fat_entry *fatent, int fat_count_free_clusters(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); - struct fatent_operations *ops = sbi->fatent_ops; + const struct fatent_operations *ops = sbi->fatent_ops; struct fat_entry fatent; unsigned long reada_blocks, reada_mask, cur_block; int err = 0, free; -- cgit v0.10.2 From 7c3b00e06d731a28fc3d17ed02ba250642b15b81 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 20 Jan 2016 14:59:55 -0800 Subject: ptrace: make wait_on_bit(JOBCTL_TRAPPING_BIT) in ptrace_attach() killable ptrace_attach() can hang waiting for STOPPED -> TRACED transition if the tracee gets frozen in between, change wait_on_bit() to use TASK_KILLABLE. This doesn't really solve the problem(s) and we probably need to fix the freezer. In particular, note that this means that pm freezer will fail if it races attach-to-stopped-task. And otoh perhaps we can just remove JOBCTL_TRAPPING_BIT altogether, it is not clear if we really need to hide this transition from debugger, WNOHANG after PTRACE_ATTACH can fail anyway if it races with SIGCONT. 
Signed-off-by: Oleg Nesterov Reported-by: Andrey Ryabinin Cc: Roland McGrath Acked-by: Tejun Heo Cc: Pedro Alves Cc: Jan Kratochvil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/ptrace.c b/kernel/ptrace.c index b760bae..aa94aee 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -364,8 +364,14 @@ unlock_creds: mutex_unlock(&task->signal->cred_guard_mutex); out: if (!retval) { - wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, - TASK_UNINTERRUPTIBLE); + /* + * We do not bother to change retval or clear JOBCTL_TRAPPING + * if wait_on_bit() was interrupted by SIGKILL. The tracer will + * not return to user-mode, it will exit and clear this bit in + * __ptrace_unlink() if it wasn't already cleared by the tracee; + * and until then nobody can ptrace this task. + */ + wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE); proc_ptrace_connector(task, PTRACE_ATTACH); } -- cgit v0.10.2 From 570ac9337b5c13dbf46ca6758c376e2e13e8956f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 20 Jan 2016 14:59:58 -0800 Subject: ptrace: task_stopped_code(ptrace => true) can't see TASK_STOPPED task task_stopped_code()->task_is_stopped_or_traced() doesn't look right: the traced task must never be TASK_STOPPED. We cannot add WARN_ON(task_is_stopped(p)), but this is only because do_wait() can race with PTRACE_ATTACH from another thread.
[akpm@linux-foundation.org: teeny cleanup] Signed-off-by: Oleg Nesterov Cc: Andrey Ryabinin Cc: Roland McGrath Acked-by: Tejun Heo Cc: Pedro Alves Cc: Jan Kratochvil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/exit.c b/kernel/exit.c index 07110c6..b0eea83 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1120,8 +1120,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) static int *task_stopped_code(struct task_struct *p, bool ptrace) { if (ptrace) { - if (task_is_stopped_or_traced(p) && - !(p->jobctl & JOBCTL_LISTENING)) + if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING)) return &p->exit_code; } else { if (p->signal->flags & SIGNAL_STOP_STOPPED) -- cgit v0.10.2 From 3dfb7d8cdbc7ea0c2970450e60818bb3eefbad69 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 20 Jan 2016 15:00:01 -0800 Subject: security: let security modules use PTRACE_MODE_* with bitmasks It looks like smack and yama weren't aware that the ptrace mode can have flags ORed into it - PTRACE_MODE_NOAUDIT until now, but only for /proc/$pid/stat, and with the PTRACE_MODE_*CREDS patch, all modes have flags ORed into them. Signed-off-by: Jann Horn Acked-by: Kees Cook Acked-by: Casey Schaufler Cc: Oleg Nesterov Cc: Ingo Molnar Cc: James Morris Cc: "Serge E. Hallyn" Cc: Andy Shevchenko Cc: Andy Lutomirski Cc: Al Viro Cc: "Eric W. 
Biederman" Cc: Willy Tarreau Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 8d85435..2d6e9bd 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -398,12 +398,10 @@ static int smk_copy_relabel(struct list_head *nhead, struct list_head *ohead, */ static inline unsigned int smk_ptrace_mode(unsigned int mode) { - switch (mode) { - case PTRACE_MODE_READ: - return MAY_READ; - case PTRACE_MODE_ATTACH: + if (mode & PTRACE_MODE_ATTACH) return MAY_READWRITE; - } + if (mode & PTRACE_MODE_READ) + return MAY_READ; return 0; } diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index d3c19c9..cb6ed10 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -281,7 +281,7 @@ static int yama_ptrace_access_check(struct task_struct *child, int rc = 0; /* require ptrace target be a child of ptracer on attach */ - if (mode == PTRACE_MODE_ATTACH) { + if (mode & PTRACE_MODE_ATTACH) { switch (ptrace_scope) { case YAMA_SCOPE_DISABLED: /* No additional restrictions. */ @@ -307,7 +307,7 @@ static int yama_ptrace_access_check(struct task_struct *child, } } - if (rc) { + if (rc && (mode & PTRACE_MODE_NOAUDIT) == 0) { printk_ratelimited(KERN_NOTICE "ptrace of pid %d was attempted by: %s (pid %d)\n", child->pid, current->comm, current->pid); -- cgit v0.10.2 From caaee6234d05a58c5b4d05e7bf766131b810a657 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 20 Jan 2016 15:00:04 -0800 Subject: ptrace: use fsuid, fsgid, effective creds for fs access checks By checking the effective credentials instead of the real UID / permitted capabilities, ensure that the calling process actually intended to use its credentials. To ensure that all ptrace checks use the correct caller credentials (e.g. in case out-of-tree code or newly added code omits the PTRACE_MODE_*CREDS flag), use two new flags and require one of them to be set. 
The problem was that when a privileged task had temporarily dropped its privileges, e.g. by calling setreuid(0, user_uid), with the intent to perform following syscalls with the credentials of a user, it still passed ptrace access checks that the user would not be able to pass. While an attacker should not be able to convince the privileged task to perform a ptrace() syscall, this is a problem because the ptrace access check is reused for things in procfs. In particular, the following somewhat interesting procfs entries only rely on ptrace access checks: /proc/$pid/stat - uses the check for determining whether pointers should be visible, useful for bypassing ASLR /proc/$pid/maps - also useful for bypassing ASLR /proc/$pid/cwd - useful for gaining access to restricted directories that contain files with lax permissions, e.g. in this scenario: lrwxrwxrwx root root /proc/13020/cwd -> /root/foobar drwx------ root root /root drwxr-xr-x root root /root/foobar -rw-r--r-- root root /root/foobar/secret Therefore, on a system where a root-owned mode 6755 binary changes its effective credentials as described and then dumps a user-specified file, this could be used by an attacker to reveal the memory layout of root's processes or reveal the contents of files he is not allowed to access (through /proc/$pid/cwd). [akpm@linux-foundation.org: fix warning] Signed-off-by: Jann Horn Acked-by: Kees Cook Cc: Casey Schaufler Cc: Oleg Nesterov Cc: Ingo Molnar Cc: James Morris Cc: "Serge E. Hallyn" Cc: Andy Shevchenko Cc: Andy Lutomirski Cc: Al Viro Cc: "Eric W. 
Biederman" Cc: Willy Tarreau Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/proc/array.c b/fs/proc/array.c index d73291f..b6c00ce 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -395,7 +395,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, state = *get_task_state(task); vsize = eip = esp = 0; - permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT); + permitted = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS | PTRACE_MODE_NOAUDIT); mm = get_task_mm(task); if (mm) { vsize = task_vsize(mm); diff --git a/fs/proc/base.c b/fs/proc/base.c index 2cf5d7e..e665097 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -403,7 +403,7 @@ static const struct file_operations proc_pid_cmdline_ops = { static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ); + struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (mm && !IS_ERR(mm)) { unsigned int nwords = 0; do { @@ -430,7 +430,8 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, wchan = get_wchan(task); - if (wchan && ptrace_may_access(task, PTRACE_MODE_READ) && !lookup_symbol_name(wchan, symname)) + if (wchan && ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS) + && !lookup_symbol_name(wchan, symname)) seq_printf(m, "%s", symname); else seq_putc(m, '0'); @@ -444,7 +445,7 @@ static int lock_trace(struct task_struct *task) int err = mutex_lock_killable(&task->signal->cred_guard_mutex); if (err) return err; - if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) { + if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) { mutex_unlock(&task->signal->cred_guard_mutex); return -EPERM; } @@ -697,7 +698,7 @@ static int proc_fd_access_allowed(struct inode *inode) */ task = get_proc_task(inode); if (task) { - allowed = ptrace_may_access(task, PTRACE_MODE_READ); + allowed = ptrace_may_access(task, 
PTRACE_MODE_READ_FSCREDS); put_task_struct(task); } return allowed; @@ -732,7 +733,7 @@ static bool has_pid_permissions(struct pid_namespace *pid, return true; if (in_group_p(pid->pid_gid)) return true; - return ptrace_may_access(task, PTRACE_MODE_READ); + return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); } @@ -809,7 +810,7 @@ struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) struct mm_struct *mm = ERR_PTR(-ESRCH); if (task) { - mm = mm_access(task, mode); + mm = mm_access(task, mode | PTRACE_MODE_FSCREDS); put_task_struct(task); if (!IS_ERR_OR_NULL(mm)) { @@ -1860,7 +1861,7 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) if (!task) goto out_notask; - mm = mm_access(task, PTRACE_MODE_READ); + mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (IS_ERR_OR_NULL(mm)) goto out; @@ -2013,7 +2014,7 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, goto out; result = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out_put_task; result = -ENOENT; @@ -2066,7 +2067,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) goto out; ret = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out_put_task; ret = 0; @@ -2533,7 +2534,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh if (result) return result; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) { + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { result = -EACCES; goto out_unlock; } diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 1dece87..276f124 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -46,7 +46,7 @@ static const char *proc_ns_get_link(struct dentry *dentry, if (!task) return error; - if (ptrace_may_access(task, PTRACE_MODE_READ)) { + if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { error = 
ns_get_path(&ns_path, task, ns_ops); if (!error) nd_jump_link(&ns_path); @@ -67,7 +67,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl if (!task) return res; - if (ptrace_may_access(task, PTRACE_MODE_READ)) { + if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { res = ns_get_name(name, sizeof(name), task, ns_ops); if (res >= 0) res = readlink_copy(buffer, buflen, name); diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 061265f..504c98a 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -57,7 +57,29 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); #define PTRACE_MODE_READ 0x01 #define PTRACE_MODE_ATTACH 0x02 #define PTRACE_MODE_NOAUDIT 0x04 -/* Returns true on success, false on denial. */ +#define PTRACE_MODE_FSCREDS 0x08 +#define PTRACE_MODE_REALCREDS 0x10 + +/* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */ +#define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS) +#define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS) +#define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS) +#define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS) + +/** + * ptrace_may_access - check whether the caller is permitted to access + * a target task. + * @task: target task + * @mode: selects type of access and caller credentials + * + * Returns true on success, false on denial. + * + * One of the flags PTRACE_MODE_FSCREDS and PTRACE_MODE_REALCREDS must + * be set in @mode to specify whether the access was requested through + * a filesystem syscall (should use effective capabilities and fsuid + * of the caller) or through an explicit syscall such as + * process_vm_writev or ptrace (and should use the real credentials). 
+ */ extern bool ptrace_may_access(struct task_struct *task, unsigned int mode); static inline int ptrace_reparented(struct task_struct *child) diff --git a/kernel/events/core.c b/kernel/events/core.c index bf82441..c095741 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3376,7 +3376,7 @@ find_lively_task_by_vpid(pid_t vpid) /* Reuse ptrace permission checks for now. */ err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) goto errout; return task; diff --git a/kernel/futex.c b/kernel/futex.c index c6f5145..0773f2b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2884,7 +2884,7 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, } ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ)) + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = p->robust_list; diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 55c8c93..4ae3232 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -155,7 +155,7 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, } ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ)) + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = p->compat_robust_list; diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 0aa69ea..3a47fa9 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -122,8 +122,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, &task2->signal->cred_guard_mutex); if (ret) goto err; - if (!ptrace_may_access(task1, PTRACE_MODE_READ) || - !ptrace_may_access(task2, PTRACE_MODE_READ)) { + if (!ptrace_may_access(task1, PTRACE_MODE_READ_REALCREDS) || + !ptrace_may_access(task2, PTRACE_MODE_READ_REALCREDS)) { ret = -EPERM; goto err_unlock; } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index aa94aee..2341efe 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -219,6 +219,14 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) static int 
__ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; + int dumpable = 0; + kuid_t caller_uid; + kgid_t caller_gid; + + if (!(mode & PTRACE_MODE_FSCREDS) == !(mode & PTRACE_MODE_REALCREDS)) { + WARN(1, "denying ptrace access check without PTRACE_MODE_*CREDS\n"); + return -EPERM; + } /* May we inspect the given task? * This check is used both for attaching with ptrace @@ -228,18 +236,33 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) * because setting up the necessary parent/child relationship * or halting the specified task is impossible. */ - int dumpable = 0; + /* Don't let security modules deny introspection */ if (same_thread_group(task, current)) return 0; rcu_read_lock(); + if (mode & PTRACE_MODE_FSCREDS) { + caller_uid = cred->fsuid; + caller_gid = cred->fsgid; + } else { + /* + * Using the euid would make more sense here, but something + * in userland might rely on the old behavior, and this + * shouldn't be a security problem since + * PTRACE_MODE_REALCREDS implies that the caller explicitly + * used a syscall that requests access to another process + * (and not a filesystem syscall to procfs). 
+ */ + caller_uid = cred->uid; + caller_gid = cred->gid; + } tcred = __task_cred(task); - if (uid_eq(cred->uid, tcred->euid) && - uid_eq(cred->uid, tcred->suid) && - uid_eq(cred->uid, tcred->uid) && - gid_eq(cred->gid, tcred->egid) && - gid_eq(cred->gid, tcred->sgid) && - gid_eq(cred->gid, tcred->gid)) + if (uid_eq(caller_uid, tcred->euid) && + uid_eq(caller_uid, tcred->suid) && + uid_eq(caller_uid, tcred->uid) && + gid_eq(caller_gid, tcred->egid) && + gid_eq(caller_gid, tcred->sgid) && + gid_eq(caller_gid, tcred->gid)) goto ok; if (ptrace_has_cap(tcred->user_ns, mode)) goto ok; @@ -306,7 +329,7 @@ static int ptrace_attach(struct task_struct *task, long request, goto out; task_lock(task); - retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); task_unlock(task); if (retval) goto unlock_creds; diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index e88d071..5d453e5 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -194,7 +194,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, goto free_proc_pages; } - mm = mm_access(task, PTRACE_MODE_ATTACH); + mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS); if (!mm || IS_ERR(mm)) { rc = IS_ERR(mm) ? 
PTR_ERR(mm) : -ESRCH; /* diff --git a/security/commoncap.c b/security/commoncap.c index 1832cf7..48071ed 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -137,12 +137,17 @@ int cap_ptrace_access_check(struct task_struct *child, unsigned int mode) { int ret = 0; const struct cred *cred, *child_cred; + const kernel_cap_t *caller_caps; rcu_read_lock(); cred = current_cred(); child_cred = __task_cred(child); + if (mode & PTRACE_MODE_FSCREDS) + caller_caps = &cred->cap_effective; + else + caller_caps = &cred->cap_permitted; if (cred->user_ns == child_cred->user_ns && - cap_issubset(child_cred->cap_permitted, cred->cap_permitted)) + cap_issubset(child_cred->cap_permitted, *caller_caps)) goto out; if (ns_capable(child_cred->user_ns, CAP_SYS_PTRACE)) goto out; -- cgit v0.10.2 From ac94b6e3ba858b8de1dfe3f77ad215af7b648545 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 20 Jan 2016 15:00:08 -0800 Subject: fs/coredump: prevent "" / "." / ".." core path components Let %h and %e print empty values as "!", "." as "!" and ".." as "!.". This prevents hostnames and comm values that are empty or consist of one or two dots from changing the directory level at which the corefile will be stored. Consider the case where someone decides to sort coredumps by hostname with a core pattern like "/cores/%h/core.%e.%p.%t" or so. In this case, hostnames "" and "." would cause the coredump to land directly in /cores, which is not what the intent behind the core pattern is, and ".." would cause the coredump to land in /. Yeah, there probably aren't many people who do that, but I still don't want this edgecase to be kind of broken. It seems very unlikely that this caused security issues anywhere, so I'm not requesting a stable backport. 
[akpm@linux-foundation.org: tweak code comment] Signed-off-by: Jann Horn Acked-by: Kees Cook Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/coredump.c b/fs/coredump.c index b3c153c..9ea87e9 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -118,6 +118,26 @@ int cn_esc_printf(struct core_name *cn, const char *fmt, ...) ret = cn_vprintf(cn, fmt, arg); va_end(arg); + if (ret == 0) { + /* + * Ensure that this coredump name component can't cause the + * resulting corefile path to consist of a ".." or ".". + */ + if ((cn->used - cur == 1 && cn->corename[cur] == '.') || + (cn->used - cur == 2 && cn->corename[cur] == '.' + && cn->corename[cur+1] == '.')) + cn->corename[cur] = '!'; + + /* + * Empty names are fishy and could be used to create a "//" in a + * corefile name, causing the coredump to happen one directory + * level too high. Enforce that all components of the core + * pattern are at least one character long. + */ + if (cn->used == cur) + ret = cn_printf(cn, "!"); + } + for (; cur < cn->used; ++cur) { if (cn->corename[cur] == '/') cn->corename[cur] = '!'; -- cgit v0.10.2 From c428fbdbf3e9515bfe686881ffdba862dbd8cb6f Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Wed, 20 Jan 2016 15:00:10 -0800 Subject: exit: remove unneeded declaration of exit_mm() Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/exit.c b/kernel/exit.c index b0eea83..10e0882 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -59,8 +59,6 @@ #include #include -static void exit_mm(struct task_struct *tsk); - static void __unhash_process(struct task_struct *p, bool group_dead) { nr_threads--; -- cgit v0.10.2 From a0512164278b11deb3b07bf14e72f8b979b07aa6 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 20 Jan 2016 15:00:13 -0800 Subject: powerpc/fadump: rename cpu_online_mask member of struct fadump_crash_info_header The four cpumasks 
cpu_{possible,online,present,active}_bits are exposed readonly via the corresponding const variables cpu_xyz_mask. But they are also accessible for arbitrary writing via the exposed functions set_cpu_xyz. There's quite a bit of code throughout the kernel which iterates over or otherwise accesses these bitmaps, and having the access go via the cpu_xyz_mask variables is nowadays [1] simply a useless indirection. It may be that any problem in CS can be solved by an extra level of indirection, but that doesn't mean every extra indirection solves a problem. In this case, it even necessitates some minor ugliness (see 4/6). Patch 1/6 is new in v2, and fixes a build failure on ppc by renaming a struct member, to avoid problems when the identifier cpu_online_mask becomes a macro later in the series. The next four patches eliminate the cpu_xyz_mask variables by simply exposing the actual bitmaps, after renaming them to discourage direct access - that still happens through cpu_xyz_mask, which are now simply macros with the same type and value as they used to have. After that, there's no longer any reason to have the setter functions be out-of-line: The boolean parameter is almost always a literal true or false, so by making them static inlines they will usually compile to one or two instructions. For a defconfig build on x86_64, bloat-o-meter says we save ~3000 bytes. We also save a little stack (stackdelta says 127 functions have a 16 byte smaller stack frame, while two grow by that amount). Mostly because, when iterating over the mask, gcc typically loads the value of cpu_xyz_mask into a callee-saved register and from there into %rdi before each find_next_bit call - now it can just load the appropriate immediate address into %rdi before each call. [1] See Rusty's kind explanation http://thread.gmane.org/gmane.linux.kernel/2047078/focus=2047722 for some historic context. 
This patch (of 6): As preparation for eliminating the indirect access to the various global cpu_*_bits bitmaps via the pointer variables cpu_*_mask, rename the cpu_online_mask member of struct fadump_crash_info_header to simply online_mask, thus allowing cpu_online_mask to become a macro. Signed-off-by: Rasmus Villemoes Acked-by: Michael Ellerman Cc: Greg Kroah-Hartman Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index 493e72f..b4407d0 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -191,7 +191,7 @@ struct fadump_crash_info_header { u64 elfcorehdr_addr; u32 crashing_cpu; struct pt_regs regs; - struct cpumask cpu_online_mask; + struct cpumask online_mask; }; /* Crash memory ranges */ diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 26d091a..3cb3b02a 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -415,7 +415,7 @@ void crash_fadump(struct pt_regs *regs, const char *str) else ppc_save_regs(&fdh->regs); - fdh->cpu_online_mask = *cpu_online_mask; + fdh->online_mask = *cpu_online_mask; /* Call ibm,os-term rtas call to trigger firmware assisted dump */ rtas_os_term((char *)str); @@ -646,7 +646,7 @@ static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm) } /* Lower 4 bytes of reg_value contains logical cpu id */ cpu = be64_to_cpu(reg_entry->reg_value) & FADUMP_CPU_ID_MASK; - if (fdh && !cpumask_test_cpu(cpu, &fdh->cpu_online_mask)) { + if (fdh && !cpumask_test_cpu(cpu, &fdh->online_mask)) { SKIP_TO_NEXT_CPU(reg_entry); continue; } -- cgit v0.10.2 From c4c54dd1caf1393c529e7ea1f18b4342c796a49c Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 20 Jan 2016 15:00:16 -0800 Subject: kernel/cpu.c: change type of cpu_possible_bits and friends Change cpu_possible_bits and friends (online, present, active) from being bitmaps that happen 
to have the right size to actually being struct cpumasks. Also rename them to __cpu_xyz_mask. This is mostly a small cleanup in preparation for exporting them and, eventually, eliminating the extra indirection through the cpu_xyz_mask variables. Signed-off-by: Rasmus Villemoes Acked-by: Rusty Russell Cc: Greg Kroah-Hartman Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/cpu.c b/kernel/cpu.c index 85ff5e2..6a96b71 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -759,71 +759,71 @@ const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; EXPORT_SYMBOL(cpu_all_bits); #ifdef CONFIG_INIT_ALL_POSSIBLE -static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly - = CPU_BITS_ALL; +static struct cpumask __cpu_possible_mask __read_mostly + = {CPU_BITS_ALL}; #else -static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly; +static struct cpumask __cpu_possible_mask __read_mostly; #endif -const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits); +const struct cpumask *const cpu_possible_mask = &__cpu_possible_mask; EXPORT_SYMBOL(cpu_possible_mask); -static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly; -const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits); +static struct cpumask __cpu_online_mask __read_mostly; +const struct cpumask *const cpu_online_mask = &__cpu_online_mask; EXPORT_SYMBOL(cpu_online_mask); -static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly; -const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits); +static struct cpumask __cpu_present_mask __read_mostly; +const struct cpumask *const cpu_present_mask = &__cpu_present_mask; EXPORT_SYMBOL(cpu_present_mask); -static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly; -const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits); +static struct cpumask __cpu_active_mask __read_mostly; +const struct cpumask *const 
cpu_active_mask = &__cpu_active_mask; EXPORT_SYMBOL(cpu_active_mask); void set_cpu_possible(unsigned int cpu, bool possible) { if (possible) - cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits)); + cpumask_set_cpu(cpu, &__cpu_possible_mask); else - cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits)); + cpumask_clear_cpu(cpu, &__cpu_possible_mask); } void set_cpu_present(unsigned int cpu, bool present) { if (present) - cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits)); + cpumask_set_cpu(cpu, &__cpu_present_mask); else - cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits)); + cpumask_clear_cpu(cpu, &__cpu_present_mask); } void set_cpu_online(unsigned int cpu, bool online) { if (online) { - cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits)); - cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits)); + cpumask_set_cpu(cpu, &__cpu_online_mask); + cpumask_set_cpu(cpu, &__cpu_active_mask); } else { - cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits)); + cpumask_clear_cpu(cpu, &__cpu_online_mask); } } void set_cpu_active(unsigned int cpu, bool active) { if (active) - cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits)); + cpumask_set_cpu(cpu, &__cpu_active_mask); else - cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits)); + cpumask_clear_cpu(cpu, &__cpu_active_mask); } void init_cpu_present(const struct cpumask *src) { - cpumask_copy(to_cpumask(cpu_present_bits), src); + cpumask_copy(&__cpu_present_mask, src); } void init_cpu_possible(const struct cpumask *src) { - cpumask_copy(to_cpumask(cpu_possible_bits), src); + cpumask_copy(&__cpu_possible_mask, src); } void init_cpu_online(const struct cpumask *src) { - cpumask_copy(to_cpumask(cpu_online_bits), src); + cpumask_copy(&__cpu_online_mask, src); } -- cgit v0.10.2 From 4b804c85dc37db6c108832b28cd54673ff7ee037 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 20 Jan 2016 15:00:19 -0800 Subject: kernel/cpu.c: export __cpu_*_mask Exporting the cpumasks __cpu_possible_mask and friends will allow us to remove the extra 
indirection through the cpu_*_mask variables. It will also allow the set_cpu_* functions to become static inlines, which will give a .text reduction. Signed-off-by: Rasmus Villemoes Acked-by: Rusty Russell Cc: Greg Kroah-Hartman Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 59915ea..d4545a1 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -89,6 +89,10 @@ extern const struct cpumask *const cpu_possible_mask; extern const struct cpumask *const cpu_online_mask; extern const struct cpumask *const cpu_present_mask; extern const struct cpumask *const cpu_active_mask; +extern struct cpumask __cpu_possible_mask; +extern struct cpumask __cpu_online_mask; +extern struct cpumask __cpu_present_mask; +extern struct cpumask __cpu_active_mask; #if NR_CPUS > 1 #define num_online_cpus() cpumask_weight(cpu_online_mask) diff --git a/kernel/cpu.c b/kernel/cpu.c index 6a96b71..35d1d45 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -759,23 +759,27 @@ const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; EXPORT_SYMBOL(cpu_all_bits); #ifdef CONFIG_INIT_ALL_POSSIBLE -static struct cpumask __cpu_possible_mask __read_mostly +struct cpumask __cpu_possible_mask __read_mostly = {CPU_BITS_ALL}; #else -static struct cpumask __cpu_possible_mask __read_mostly; +struct cpumask __cpu_possible_mask __read_mostly; #endif +EXPORT_SYMBOL(__cpu_possible_mask); const struct cpumask *const cpu_possible_mask = &__cpu_possible_mask; EXPORT_SYMBOL(cpu_possible_mask); -static struct cpumask __cpu_online_mask __read_mostly; +struct cpumask __cpu_online_mask __read_mostly; +EXPORT_SYMBOL(__cpu_online_mask); const struct cpumask *const cpu_online_mask = &__cpu_online_mask; EXPORT_SYMBOL(cpu_online_mask); -static struct cpumask __cpu_present_mask __read_mostly; +struct cpumask __cpu_present_mask __read_mostly; +EXPORT_SYMBOL(__cpu_present_mask); const struct cpumask *const 
cpu_present_mask = &__cpu_present_mask; EXPORT_SYMBOL(cpu_present_mask); -static struct cpumask __cpu_active_mask __read_mostly; +struct cpumask __cpu_active_mask __read_mostly; +EXPORT_SYMBOL(__cpu_active_mask); const struct cpumask *const cpu_active_mask = &__cpu_active_mask; EXPORT_SYMBOL(cpu_active_mask); -- cgit v0.10.2 From 848e239155a17c5373e52278ff9a13b29867ea8a Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 20 Jan 2016 15:00:22 -0800 Subject: drivers/base/cpu.c: use __cpu_*_mask directly The only user of the lvalue-ness of the cpu_*_mask variables is in drivers/base/cpu.c, and that is mostly a work-around for the fact that not even const variables can be used in static initialization. Now that the underlying struct cpumasks are exposed we can take their address. Signed-off-by: Rasmus Villemoes Acked-by: Rusty Russell Acked-by: Greg Kroah-Hartman Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 91bbb19..691eeea 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -200,7 +200,7 @@ static const struct attribute_group *hotplugable_cpu_attr_groups[] = { struct cpu_attr { struct device_attribute attr; - const struct cpumask *const * const map; + const struct cpumask *const map; }; static ssize_t show_cpus_attr(struct device *dev, @@ -209,7 +209,7 @@ static ssize_t show_cpus_attr(struct device *dev, { struct cpu_attr *ca = container_of(attr, struct cpu_attr, attr); - return cpumap_print_to_pagebuf(true, buf, *ca->map); + return cpumap_print_to_pagebuf(true, buf, ca->map); } #define _CPU_ATTR(name, map) \ @@ -217,9 +217,9 @@ static ssize_t show_cpus_attr(struct device *dev, /* Keep in sync with cpu_subsys_attrs */ static struct cpu_attr cpu_attrs[] = { - _CPU_ATTR(online, &cpu_online_mask), - _CPU_ATTR(possible, &cpu_possible_mask), - _CPU_ATTR(present, &cpu_present_mask), + _CPU_ATTR(online, &__cpu_online_mask), + _CPU_ATTR(possible, &__cpu_possible_mask), 
+ _CPU_ATTR(present, &__cpu_present_mask), }; /* -- cgit v0.10.2 From 5aec01b834fd6f8ca49d1aeede665b950d0c148e Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 20 Jan 2016 15:00:25 -0800 Subject: kernel/cpu.c: eliminate cpu_*_mask Replace the variables cpu_possible_mask, cpu_online_mask, cpu_present_mask and cpu_active_mask with macros expanding to expressions of the same type and value, eliminating some indirection. Signed-off-by: Rasmus Villemoes Acked-by: Rusty Russell Cc: Greg Kroah-Hartman Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index d4545a1..52ab539 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -85,14 +85,14 @@ extern int nr_cpu_ids; * only one CPU. */ -extern const struct cpumask *const cpu_possible_mask; -extern const struct cpumask *const cpu_online_mask; -extern const struct cpumask *const cpu_present_mask; -extern const struct cpumask *const cpu_active_mask; extern struct cpumask __cpu_possible_mask; extern struct cpumask __cpu_online_mask; extern struct cpumask __cpu_present_mask; extern struct cpumask __cpu_active_mask; +#define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask) +#define cpu_online_mask ((const struct cpumask *)&__cpu_online_mask) +#define cpu_present_mask ((const struct cpumask *)&__cpu_present_mask) +#define cpu_active_mask ((const struct cpumask *)&__cpu_active_mask) #if NR_CPUS > 1 #define num_online_cpus() cpumask_weight(cpu_online_mask) diff --git a/kernel/cpu.c b/kernel/cpu.c index 35d1d45..8734fc7 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -765,23 +765,15 @@ struct cpumask __cpu_possible_mask __read_mostly struct cpumask __cpu_possible_mask __read_mostly; #endif EXPORT_SYMBOL(__cpu_possible_mask); -const struct cpumask *const cpu_possible_mask = &__cpu_possible_mask; -EXPORT_SYMBOL(cpu_possible_mask); struct cpumask __cpu_online_mask __read_mostly; 
EXPORT_SYMBOL(__cpu_online_mask); -const struct cpumask *const cpu_online_mask = &__cpu_online_mask; -EXPORT_SYMBOL(cpu_online_mask); struct cpumask __cpu_present_mask __read_mostly; EXPORT_SYMBOL(__cpu_present_mask); -const struct cpumask *const cpu_present_mask = &__cpu_present_mask; -EXPORT_SYMBOL(cpu_present_mask); struct cpumask __cpu_active_mask __read_mostly; EXPORT_SYMBOL(__cpu_active_mask); -const struct cpumask *const cpu_active_mask = &__cpu_active_mask; -EXPORT_SYMBOL(cpu_active_mask); void set_cpu_possible(unsigned int cpu, bool possible) { -- cgit v0.10.2 From 9425676a363c0976e3d43dda792dc4711a651d1d Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 20 Jan 2016 15:00:28 -0800 Subject: kernel/cpu.c: make set_cpu_* static inlines Almost all callers of the set_cpu_* functions pass an explicit true or false. Making them static inline thus replaces the function calls with a simple set_bit/clear_bit, saving some .text. Signed-off-by: Rasmus Villemoes Acked-by: Rusty Russell Cc: Greg Kroah-Hartman Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 52ab539..fc14275 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -720,14 +720,49 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); #define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask) /* Wrappers for arch boot code to manipulate normally-constant masks */ -void set_cpu_possible(unsigned int cpu, bool possible); -void set_cpu_present(unsigned int cpu, bool present); -void set_cpu_online(unsigned int cpu, bool online); -void set_cpu_active(unsigned int cpu, bool active); void init_cpu_present(const struct cpumask *src); void init_cpu_possible(const struct cpumask *src); void init_cpu_online(const struct cpumask *src); +static inline void +set_cpu_possible(unsigned int cpu, bool possible) +{ + if (possible) + cpumask_set_cpu(cpu, &__cpu_possible_mask); + else + 
cpumask_clear_cpu(cpu, &__cpu_possible_mask); +} + +static inline void +set_cpu_present(unsigned int cpu, bool present) +{ + if (present) + cpumask_set_cpu(cpu, &__cpu_present_mask); + else + cpumask_clear_cpu(cpu, &__cpu_present_mask); +} + +static inline void +set_cpu_online(unsigned int cpu, bool online) +{ + if (online) { + cpumask_set_cpu(cpu, &__cpu_online_mask); + cpumask_set_cpu(cpu, &__cpu_active_mask); + } else { + cpumask_clear_cpu(cpu, &__cpu_online_mask); + } +} + +static inline void +set_cpu_active(unsigned int cpu, bool active) +{ + if (active) + cpumask_set_cpu(cpu, &__cpu_active_mask); + else + cpumask_clear_cpu(cpu, &__cpu_active_mask); +} + + /** * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask * * @bitmap: the bitmap diff --git a/kernel/cpu.c b/kernel/cpu.c index 8734fc7..5b9d396 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -775,40 +775,6 @@ EXPORT_SYMBOL(__cpu_present_mask); struct cpumask __cpu_active_mask __read_mostly; EXPORT_SYMBOL(__cpu_active_mask); -void set_cpu_possible(unsigned int cpu, bool possible) -{ - if (possible) - cpumask_set_cpu(cpu, &__cpu_possible_mask); - else - cpumask_clear_cpu(cpu, &__cpu_possible_mask); -} - -void set_cpu_present(unsigned int cpu, bool present) -{ - if (present) - cpumask_set_cpu(cpu, &__cpu_present_mask); - else - cpumask_clear_cpu(cpu, &__cpu_present_mask); -} - -void set_cpu_online(unsigned int cpu, bool online) -{ - if (online) { - cpumask_set_cpu(cpu, &__cpu_online_mask); - cpumask_set_cpu(cpu, &__cpu_active_mask); - } else { - cpumask_clear_cpu(cpu, &__cpu_online_mask); - } -} - -void set_cpu_active(unsigned int cpu, bool active) -{ - if (active) - cpumask_set_cpu(cpu, &__cpu_active_mask); - else - cpumask_clear_cpu(cpu, &__cpu_active_mask); -} - void init_cpu_present(const struct cpumask *src) { cpumask_copy(&__cpu_present_mask, src); -- cgit v0.10.2 From cdf4b3fa03bab157d2d70d4de65bb7ae319b084f Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 20 Jan 2016 15:00:31 -0800 
Subject: kexec: set KEXEC_TYPE_CRASH before sanity_check_segment_list() sanity_check_segment_list() checks KEXEC_TYPE_CRASH flag to ensure all the segments of the loaded crash kernel are within the kernel crash resource limits, so set the flag beforehand. Signed-off-by: Xunlei Pang Acked-by: Dave Young Cc: Eric Biederman Cc: Vivek Goyal Acked-by: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/kexec.c b/kernel/kexec.c index d873b64..ee70aef 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -63,16 +63,16 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, if (ret) goto out_free_image; - ret = sanity_check_segment_list(image); - if (ret) - goto out_free_image; - - /* Enable the special crash kernel control page allocation policy. */ if (kexec_on_panic) { + /* Enable special crash kernel control page alloc policy. */ image->control_page = crashk_res.start; image->type = KEXEC_TYPE_CRASH; } + ret = sanity_check_segment_list(image); + if (ret) + goto out_free_image; + /* * Find a location for the control code buffer, and add it * the vector of segments so that it's pages will also be -- cgit v0.10.2 From 2b24692b9235cb82b6f735b7a4c4137211ddf005 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 20 Jan 2016 15:00:34 -0800 Subject: kernel/kexec_core.c: use list_for_each_entry_safe in kimage_free_page_list Use list_for_each_entry_safe() instead of list_for_each_safe() to simplify the code. 
Signed-off-by: Geliang Tang Cc: Dave Young Cc: Vivek Goyal Acked-by: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index c823f30..8dc6591 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -310,12 +310,9 @@ static void kimage_free_pages(struct page *page) void kimage_free_page_list(struct list_head *list) { - struct list_head *pos, *next; + struct page *page, *next; - list_for_each_safe(pos, next, list) { - struct page *page; - - page = list_entry(pos, struct page, lru); + list_for_each_entry_safe(page, next, list, lru) { list_del(&page->lru); kimage_free_pages(page); } -- cgit v0.10.2 From 978e30c9b46161c792ecdad0091fd017b21b8ca5 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 20 Jan 2016 15:00:36 -0800 Subject: kexec: move some members and definitions within the scope of CONFIG_KEXEC_FILE Move the stuff currently only used by the kexec file code within CONFIG_KEXEC_FILE (and CONFIG_KEXEC_VERIFY_SIG). Also move internal "struct kexec_sha_region" and "struct kexec_buf" into "kexec_internal.h". Signed-off-by: Xunlei Pang Cc: "Eric W. Biederman" Cc: Dave Young Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 819ab3f..ba7fbba 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -385,6 +385,7 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) return image->fops->cleanup(image->image_loader_data); } +#ifdef CONFIG_KEXEC_VERIFY_SIG int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel, unsigned long kernel_len) { @@ -395,6 +396,7 @@ int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel, return image->fops->verify_sig(kernel, kernel_len); } +#endif /* * Apply purgatory relocations.
diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 7b68d27..2cc643c 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -109,11 +109,7 @@ struct compat_kexec_segment { }; #endif -struct kexec_sha_region { - unsigned long start; - unsigned long len; -}; - +#ifdef CONFIG_KEXEC_FILE struct purgatory_info { /* Pointer to elf header of read only purgatory */ Elf_Ehdr *ehdr; @@ -130,6 +126,28 @@ struct purgatory_info { unsigned long purgatory_load_addr; }; +typedef int (kexec_probe_t)(const char *kernel_buf, unsigned long kernel_size); +typedef void *(kexec_load_t)(struct kimage *image, char *kernel_buf, + unsigned long kernel_len, char *initrd, + unsigned long initrd_len, char *cmdline, + unsigned long cmdline_len); +typedef int (kexec_cleanup_t)(void *loader_data); + +#ifdef CONFIG_KEXEC_VERIFY_SIG +typedef int (kexec_verify_sig_t)(const char *kernel_buf, + unsigned long kernel_len); +#endif + +struct kexec_file_ops { + kexec_probe_t *probe; + kexec_load_t *load; + kexec_cleanup_t *cleanup; +#ifdef CONFIG_KEXEC_VERIFY_SIG + kexec_verify_sig_t *verify_sig; +#endif +}; +#endif + struct kimage { kimage_entry_t head; kimage_entry_t *entry; @@ -161,6 +179,7 @@ struct kimage { struct kimage_arch arch; #endif +#ifdef CONFIG_KEXEC_FILE /* Additional fields for file based kexec syscall */ void *kernel_buf; unsigned long kernel_buf_len; @@ -179,38 +198,7 @@ struct kimage { /* Information for loading purgatory */ struct purgatory_info purgatory_info; -}; - -/* - * Keeps track of buffer parameters as provided by caller for requesting - * memory placement of buffer. 
- */ -struct kexec_buf { - struct kimage *image; - char *buffer; - unsigned long bufsz; - unsigned long mem; - unsigned long memsz; - unsigned long buf_align; - unsigned long buf_min; - unsigned long buf_max; - bool top_down; /* allocate from top of memory hole */ -}; - -typedef int (kexec_probe_t)(const char *kernel_buf, unsigned long kernel_size); -typedef void *(kexec_load_t)(struct kimage *image, char *kernel_buf, - unsigned long kernel_len, char *initrd, - unsigned long initrd_len, char *cmdline, - unsigned long cmdline_len); -typedef int (kexec_cleanup_t)(void *loader_data); -typedef int (kexec_verify_sig_t)(const char *kernel_buf, - unsigned long kernel_len); - -struct kexec_file_ops { - kexec_probe_t *probe; - kexec_load_t *load; - kexec_cleanup_t *cleanup; - kexec_verify_sig_t *verify_sig; +#endif }; /* kexec interface functions */ diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b70ada0..007b791 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -109,11 +109,13 @@ int __weak arch_kimage_file_post_load_cleanup(struct kimage *image) return -EINVAL; } +#ifdef CONFIG_KEXEC_VERIFY_SIG int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, unsigned long buf_len) { return -EKEYREJECTED; } +#endif /* Apply relocations of type RELA */ int __weak diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index e4392a6..0a52315 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -15,6 +15,27 @@ int kimage_is_destination_range(struct kimage *image, extern struct mutex kexec_mutex; #ifdef CONFIG_KEXEC_FILE +struct kexec_sha_region { + unsigned long start; + unsigned long len; +}; + +/* + * Keeps track of buffer parameters as provided by caller for requesting + * memory placement of buffer. 
+ */ +struct kexec_buf { + struct kimage *image; + char *buffer; + unsigned long bufsz; + unsigned long mem; + unsigned long memsz; + unsigned long buf_align; + unsigned long buf_min; + unsigned long buf_max; + bool top_down; /* allocate from top of memory hole */ +}; + void kimage_file_post_load_cleanup(struct kimage *image); #else /* CONFIG_KEXEC_FILE */ static inline void kimage_file_post_load_cleanup(struct kimage *image) { } -- cgit v0.10.2 From a253f1eee6c471d5418983ca9aa9c756e7db5db9 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 20 Jan 2016 15:00:39 -0800 Subject: rapidio: use kobj_to_dev() Use kobj_to_dev() instead of open-coding it. Signed-off-by: Geliang Tang Acked-by: "Bounine, Alexandre" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/rapidio/rio-sysfs.c b/drivers/rapidio/rio-sysfs.c index cdb005c..eda4156 100644 --- a/drivers/rapidio/rio-sysfs.c +++ b/drivers/rapidio/rio-sysfs.c @@ -125,8 +125,7 @@ rio_read_config(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { - struct rio_dev *dev = - to_rio_dev(container_of(kobj, struct device, kobj)); + struct rio_dev *dev = to_rio_dev(kobj_to_dev(kobj)); unsigned int size = 0x100; loff_t init_off = off; u8 *data = (u8 *) buf; @@ -197,8 +196,7 @@ rio_write_config(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { - struct rio_dev *dev = - to_rio_dev(container_of(kobj, struct device, kobj)); + struct rio_dev *dev = to_rio_dev(kobj_to_dev(kobj)); unsigned int size = count; loff_t init_off = off; u8 *data = (u8 *) buf; -- cgit v0.10.2 From a460bece027301e079b9e53c5e0f67c8e3eaebc1 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 20 Jan 2016 15:00:42 -0800 Subject: rbtree: use READ_ONCE in RB_EMPTY_ROOT With commit d72da4a4d97 ("rbtree: Make lockless searches non-fatal") our rbtrees provide weak guarantees that allows us to do lockless (and very 
speculative) reads of the tree. Such readers cannot see partial stores on nodes, ie left/right as well as root. As such, similar to the WRITE_ONCE semantics when doing rotations, use READ_ONCE when checking the root node in RB_EMPTY_ROOT. Signed-off-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Cc: Michel Lespinasse Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index a5aa7ae..b690009 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -50,7 +50,7 @@ struct rb_root { #define RB_ROOT (struct rb_root) { NULL, } #define rb_entry(ptr, type, member) container_of(ptr, type, member) -#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) +#define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL) /* 'empty' nodes are nodes that are known not to be inserted in an rbtree */ #define RB_EMPTY_NODE(node) \ -- cgit v0.10.2 From 41662f5cc55335807d39404371cfcbb1909304c4 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 20 Jan 2016 15:00:45 -0800 Subject: sysctl: enable strict writes SYSCTL_WRITES_WARN was added in commit f4aacea2f5d1 ("sysctl: allow for strict write position handling"), and released in v3.16 in August of 2014. Since then I can find only 1 instance of non-zero offset writing[1], and it was fixed immediately in CRIU[2]. As such, it appears safe to flip this to the strict state now. [1] https://www.google.com/search?q="when%20file%20position%20was%20not%200" [2] http://lists.openvz.org/pipermail/criu/2015-April/019819.html Signed-off-by: Kees Cook Cc: "Eric W. 
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 73c6b1e..a93b414 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -825,14 +825,13 @@ via the /proc/sys interface: Each write syscall must fully contain the sysctl value to be written, and multiple writes on the same sysctl file descriptor will rewrite the sysctl value, regardless of file position. - 0 - (default) Same behavior as above, but warn about processes that - perform writes to a sysctl file descriptor when the file position - is not 0. - 1 - Respect file position when writing sysctl strings. Multiple writes - will append to the sysctl value buffer. Anything past the max length - of the sysctl value buffer will be ignored. Writes to numeric sysctl - entries must always be at file position 0 and the value must be - fully contained in the buffer sent in the write syscall. + 0 - Same behavior as above, but warn about processes that perform writes + to a sysctl file descriptor when the file position is not 0. + 1 - (default) Respect file position when writing sysctl strings. Multiple + writes will append to the sysctl value buffer. Anything past the max + length of the sysctl value buffer will be ignored. Writes to numeric + sysctl entries must always be at file position 0 and the value must + be fully contained in the buffer sent in the write syscall. 
============================================================== diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c810f8a..9142036 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -173,7 +173,7 @@ extern int no_unaligned_warning; #define SYSCTL_WRITES_WARN 0 #define SYSCTL_WRITES_STRICT 1 -static int sysctl_writes_strict = SYSCTL_WRITES_WARN; +static int sysctl_writes_strict = SYSCTL_WRITES_STRICT; static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -- cgit v0.10.2 From 5c9cf8af2e77388f1da81c39237fb4f20c2f85d5 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 20 Jan 2016 15:00:48 -0800 Subject: kernel: printk: specify alignment for struct printk_log On architectures that have support for efficient unaligned access struct printk_log has 4-byte alignment. Specify alignment attribute in type declaration. The whole point of this patch is to fix a deadlock which happens when UBSAN detects an unaligned access in printk(): UBSAN then recursively calls printk() with logbuf_lock held by the top printk() call. Signed-off-by: Andrey Ryabinin Cc: Peter Zijlstra Cc: Sasha Levin Cc: Randy Dunlap Cc: Rasmus Villemoes Cc: Jonathan Corbet Cc: Michal Marek Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Yury Gribov Cc: Dmitry Vyukov Cc: Konstantin Khlebnikov Cc: Kostya Serebryany Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index e794391..c963ba5 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -233,7 +233,11 @@ struct printk_log { u8 facility; /* syslog facility */ u8 flags:5; /* internal record flags */ u8 level:3; /* syslog level */ -}; +} +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +__packed __aligned(4) +#endif +; /* * The logbuf_lock protects kmsg buffer, indices, counters.
This can be taken @@ -274,11 +278,7 @@ static u32 clear_idx; #define LOG_FACILITY(v) ((v) >> 3 & 0xff) /* record buffer */ -#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) -#define LOG_ALIGN 4 -#else #define LOG_ALIGN __alignof__(struct printk_log) -#endif #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; -- cgit v0.10.2 From 68920c973254c5b71a684645c5f6f82d6732c5d6 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 20 Jan 2016 15:00:51 -0800 Subject: net/mac80211/debugfs.c: prevent build failure with CONFIG_UBSAN=y With upcoming CONFIG_UBSAN the following BUILD_BUG_ON in net/mac80211/debugfs.c starts to trigger: BUILD_BUG_ON(hw_flag_names[NUM_IEEE80211_HW_FLAGS] != (void *)0x1); It seems, that compiler instrumentation causes some code deoptimizations. Because of that GCC is not being able to resolve condition in BUILD_BUG_ON() at compile time. We could make size of hw_flag_names array unspecified and replace the condition in BUILD_BUG_ON() with following: ARRAY_SIZE(hw_flag_names) != NUM_IEEE80211_HW_FLAGS That will have the same effect as before (adding new flag without updating array will trigger build failure) except it doesn't fail with CONFIG_UBSAN. As a bonus this patch slightly decreases size of hw_flag_names array. Signed-off-by: Andrey Ryabinin Cc: Johannes Berg Cc: "David S. 
Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index abbdff0..3e24d0d 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -91,7 +91,7 @@ static const struct file_operations reset_ops = { }; #endif -static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = { +static const char *hw_flag_names[] = { #define FLAG(F) [IEEE80211_HW_##F] = #F FLAG(HAS_RATE_CONTROL), FLAG(RX_INCLUDES_FCS), @@ -126,9 +126,6 @@ static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = { FLAG(SUPPORTS_AMSDU_IN_AMPDU), FLAG(BEACON_TX_STATUS), FLAG(NEEDS_UNIQUE_STA_ADDR), - - /* keep last for the build bug below */ - (void *)0x1 #undef FLAG }; @@ -148,7 +145,7 @@ static ssize_t hwflags_read(struct file *file, char __user *user_buf, /* fail compilation if somebody adds or removes * a flag without updating the name array above */ - BUILD_BUG_ON(hw_flag_names[NUM_IEEE80211_HW_FLAGS] != (void *)0x1); + BUILD_BUG_ON(ARRAY_SIZE(hw_flag_names) != NUM_IEEE80211_HW_FLAGS); for (i = 0; i < NUM_IEEE80211_HW_FLAGS; i++) { if (test_bit(i, local->hw.flags)) -- cgit v0.10.2 From c6d308534aef6c99904bf5862066360ae067abc4 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 20 Jan 2016 15:00:55 -0800 Subject: UBSAN: run-time undefined behavior sanity checker UBSAN uses compile-time instrumentation to catch undefined behavior (UB). Compiler inserts code that perform certain kinds of checks before operations that could cause UB. If check fails (i.e. UB detected) __ubsan_handle_* function called to print error message. So the most of the work is done by compiler. This patch just implements ubsan handlers printing errors. GCC has this capability since 4.9.x [1] (see -fsanitize=undefined option and its suboptions). However GCC 5.x has more checkers implemented [2]. Article [3] has a bit more details about UBSAN in the GCC. 
[1] - https://gcc.gnu.org/onlinedocs/gcc-4.9.0/gcc/Debugging-Options.html [2] - https://gcc.gnu.org/onlinedocs/gcc/Debugging-Options.html [3] - http://developerblog.redhat.com/2014/10/16/gcc-undefined-behavior-sanitizer-ubsan/ Issues which UBSAN has found thus far are: Found bugs: * out-of-bounds access - 97840cb67ff5 ("netfilter: nfnetlink: fix insufficient validation in nfnetlink_bind") undefined shifts: * d48458d4a768 ("jbd2: use a better hash function for the revoke table") * 10632008b9e1 ("clockevents: Prevent shift out of bounds") * 'x << -1' shift in ext4 - http://lkml.kernel.org/r/<5444EF21.8020501@samsung.com> * undefined rol32(0) - http://lkml.kernel.org/r/<1449198241-20654-1-git-send-email-sasha.levin@oracle.com> * undefined dirty_ratelimit calculation - http://lkml.kernel.org/r/<566594E2.3050306@odin.com> * undefined rounddown_pow_of_two(0) - http://lkml.kernel.org/r/<1449156616-11474-1-git-send-email-sasha.levin@oracle.com> * [WONTFIX] undefined shift in __bpf_prog_run - http://lkml.kernel.org/r/ WONTFIX here because it should be fixed in bpf program, not in kernel. signed overflows: * 32a8df4e0b33f ("sched: Fix odd values in effective_load() calculations") * mul overflow in ntp - http://lkml.kernel.org/r/<1449175608-1146-1-git-send-email-sasha.levin@oracle.com> * incorrect conversion into rtc_time in rtc_time64_to_tm() - http://lkml.kernel.org/r/<1449187944-11730-1-git-send-email-sasha.levin@oracle.com> * unvalidated timespec in io_getevents() - http://lkml.kernel.org/r/ * [NOTABUG] signed overflow in ktime_add_safe() - http://lkml.kernel.org/r/ [akpm@linux-foundation.org: fix unused local warning] [akpm@linux-foundation.org: fix __int128 build woes] Signed-off-by: Andrey Ryabinin Cc: Peter Zijlstra Cc: Sasha Levin Cc: Randy Dunlap Cc: Rasmus Villemoes Cc: Jonathan Corbet Cc: Michal Marek Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H.
Peter Anvin" Cc: Yury Gribov Cc: Dmitry Vyukov Cc: Konstantin Khlebnikov Cc: Kostya Serebryany Cc: Johannes Berg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/ubsan.txt b/Documentation/ubsan.txt new file mode 100644 index 0000000..f58215e --- /dev/null +++ b/Documentation/ubsan.txt @@ -0,0 +1,84 @@ +Undefined Behavior Sanitizer - UBSAN + +Overview +-------- + +UBSAN is a runtime undefined behaviour checker. + +UBSAN uses compile-time instrumentation to catch undefined behavior (UB). +Compiler inserts code that perform certain kinds of checks before operations +that may cause UB. If check fails (i.e. UB detected) __ubsan_handle_* +function called to print error message. + +GCC has that feature since 4.9.x [1] (see -fsanitize=undefined option and +its suboptions). GCC 5.x has more checkers implemented [2]. + +Report example +--------------- + + ================================================================================ + UBSAN: Undefined behaviour in ../include/linux/bitops.h:110:33 + shift exponent 32 is to large for 32-bit type 'unsigned int' + CPU: 0 PID: 0 Comm: swapper Not tainted 4.4.0-rc1+ #26 + 0000000000000000 ffffffff82403cc8 ffffffff815e6cd6 0000000000000001 + ffffffff82403cf8 ffffffff82403ce0 ffffffff8163a5ed 0000000000000020 + ffffffff82403d78 ffffffff8163ac2b ffffffff815f0001 0000000000000002 + Call Trace: + [] dump_stack+0x45/0x5f + [] ubsan_epilogue+0xd/0x40 + [] __ubsan_handle_shift_out_of_bounds+0xeb/0x130 + [] ? radix_tree_gang_lookup_slot+0x51/0x150 + [] _mix_pool_bytes+0x1e6/0x480 + [] ? dmi_walk_early+0x48/0x5c + [] add_device_randomness+0x61/0x130 + [] ? dmi_save_one_device+0xaa/0xaa + [] dmi_walk_early+0x48/0x5c + [] dmi_scan_machine+0x278/0x4b4 + [] ? vprintk_default+0x1a/0x20 + [] ? early_idt_handler_array+0x120/0x120 + [] setup_arch+0x405/0xc2c + [] ? early_idt_handler_array+0x120/0x120 + [] start_kernel+0x83/0x49a + [] ? 
early_idt_handler_array+0x120/0x120 + [] x86_64_start_reservations+0x2a/0x2c + [] x86_64_start_kernel+0x16b/0x17a + ================================================================================ + +Usage +----- + +To enable UBSAN configure kernel with: + + CONFIG_UBSAN=y + +and to check the entire kernel: + + CONFIG_UBSAN_SANITIZE_ALL=y + +To enable instrumentation for specific files or directories, add a line +similar to the following to the respective kernel Makefile: + + For a single file (e.g. main.o): + UBSAN_SANITIZE_main.o := y + + For all files in one directory: + UBSAN_SANITIZE := y + +To exclude files from being instrumented even if +CONFIG_UBSAN_SANITIZE_ALL=y, use: + + UBSAN_SANITIZE_main.o := n + and: + UBSAN_SANITIZE := n + +Detection of unaligned accesses controlled through the separate option - +CONFIG_UBSAN_ALIGNMENT. It's off by default on architectures that support +unaligned accesses (CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y). One could +still enable it in config, just note that it will produce a lot of UBSAN +reports. 
+ +References +---------- + +[1] - https://gcc.gnu.org/onlinedocs/gcc-4.9.0/gcc/Debugging-Options.html +[2] - https://gcc.gnu.org/onlinedocs/gcc/Debugging-Options.html diff --git a/Makefile b/Makefile index 7f4ac1e..abfb3e8 100644 --- a/Makefile +++ b/Makefile @@ -411,7 +411,7 @@ export MAKE AWK GENKSYMS INSTALLKERNEL PERL PYTHON UTS_MACHINE export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS -export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV CFLAGS_KASAN +export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV CFLAGS_KASAN CFLAGS_UBSAN export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL @@ -784,6 +784,7 @@ endif include scripts/Makefile.kasan include scripts/Makefile.extrawarn +include scripts/Makefile.ubsan # Add any arch overrides and user supplied CPPFLAGS, AFLAGS and CFLAGS as the # last assignments diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4a10ba9..92b2a73 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -31,6 +31,7 @@ config X86 select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_MMIO_FLUSH select ARCH_HAS_SG_CHAIN + select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 2ee62db..bbe1a62 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -60,6 +60,7 @@ clean-files += cpustr.h KBUILD_CFLAGS := $(USERINCLUDE) $(REALMODE_CFLAGS) -D_SETUP KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n +UBSAN_SANITIZE := n $(obj)/bzImage: asflags-y := $(SVGA_MODE) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 0a291cd..f9ce75d 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -33,6 
+33,7 @@ KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector) KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n +UBSAN_SANITIZE :=n LDFLAGS := -m elf_$(UTS_MACHINE) LDFLAGS_vmlinux := -T diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 265c0ed..c854541 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -4,6 +4,7 @@ KBUILD_CFLAGS += $(DISABLE_LTO) KASAN_SANITIZE := n +UBSAN_SANITIZE := n VDSO64-$(CONFIG_X86_64) := y VDSOX32-$(CONFIG_X86_X32_ABI) := y diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 2730d77..3e75fcf 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -70,3 +70,4 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) $(REALMODE_CFLAGS) -D_SETUP -D_WAKEUP \ -I$(srctree)/arch/x86/boot KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n +UBSAN_SANITIZE := n diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index 9c12e18..aaf9c0b 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -22,6 +22,7 @@ KBUILD_CFLAGS := $(cflags-y) -DDISABLE_BRANCH_PROFILING \ GCOV_PROFILE := n KASAN_SANITIZE := n +UBSAN_SANITIZE := n lib-y := efi-stub-helper.o diff --git a/include/linux/sched.h b/include/linux/sched.h index 61aa9bb..02dabf2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1643,6 +1643,9 @@ struct task_struct { struct held_lock held_locks[MAX_LOCK_DEPTH]; gfp_t lockdep_reclaim_gfp; #endif +#ifdef CONFIG_UBSAN + unsigned int in_ubsan; +#endif /* journalling filesystem info */ void *journal_info; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f75a33f..157220b 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1893,6 +1893,8 @@ source "samples/Kconfig" source "lib/Kconfig.kgdb" +source "lib/Kconfig.ubsan" + config ARCH_HAS_DEVMEM_IS_ALLOWED bool diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan new file mode 
100644 index 0000000..49518fb --- /dev/null +++ b/lib/Kconfig.ubsan @@ -0,0 +1,29 @@ +config ARCH_HAS_UBSAN_SANITIZE_ALL + bool + +config UBSAN + bool "Undefined behaviour sanity checker" + help + This option enables undefined behaviour sanity checker + Compile-time instrumentation is used to detect various undefined + behaviours in runtime. Various types of checks may be enabled + via boot parameter ubsan_handle (see: Documentation/ubsan.txt). + +config UBSAN_SANITIZE_ALL + bool "Enable instrumentation for the entire kernel" + depends on UBSAN + depends on ARCH_HAS_UBSAN_SANITIZE_ALL + default y + help + This option activates instrumentation for the entire kernel. + If you don't enable this option, you have to explicitly specify + UBSAN_SANITIZE := y for the files/directories you want to check for UB. + +config UBSAN_ALIGNMENT + bool "Enable checking of pointers alignment" + depends on UBSAN + default y if !HAVE_EFFICIENT_UNALIGNED_ACCESS + help + This option enables detection of unaligned memory accesses. + Enabling this option on architectures that support unalligned + accesses may produce a lot of false positives. diff --git a/lib/Makefile b/lib/Makefile index b2a82e6..2d4bc33 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -209,3 +209,6 @@ quiet_cmd_build_OID_registry = GEN $@ clean-files += oid_registry_data.c obj-$(CONFIG_UCS2_STRING) += ucs2_string.o +obj-$(CONFIG_UBSAN) += ubsan.o + +UBSAN_SANITIZE_ubsan.o := n diff --git a/lib/ubsan.c b/lib/ubsan.c new file mode 100644 index 0000000..8799ae5 --- /dev/null +++ b/lib/ubsan.c @@ -0,0 +1,456 @@ +/* + * UBSAN error reporting functions + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. + * Author: Andrey Ryabinin + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "ubsan.h" + +const char *type_check_kinds[] = { + "load of", + "store to", + "reference binding to", + "member access within", + "member call on", + "constructor call on", + "downcast of", + "downcast of" +}; + +#define REPORTED_BIT 31 + +#if (BITS_PER_LONG == 64) && defined(__BIG_ENDIAN) +#define COLUMN_MASK (~(1U << REPORTED_BIT)) +#define LINE_MASK (~0U) +#else +#define COLUMN_MASK (~0U) +#define LINE_MASK (~(1U << REPORTED_BIT)) +#endif + +#define VALUE_LENGTH 40 + +static bool was_reported(struct source_location *location) +{ + return test_and_set_bit(REPORTED_BIT, &location->reported); +} + +static void print_source_location(const char *prefix, + struct source_location *loc) +{ + pr_err("%s %s:%d:%d\n", prefix, loc->file_name, + loc->line & LINE_MASK, loc->column & COLUMN_MASK); +} + +static bool suppress_report(struct source_location *loc) +{ + return current->in_ubsan || was_reported(loc); +} + +static bool type_is_int(struct type_descriptor *type) +{ + return type->type_kind == type_kind_int; +} + +static bool type_is_signed(struct type_descriptor *type) +{ + WARN_ON(!type_is_int(type)); + return type->type_info & 1; +} + +static unsigned type_bit_width(struct type_descriptor *type) +{ + return 1 << (type->type_info >> 1); +} + +static bool is_inline_int(struct type_descriptor *type) +{ + unsigned inline_bits = sizeof(unsigned long)*8; + unsigned bits = type_bit_width(type); + + WARN_ON(!type_is_int(type)); + + return bits <= inline_bits; +} + +static s_max get_signed_val(struct type_descriptor *type, unsigned long val) +{ + if (is_inline_int(type)) { + unsigned extra_bits = sizeof(s_max)*8 - type_bit_width(type); + return ((s_max)val) << extra_bits >> extra_bits; + } + + if (type_bit_width(type) == 64) + return *(s64 *)val; + + return *(s_max *)val; +} + +static bool val_is_negative(struct type_descriptor *type, unsigned long val) +{ + return type_is_signed(type) 
&& get_signed_val(type, val) < 0; +} + +static u_max get_unsigned_val(struct type_descriptor *type, unsigned long val) +{ + if (is_inline_int(type)) + return val; + + if (type_bit_width(type) == 64) + return *(u64 *)val; + + return *(u_max *)val; +} + +static void val_to_string(char *str, size_t size, struct type_descriptor *type, + unsigned long value) +{ + if (type_is_int(type)) { + if (type_bit_width(type) == 128) { +#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__) + u_max val = get_unsigned_val(type, value); + + scnprintf(str, size, "0x%08x%08x%08x%08x", + (u32)(val >> 96), + (u32)(val >> 64), + (u32)(val >> 32), + (u32)(val)); +#else + WARN_ON(1); +#endif + } else if (type_is_signed(type)) { + scnprintf(str, size, "%lld", + (s64)get_signed_val(type, value)); + } else { + scnprintf(str, size, "%llu", + (u64)get_unsigned_val(type, value)); + } + } +} + +static bool location_is_valid(struct source_location *loc) +{ + return loc->file_name != NULL; +} + +static DEFINE_SPINLOCK(report_lock); + +static void ubsan_prologue(struct source_location *location, + unsigned long *flags) +{ + current->in_ubsan++; + spin_lock_irqsave(&report_lock, *flags); + + pr_err("========================================" + "========================================\n"); + print_source_location("UBSAN: Undefined behaviour in", location); +} + +static void ubsan_epilogue(unsigned long *flags) +{ + dump_stack(); + pr_err("========================================" + "========================================\n"); + spin_unlock_irqrestore(&report_lock, *flags); + current->in_ubsan--; +} + +static void handle_overflow(struct overflow_data *data, unsigned long lhs, + unsigned long rhs, char op) +{ + + struct type_descriptor *type = data->type; + unsigned long flags; + char lhs_val_str[VALUE_LENGTH]; + char rhs_val_str[VALUE_LENGTH]; + + if (suppress_report(&data->location)) + return; + + ubsan_prologue(&data->location, &flags); + + val_to_string(lhs_val_str, 
sizeof(lhs_val_str), type, lhs); + val_to_string(rhs_val_str, sizeof(rhs_val_str), type, rhs); + pr_err("%s integer overflow:\n", + type_is_signed(type) ? "signed" : "unsigned"); + pr_err("%s %c %s cannot be represented in type %s\n", + lhs_val_str, + op, + rhs_val_str, + type->type_name); + + ubsan_epilogue(&flags); +} + +void __ubsan_handle_add_overflow(struct overflow_data *data, + unsigned long lhs, + unsigned long rhs) +{ + + handle_overflow(data, lhs, rhs, '+'); +} +EXPORT_SYMBOL(__ubsan_handle_add_overflow); + +void __ubsan_handle_sub_overflow(struct overflow_data *data, + unsigned long lhs, + unsigned long rhs) +{ + handle_overflow(data, lhs, rhs, '-'); +} +EXPORT_SYMBOL(__ubsan_handle_sub_overflow); + +void __ubsan_handle_mul_overflow(struct overflow_data *data, + unsigned long lhs, + unsigned long rhs) +{ + handle_overflow(data, lhs, rhs, '*'); +} +EXPORT_SYMBOL(__ubsan_handle_mul_overflow); + +void __ubsan_handle_negate_overflow(struct overflow_data *data, + unsigned long old_val) +{ + unsigned long flags; + char old_val_str[VALUE_LENGTH]; + + if (suppress_report(&data->location)) + return; + + ubsan_prologue(&data->location, &flags); + + val_to_string(old_val_str, sizeof(old_val_str), data->type, old_val); + + pr_err("negation of %s cannot be represented in type %s:\n", + old_val_str, data->type->type_name); + + ubsan_epilogue(&flags); +} +EXPORT_SYMBOL(__ubsan_handle_negate_overflow); + + +void __ubsan_handle_divrem_overflow(struct overflow_data *data, + unsigned long lhs, + unsigned long rhs) +{ + unsigned long flags; + char rhs_val_str[VALUE_LENGTH]; + + if (suppress_report(&data->location)) + return; + + ubsan_prologue(&data->location, &flags); + + val_to_string(rhs_val_str, sizeof(rhs_val_str), data->type, rhs); + + if (type_is_signed(data->type) && get_signed_val(data->type, rhs) == -1) + pr_err("division of %s by -1 cannot be represented in type %s\n", + rhs_val_str, data->type->type_name); + else + pr_err("division by zero\n"); + + 
ubsan_epilogue(&flags); +} +EXPORT_SYMBOL(__ubsan_handle_divrem_overflow); + +static void handle_null_ptr_deref(struct type_mismatch_data *data) +{ + unsigned long flags; + + if (suppress_report(&data->location)) + return; + + ubsan_prologue(&data->location, &flags); + + pr_err("%s null pointer of type %s\n", + type_check_kinds[data->type_check_kind], + data->type->type_name); + + ubsan_epilogue(&flags); +} + +static void handle_missaligned_access(struct type_mismatch_data *data, + unsigned long ptr) +{ + unsigned long flags; + + if (suppress_report(&data->location)) + return; + + ubsan_prologue(&data->location, &flags); + + pr_err("%s misaligned address %p for type %s\n", + type_check_kinds[data->type_check_kind], + (void *)ptr, data->type->type_name); + pr_err("which requires %ld byte alignment\n", data->alignment); + + ubsan_epilogue(&flags); +} + +static void handle_object_size_mismatch(struct type_mismatch_data *data, + unsigned long ptr) +{ + unsigned long flags; + + if (suppress_report(&data->location)) + return; + + ubsan_prologue(&data->location, &flags); + pr_err("%s address %pk with insufficient space\n", + type_check_kinds[data->type_check_kind], + (void *) ptr); + pr_err("for an object of type %s\n", data->type->type_name); + ubsan_epilogue(&flags); +} + +void __ubsan_handle_type_mismatch(struct type_mismatch_data *data, + unsigned long ptr) +{ + + if (!ptr) + handle_null_ptr_deref(data); + else if (data->alignment && !IS_ALIGNED(ptr, data->alignment)) + handle_missaligned_access(data, ptr); + else + handle_object_size_mismatch(data, ptr); +} +EXPORT_SYMBOL(__ubsan_handle_type_mismatch); + +void __ubsan_handle_nonnull_return(struct nonnull_return_data *data) +{ + unsigned long flags; + + if (suppress_report(&data->location)) + return; + + ubsan_prologue(&data->location, &flags); + + pr_err("null pointer returned from function declared to never return null\n"); + + if (location_is_valid(&data->attr_location)) + print_source_location("returns_nonnull 
attribute specified in", + &data->attr_location); + + ubsan_epilogue(&flags); +} +EXPORT_SYMBOL(__ubsan_handle_nonnull_return); + +void __ubsan_handle_vla_bound_not_positive(struct vla_bound_data *data, + unsigned long bound) +{ + unsigned long flags; + char bound_str[VALUE_LENGTH]; + + if (suppress_report(&data->location)) + return; + + ubsan_prologue(&data->location, &flags); + + val_to_string(bound_str, sizeof(bound_str), data->type, bound); + pr_err("variable length array bound value %s <= 0\n", bound_str); + + ubsan_epilogue(&flags); +} +EXPORT_SYMBOL(__ubsan_handle_vla_bound_not_positive); + +void __ubsan_handle_out_of_bounds(struct out_of_bounds_data *data, + unsigned long index) +{ + unsigned long flags; + char index_str[VALUE_LENGTH]; + + if (suppress_report(&data->location)) + return; + + ubsan_prologue(&data->location, &flags); + + val_to_string(index_str, sizeof(index_str), data->index_type, index); + pr_err("index %s is out of range for type %s\n", index_str, + data->array_type->type_name); + ubsan_epilogue(&flags); +} +EXPORT_SYMBOL(__ubsan_handle_out_of_bounds); + +void __ubsan_handle_shift_out_of_bounds(struct shift_out_of_bounds_data *data, + unsigned long lhs, unsigned long rhs) +{ + unsigned long flags; + struct type_descriptor *rhs_type = data->rhs_type; + struct type_descriptor *lhs_type = data->lhs_type; + char rhs_str[VALUE_LENGTH]; + char lhs_str[VALUE_LENGTH]; + + if (suppress_report(&data->location)) + return; + + ubsan_prologue(&data->location, &flags); + + val_to_string(rhs_str, sizeof(rhs_str), rhs_type, rhs); + val_to_string(lhs_str, sizeof(lhs_str), lhs_type, lhs); + + if (val_is_negative(rhs_type, rhs)) + pr_err("shift exponent %s is negative\n", rhs_str); + + else if (get_unsigned_val(rhs_type, rhs) >= + type_bit_width(lhs_type)) + pr_err("shift exponent %s is too large for %u-bit type %s\n", + rhs_str, + type_bit_width(lhs_type), + lhs_type->type_name); + else if (val_is_negative(lhs_type, lhs)) + pr_err("left shift of negative 
value %s\n", + lhs_str); + else + pr_err("left shift of %s by %s places cannot be" + " represented in type %s\n", + lhs_str, rhs_str, + lhs_type->type_name); + + ubsan_epilogue(&flags); +} +EXPORT_SYMBOL(__ubsan_handle_shift_out_of_bounds); + + +void __noreturn +__ubsan_handle_builtin_unreachable(struct unreachable_data *data) +{ + unsigned long flags; + + ubsan_prologue(&data->location, &flags); + pr_err("calling __builtin_unreachable()\n"); + ubsan_epilogue(&flags); + panic("can't return from __builtin_unreachable()"); +} +EXPORT_SYMBOL(__ubsan_handle_builtin_unreachable); + +void __ubsan_handle_load_invalid_value(struct invalid_value_data *data, + unsigned long val) +{ + unsigned long flags; + char val_str[VALUE_LENGTH]; + + if (suppress_report(&data->location)) + return; + + ubsan_prologue(&data->location, &flags); + + val_to_string(val_str, sizeof(val_str), data->type, val); + + pr_err("load of value %s is not a valid value for type %s\n", + val_str, data->type->type_name); + + ubsan_epilogue(&flags); +} +EXPORT_SYMBOL(__ubsan_handle_load_invalid_value); diff --git a/lib/ubsan.h b/lib/ubsan.h new file mode 100644 index 0000000..b2d18d4 --- /dev/null +++ b/lib/ubsan.h @@ -0,0 +1,84 @@ +#ifndef _LIB_UBSAN_H +#define _LIB_UBSAN_H + +enum { + type_kind_int = 0, + type_kind_float = 1, + type_unknown = 0xffff +}; + +struct type_descriptor { + u16 type_kind; + u16 type_info; + char type_name[1]; +}; + +struct source_location { + const char *file_name; + union { + unsigned long reported; + struct { + u32 line; + u32 column; + }; + }; +}; + +struct overflow_data { + struct source_location location; + struct type_descriptor *type; +}; + +struct type_mismatch_data { + struct source_location location; + struct type_descriptor *type; + unsigned long alignment; + unsigned char type_check_kind; +}; + +struct nonnull_arg_data { + struct source_location location; + struct source_location attr_location; + int arg_index; +}; + +struct nonnull_return_data { + struct 
source_location location; + struct source_location attr_location; +}; + +struct vla_bound_data { + struct source_location location; + struct type_descriptor *type; +}; + +struct out_of_bounds_data { + struct source_location location; + struct type_descriptor *array_type; + struct type_descriptor *index_type; +}; + +struct shift_out_of_bounds_data { + struct source_location location; + struct type_descriptor *lhs_type; + struct type_descriptor *rhs_type; +}; + +struct unreachable_data { + struct source_location location; +}; + +struct invalid_value_data { + struct source_location location; + struct type_descriptor *type; +}; + +#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__) +typedef __int128 s_max; +typedef unsigned __int128 u_max; +#else +typedef s64 s_max; +typedef u64 u_max; +#endif + +#endif diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 6471014..a61460d 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -1,4 +1,5 @@ KASAN_SANITIZE := n +UBSAN_SANITIZE_kasan.o := n CFLAGS_REMOVE_kasan.o = -pg # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 39d6bb1..2edbcad 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -130,6 +130,12 @@ _c_flags += $(if $(patsubst n%,, \ $(CFLAGS_KASAN)) endif +ifeq ($(CONFIG_UBSAN),y) +_c_flags += $(if $(patsubst n%,, \ + $(UBSAN_SANITIZE_$(basetarget).o)$(UBSAN_SANITIZE)$(CONFIG_UBSAN_SANITIZE_ALL)), \ + $(CFLAGS_UBSAN)) +endif + # If building the kernel in a separate objtree expand all occurrences # of -Idir to -I$(srctree)/dir except for absolute paths (starting with '/'). 
diff --git a/scripts/Makefile.ubsan b/scripts/Makefile.ubsan new file mode 100644 index 0000000..8ab6867 --- /dev/null +++ b/scripts/Makefile.ubsan @@ -0,0 +1,17 @@ +ifdef CONFIG_UBSAN + CFLAGS_UBSAN += $(call cc-option, -fsanitize=shift) + CFLAGS_UBSAN += $(call cc-option, -fsanitize=integer-divide-by-zero) + CFLAGS_UBSAN += $(call cc-option, -fsanitize=unreachable) + CFLAGS_UBSAN += $(call cc-option, -fsanitize=vla-bound) + CFLAGS_UBSAN += $(call cc-option, -fsanitize=null) + CFLAGS_UBSAN += $(call cc-option, -fsanitize=signed-integer-overflow) + CFLAGS_UBSAN += $(call cc-option, -fsanitize=bounds) + CFLAGS_UBSAN += $(call cc-option, -fsanitize=object-size) + CFLAGS_UBSAN += $(call cc-option, -fsanitize=returns-nonnull-attribute) + CFLAGS_UBSAN += $(call cc-option, -fsanitize=bool) + CFLAGS_UBSAN += $(call cc-option, -fsanitize=enum) + +ifdef CONFIG_UBSAN_ALIGNMENT + CFLAGS_UBSAN += $(call cc-option, -fsanitize=alignment) +endif +endif -- cgit v0.10.2 From bf76f73c5f6554df1bd337aea5b3ea561f09632c Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Wed, 20 Jan 2016 15:00:58 -0800 Subject: powerpc: enable UBSAN support This hooks up UBSAN support for PowerPC. So far it's found some interesting cases where we don't properly sanitise input to shifts, including one in our futex handling. Nothing critical, but interesting and worth fixing. 
[valentinrothberg@gmail.com: arch/powerpc/Kconfig: fix typo in select statement] Signed-off-by: Daniel Axtens Cc: Andrey Ryabinin Cc: Benjamin Herrenschmidt Tested-by: Andrew Donnellan Acked-by: Michael Ellerman Signed-off-by: Valentin Rothberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 94f6c50..8310be4 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -158,6 +158,7 @@ config PPC select ARCH_HAS_DMA_SET_COHERENT_MASK select ARCH_HAS_DEVMEM_IS_ALLOWED select HAVE_ARCH_SECCOMP_FILTER + select ARCH_HAS_UBSAN_SANITIZE_ALL config GENERIC_CSUM def_bool CPU_LITTLE_ENDIAN diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index ba33693..794f22a 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -136,12 +136,18 @@ endif obj-$(CONFIG_EPAPR_PARAVIRT) += epapr_paravirt.o epapr_hcalls.o obj-$(CONFIG_KVM_GUEST) += kvm.o kvm_emul.o -# Disable GCOV in odd or sensitive code +# Disable GCOV & sanitizers in odd or sensitive code GCOV_PROFILE_prom_init.o := n +UBSAN_SANITIZE_prom_init.o := n GCOV_PROFILE_ftrace.o := n +UBSAN_SANITIZE_ftrace.o := n GCOV_PROFILE_machine_kexec_64.o := n +UBSAN_SANITIZE_machine_kexec_64.o := n GCOV_PROFILE_machine_kexec_32.o := n +UBSAN_SANITIZE_machine_kexec_32.o := n GCOV_PROFILE_kprobes.o := n +UBSAN_SANITIZE_kprobes.o := n +UBSAN_SANITIZE_vdso.o := n extra-$(CONFIG_PPC_FPU) += fpu.o extra-$(CONFIG_ALTIVEC) += vector.o diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile index 6abffb7..cbabd14 100644 --- a/arch/powerpc/kernel/vdso32/Makefile +++ b/arch/powerpc/kernel/vdso32/Makefile @@ -15,6 +15,7 @@ targets := $(obj-vdso32) vdso32.so vdso32.so.dbg obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32)) GCOV_PROFILE := n +UBSAN_SANITIZE := n ccflags-y := -shared -fno-common -fno-builtin ccflags-y += -nostdlib -Wl,-soname=linux-vdso32.so.1 \ diff --git 
a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile index 8c8f2ae..c710802 100644 --- a/arch/powerpc/kernel/vdso64/Makefile +++ b/arch/powerpc/kernel/vdso64/Makefile @@ -8,6 +8,7 @@ targets := $(obj-vdso64) vdso64.so vdso64.so.dbg obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64)) GCOV_PROFILE := n +UBSAN_SANITIZE := n ccflags-y := -shared -fno-common -fno-builtin ccflags-y += -nostdlib -Wl,-soname=linux-vdso64.so.1 \ diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile index 1278788..436062d 100644 --- a/arch/powerpc/xmon/Makefile +++ b/arch/powerpc/xmon/Makefile @@ -3,6 +3,7 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror GCOV_PROFILE := n +UBSAN_SANITIZE := n ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) -- cgit v0.10.2 From ddf1d398e517e660207e2c807f76a90df543a217 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 20 Jan 2016 15:01:02 -0800 Subject: prctl: take mmap sem for writing to protect against others An unprivileged user can trigger an oops on a kernel with CONFIG_CHECKPOINT_RESTORE. proc_pid_cmdline_read takes mmap_sem for reading and obtains args + env start/end values. These get sanity checked as follows: BUG_ON(arg_start > arg_end); BUG_ON(env_start > env_end); These can be changed by prctl_set_mm. Turns out it also takes the semaphore for reading, effectively rendering it useless. This results in: kernel BUG at fs/proc/base.c:240! 
invalid opcode: 0000 [#1] SMP Modules linked in: virtio_net CPU: 0 PID: 925 Comm: a.out Not tainted 4.4.0-rc8-next-20160105dupa+ #71 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 task: ffff880077a68000 ti: ffff8800784d0000 task.ti: ffff8800784d0000 RIP: proc_pid_cmdline_read+0x520/0x530 RSP: 0018:ffff8800784d3db8 EFLAGS: 00010206 RAX: ffff880077c5b6b0 RBX: ffff8800784d3f18 RCX: 0000000000000000 RDX: 0000000000000002 RSI: 00007f78e8857000 RDI: 0000000000000246 RBP: ffff8800784d3e40 R08: 0000000000000008 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000050 R13: 00007f78e8857800 R14: ffff88006fcef000 R15: ffff880077c5b600 FS: 00007f78e884a740(0000) GS:ffff88007b200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007f78e8361770 CR3: 00000000790a5000 CR4: 00000000000006f0 Call Trace: __vfs_read+0x37/0x100 vfs_read+0x82/0x130 SyS_read+0x58/0xd0 entry_SYSCALL_64_fastpath+0x12/0x76 Code: 4c 8b 7d a8 eb e9 48 8b 9d 78 ff ff ff 4c 8b 7d 90 48 8b 03 48 39 45 a8 0f 87 f0 fe ff ff e9 d1 fe ff ff 4c 8b 7d 90 eb c6 0f 0b <0f> 0b 0f 0b 66 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 RIP proc_pid_cmdline_read+0x520/0x530 ---[ end trace 97882617ae9c6818 ]--- Turns out there are instances where the code just reads aforementioned values without locking whatsoever - namely environ_read and get_cmdline. Interestingly these functions look quite resilient against bogus values, but I don't believe this should be relied upon. The first patch gets rid of the oops bug by grabbing mmap_sem for writing. The second patch is optional and puts locking around aforementioned consumers for safety. Consumers of other fields don't seem to benefit from similar treatment and are left untouched. This patch (of 2): The code was taking the semaphore for reading, which does not protect against readers nor concurrent modifications. The problem could cause sanity checks to fail in procfs's cmdline reader, resulting in an OOPS. 
Note that some functions perform an unlocked read of various mm fields, but they seem to be fine despite possible modification. Signed-off-by: Mateusz Guzik Acked-by: Cyrill Gorcunov Cc: Alexey Dobriyan Cc: Jarod Wilson Cc: Jan Stancek Cc: Al Viro Cc: Anshuman Khandual Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/sys.c b/kernel/sys.c index 6af9212..78947de 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1853,11 +1853,13 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; } - if (prctl_map.exe_fd != (u32)-1) + if (prctl_map.exe_fd != (u32)-1) { error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); - down_read(&mm->mmap_sem); - if (error) - goto out; + if (error) + return error; + } + + down_write(&mm->mmap_sem); /* * We don't validate if these members are pointing to @@ -1894,10 +1896,8 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (prctl_map.auxv_size) memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); - error = 0; -out: - up_read(&mm->mmap_sem); - return error; + up_write(&mm->mmap_sem); + return 0; } #endif /* CONFIG_CHECKPOINT_RESTORE */ @@ -1963,7 +1963,7 @@ static int prctl_set_mm(int opt, unsigned long addr, error = -EINVAL; - down_read(&mm->mmap_sem); + down_write(&mm->mmap_sem); vma = find_vma(mm, addr); prctl_map.start_code = mm->start_code; @@ -2056,7 +2056,7 @@ static int prctl_set_mm(int opt, unsigned long addr, error = 0; out: - up_read(&mm->mmap_sem); + up_write(&mm->mmap_sem); return error; } -- cgit v0.10.2 From a3b609ef9f8b1dbfe97034ccad6cd3fe71fbe7ab Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 20 Jan 2016 15:01:05 -0800 Subject: proc read mm's {arg,env}_{start,end} with mmap semaphore taken. Only functions doing more than one read are modified. Consumers happened to deal with possibly changing data, but it does not seem like a good thing to rely on. 
Signed-off-by: Mateusz Guzik Acked-by: Cyrill Gorcunov Cc: Alexey Dobriyan Cc: Jarod Wilson Cc: Jan Stancek Cc: Al Viro Cc: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/proc/base.c b/fs/proc/base.c index e665097..4f764c2 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -953,6 +953,7 @@ static ssize_t environ_read(struct file *file, char __user *buf, unsigned long src = *ppos; int ret = 0; struct mm_struct *mm = file->private_data; + unsigned long env_start, env_end; if (!mm) return 0; @@ -964,19 +965,25 @@ static ssize_t environ_read(struct file *file, char __user *buf, ret = 0; if (!atomic_inc_not_zero(&mm->mm_users)) goto free; + + down_read(&mm->mmap_sem); + env_start = mm->env_start; + env_end = mm->env_end; + up_read(&mm->mmap_sem); + while (count > 0) { size_t this_len, max_len; int retval; - if (src >= (mm->env_end - mm->env_start)) + if (src >= (env_end - env_start)) break; - this_len = mm->env_end - (mm->env_start + src); + this_len = env_end - (env_start + src); max_len = min_t(size_t, PAGE_SIZE, count); this_len = min(max_len, this_len); - retval = access_remote_vm(mm, (mm->env_start + src), + retval = access_remote_vm(mm, (env_start + src), page, this_len, 0); if (retval <= 0) { diff --git a/mm/util.c b/mm/util.c index 6d1f920..c108a65 100644 --- a/mm/util.c +++ b/mm/util.c @@ -476,17 +476,25 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen) int res = 0; unsigned int len; struct mm_struct *mm = get_task_mm(task); + unsigned long arg_start, arg_end, env_start, env_end; if (!mm) goto out; if (!mm->arg_end) goto out_mm; /* Shh! 
No looking before we're done */ - len = mm->arg_end - mm->arg_start; + down_read(&mm->mmap_sem); + arg_start = mm->arg_start; + arg_end = mm->arg_end; + env_start = mm->env_start; + env_end = mm->env_end; + up_read(&mm->mmap_sem); + + len = arg_end - arg_start; if (len > buflen) len = buflen; - res = access_process_vm(task, mm->arg_start, buffer, len, 0); + res = access_process_vm(task, arg_start, buffer, len, 0); /* * If the nul at the end of args has been overwritten, then @@ -497,10 +505,10 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen) if (len < res) { res = len; } else { - len = mm->env_end - mm->env_start; + len = env_end - env_start; if (len > buflen - res) len = buflen - res; - res += access_process_vm(task, mm->env_start, + res += access_process_vm(task, env_start, buffer+res, len, 0); res = strnlen(buffer, res); } -- cgit v0.10.2 From 06af1c52c9ea234e0b1266cc0b52c3e0c6c8fe9f Mon Sep 17 00:00:00 2001 From: Bongkyu Kim Date: Wed, 20 Jan 2016 15:01:08 -0800 Subject: lz4: fix wrong compress buffer size for 64-bits The current lz4 compress buffer is 16kb on 32-bits, 32kb on 64-bits system. But, lz4 needs only 16kb on both. On 64-bits, this causes wasted cpu cycles for additional memset during every compression. In case of lz4hc, the current buffer size is (256kb + 8) on 32-bits, (512kb + 16) on 64-bits. But, lz4hc needs only (256kb + 2 * pointer) on both. This patch fixes these wrong compress buffer sizes for 64-bits. Signed-off-by: Bongkyu Kim Cc: Chanho Min Cc: Yann Collet Cc: Kyungsik Lee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/lz4.h b/include/linux/lz4.h index 4356686..6b784c5 100644 --- a/include/linux/lz4.h +++ b/include/linux/lz4.h @@ -9,8 +9,8 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
*/ -#define LZ4_MEM_COMPRESS (4096 * sizeof(unsigned char *)) -#define LZ4HC_MEM_COMPRESS (65538 * sizeof(unsigned char *)) +#define LZ4_MEM_COMPRESS (16384) +#define LZ4HC_MEM_COMPRESS (262144 + (2 * sizeof(unsigned char *))) /* * lz4_compressbound() -- cgit v0.10.2 From 2954e440be7305134be632a94536b412899490f7 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Wed, 20 Jan 2016 15:01:11 -0800 Subject: ipc/shm.c: is_file_shm_hugepages() can be boolean Make is_file_shm_hugepages() return bool to improve readability due to this particular function only using either one or zero as its return value. No functional change. Signed-off-by: Yaowei Bai Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/shm.h b/include/linux/shm.h index 6fb8016..04e8818 100644 --- a/include/linux/shm.h +++ b/include/linux/shm.h @@ -52,7 +52,7 @@ struct sysv_shm { long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr, unsigned long shmlba); -int is_file_shm_hugepages(struct file *file); +bool is_file_shm_hugepages(struct file *file); void exit_shm(struct task_struct *task); #define shm_init_task(task) INIT_LIST_HEAD(&(task)->sysvshm.shm_clist) #else @@ -66,9 +66,9 @@ static inline long do_shmat(int shmid, char __user *shmaddr, { return -ENOSYS; } -static inline int is_file_shm_hugepages(struct file *file) +static inline bool is_file_shm_hugepages(struct file *file) { - return 0; + return false; } static inline void exit_shm(struct task_struct *task) { diff --git a/ipc/shm.c b/ipc/shm.c index 4178727..ed3027d 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -459,7 +459,7 @@ static const struct file_operations shm_file_operations_huge = { .fallocate = shm_fallocate, }; -int is_file_shm_hugepages(struct file *file) +bool is_file_shm_hugepages(struct file *file) { return file->f_op == &shm_file_operations_huge; } -- cgit v0.10.2 From e458bcd16f5bec6f19b60ef957b4f88af95aa78a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: 
Wed, 20 Jan 2016 15:01:13 -0800 Subject: fs/overlayfs/super.c needs pagemap.h i386 allmodconfig: In file included from fs/overlayfs/super.c:10:0: fs/overlayfs/super.c: In function 'ovl_fill_super': include/linux/fs.h:898:36: error: 'PAGE_CACHE_SIZE' undeclared (first use in this function) #define MAX_LFS_FILESIZE (((loff_t)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) ^ fs/overlayfs/super.c:939:19: note: in expansion of macro 'MAX_LFS_FILESIZE' sb->s_maxbytes = MAX_LFS_FILESIZE; ^ include/linux/fs.h:898:36: note: each undeclared identifier is reported only once for each function it appears in #define MAX_LFS_FILESIZE (((loff_t)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) ^ fs/overlayfs/super.c:939:19: note: in expansion of macro 'MAX_LFS_FILESIZE' sb->s_maxbytes = MAX_LFS_FILESIZE; ^ Cc: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index e38ee0f..0eb9d3b 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include -- cgit v0.10.2 From 90d6cd51af1aa275a302846b35b9638c870d2af5 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 20 Jan 2016 15:01:16 -0800 Subject: fs/adfs/adfs.h: tidy up comments Lots of needless 80-col overflows. 
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index ea4aba5..fadf408 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -44,24 +44,24 @@ struct adfs_dir_ops; */ struct adfs_sb_info { union { struct { - struct adfs_discmap *s_map; /* bh list containing map */ - const struct adfs_dir_ops *s_dir; /* directory operations */ + struct adfs_discmap *s_map; /* bh list containing map */ + const struct adfs_dir_ops *s_dir; /* directory operations */ }; - struct rcu_head rcu; /* used only at shutdown time */ + struct rcu_head rcu; /* used only at shutdown time */ }; - kuid_t s_uid; /* owner uid */ - kgid_t s_gid; /* owner gid */ - umode_t s_owner_mask; /* ADFS owner perm -> unix perm */ - umode_t s_other_mask; /* ADFS other perm -> unix perm */ + kuid_t s_uid; /* owner uid */ + kgid_t s_gid; /* owner gid */ + umode_t s_owner_mask; /* ADFS owner perm -> unix perm */ + umode_t s_other_mask; /* ADFS other perm -> unix perm */ int s_ftsuffix; /* ,xyz hex filetype suffix option */ - __u32 s_ids_per_zone; /* max. no ids in one zone */ - __u32 s_idlen; /* length of ID in map */ - __u32 s_map_size; /* sector size of a map */ - unsigned long s_size; /* total size (in blocks) of this fs */ - signed int s_map2blk; /* shift left by this for map->sector */ - unsigned int s_log2sharesize;/* log2 share size */ - __le32 s_version; /* disc format version */ + __u32 s_ids_per_zone; /* max. 
no ids in one zone */ + __u32 s_idlen; /* length of ID in map */ + __u32 s_map_size; /* sector size of a map */ + unsigned long s_size; /* total size (in blocks) of this fs */ + signed int s_map2blk; /* shift left by this for map->sector*/ + unsigned int s_log2sharesize;/* log2 share size */ + __le32 s_version; /* disc format version */ unsigned int s_namelen; /* maximum number of characters in name */ }; -- cgit v0.10.2 From f9ed89e17ee7d1f5f25615bb0080b9a3ff1bb5f0 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Wed, 20 Jan 2016 15:01:19 -0800 Subject: iio: core: fix ptr_ret.cocci warnings drivers/iio/industrialio-sw-trigger.c:169:1-3: WARNING: PTR_ERR_OR_ZERO can be used Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR Generated by: scripts/coccinelle/api/ptr_ret.cocci Signed-off-by: Fengguang Wu Cc: Joel Becker Cc: Lars-Peter Clausen Cc: Christoph Hellwig Cc: Hartmut Knaack Cc: Octavian Purdila Cc: Paul Bolle Cc: Adriana Reus Cc: Daniel Baluta Cc: Cristina Opriceana Cc: Peter Meerwald Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/iio/industrialio-sw-trigger.c b/drivers/iio/industrialio-sw-trigger.c index 311f9fe..8d24fb1 100644 --- a/drivers/iio/industrialio-sw-trigger.c +++ b/drivers/iio/industrialio-sw-trigger.c @@ -167,9 +167,7 @@ static int __init iio_sw_trigger_init(void) configfs_register_default_group(&iio_configfs_subsys.su_group, "triggers", &iio_triggers_group_type); - if (IS_ERR(iio_triggers_group)) - return PTR_ERR(iio_triggers_group); - return 0; + return PTR_ERR_OR_ZERO(iio_triggers_group); } module_init(iio_sw_trigger_init); -- cgit v0.10.2 From 0d4a619b64bad7117947a84a10c17a2b8f14d252 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jan 2016 15:01:22 -0800 Subject: dma-mapping: make the generic coherent dma mmap implementation optional This series converts all remaining architectures to use dma_map_ops and the generic implementation of the DMA API. 
This not only simplifies the code a lot, but also prepares for possible future changes like more generic non-iommu dma_ops implementations or generic per-device dma_map_ops. This patch (of 16): We have a couple architectures that do not want to support this code, so add another Kconfig symbol that disables the code similar to what we do for the nommu case. Signed-off-by: Christoph Hellwig Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Steven Miao Cc: Ley Foon Tan Cc: David Howells Cc: Koichi Yasutake Cc: Chris Metcalf Cc: "David S. Miller" Cc: Aurelien Jacquiot Cc: Geert Uytterhoeven Cc: Helge Deller Cc: James Hogan Cc: Jesper Nilsson Cc: Mark Salter Cc: Mikael Starvik Cc: Vineet Gupta Cc: Christian Borntraeger Cc: Joerg Roedel Cc: Sebastian Ott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/Kconfig b/arch/Kconfig index ba1b626..51c03ef 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -632,4 +632,7 @@ config OLD_SIGACTION config COMPAT_OLD_SIGACTION bool +config ARCH_NO_COHERENT_DMA_MMAP + bool + source "kernel/gcov/Kconfig" diff --git a/drivers/base/dma-mapping.c b/drivers/base/dma-mapping.c index d95c597..381e39d 100644 --- a/drivers/base/dma-mapping.c +++ b/drivers/base/dma-mapping.c @@ -247,7 +247,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size) { int ret = -ENXIO; -#ifdef CONFIG_MMU +#if defined(CONFIG_MMU) && !defined(CONFIG_ARCH_NO_COHERENT_DMA_MMAP) unsigned long user_count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; unsigned long pfn = page_to_pfn(virt_to_page(cpu_addr)); @@ -264,7 +264,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, user_count << PAGE_SHIFT, vma->vm_page_prot); } -#endif /* CONFIG_MMU */ +#endif /* CONFIG_MMU && !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */ return ret; } -- cgit v0.10.2 From 052c96dbe33b032b949510ca724ed54d02e1255c Mon Sep 17 00:00:00 2001 
From: Christoph Hellwig Date: Wed, 20 Jan 2016 15:01:26 -0800 Subject: arc: convert to dma_map_ops [vgupta@synopsys.com: ARC: dma mapping fixes #2] Signed-off-by: Christoph Hellwig Cc: Vineet Gupta Cc: Christian Borntraeger Cc: Joerg Roedel Cc: Sebastian Ott Signed-off-by: Vineet Gupta Cc: Carlos Palminha Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 76dde9d..8150c27 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -38,6 +38,7 @@ config ARC select OF_EARLY_FLATTREE select PERF_USE_VMALLOC select HAVE_DEBUG_STACKOVERFLOW + select HAVE_DMA_ATTRS config TRACE_IRQFLAGS_SUPPORT def_bool y diff --git a/arch/arc/include/asm/dma-mapping.h b/arch/arc/include/asm/dma-mapping.h index 2d28ba9..2a617f9 100644 --- a/arch/arc/include/asm/dma-mapping.h +++ b/arch/arc/include/asm/dma-mapping.h @@ -11,192 +11,13 @@ #ifndef ASM_ARC_DMA_MAPPING_H #define ASM_ARC_DMA_MAPPING_H -#include -#include +extern struct dma_map_ops arc_dma_ops; -void *dma_alloc_noncoherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp); - -void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle); - -void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp); - -void dma_free_coherent(struct device *dev, size_t size, void *kvaddr, - dma_addr_t dma_handle); - -/* drivers/base/dma-mapping.c */ -extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size); -extern int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, - void *cpu_addr, dma_addr_t dma_addr, - size_t size); - -#define dma_mmap_coherent(d, v, c, h, s) dma_common_mmap(d, v, c, h, s) -#define dma_get_sgtable(d, t, v, h, s) dma_common_get_sgtable(d, t, v, h, s) - -/* - * streaming DMA Mapping API... 
- * CPU accesses page via normal paddr, thus needs to explicitly made - * consistent before each use - */ - -static inline void __inline_dma_cache_sync(unsigned long paddr, size_t size, - enum dma_data_direction dir) -{ - switch (dir) { - case DMA_FROM_DEVICE: - dma_cache_inv(paddr, size); - break; - case DMA_TO_DEVICE: - dma_cache_wback(paddr, size); - break; - case DMA_BIDIRECTIONAL: - dma_cache_wback_inv(paddr, size); - break; - default: - pr_err("Invalid DMA dir [%d] for OP @ %lx\n", dir, paddr); - } -} - -void __arc_dma_cache_sync(unsigned long paddr, size_t size, - enum dma_data_direction dir); - -#define _dma_cache_sync(addr, sz, dir) \ -do { \ - if (__builtin_constant_p(dir)) \ - __inline_dma_cache_sync(addr, sz, dir); \ - else \ - __arc_dma_cache_sync(addr, sz, dir); \ -} \ -while (0); - -static inline dma_addr_t -dma_map_single(struct device *dev, void *cpu_addr, size_t size, - enum dma_data_direction dir) -{ - _dma_cache_sync((unsigned long)cpu_addr, size, dir); - return (dma_addr_t)cpu_addr; -} - -static inline void -dma_unmap_single(struct device *dev, dma_addr_t dma_addr, - size_t size, enum dma_data_direction dir) -{ -} - -static inline dma_addr_t -dma_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir) -{ - unsigned long paddr = page_to_phys(page) + offset; - return dma_map_single(dev, (void *)paddr, size, dir); -} - -static inline void -dma_unmap_page(struct device *dev, dma_addr_t dma_handle, - size_t size, enum dma_data_direction dir) -{ -} - -static inline int -dma_map_sg(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir) -{ - struct scatterlist *s; - int i; - - for_each_sg(sg, s, nents, i) - s->dma_address = dma_map_page(dev, sg_page(s), s->offset, - s->length, dir); - - return nents; -} - -static inline void -dma_unmap_sg(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir) +static inline struct dma_map_ops 
*get_dma_ops(struct device *dev) { - struct scatterlist *s; - int i; - - for_each_sg(sg, s, nents, i) - dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir); -} - -static inline void -dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, - size_t size, enum dma_data_direction dir) -{ - _dma_cache_sync(dma_handle, size, DMA_FROM_DEVICE); -} - -static inline void -dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, - size_t size, enum dma_data_direction dir) -{ - _dma_cache_sync(dma_handle, size, DMA_TO_DEVICE); -} - -static inline void -dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - _dma_cache_sync(dma_handle + offset, size, DMA_FROM_DEVICE); -} - -static inline void -dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - _dma_cache_sync(dma_handle + offset, size, DMA_TO_DEVICE); + return &arc_dma_ops; } -static inline void -dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nelems, - enum dma_data_direction dir) -{ - int i; - struct scatterlist *sg; - - for_each_sg(sglist, sg, nelems, i) - _dma_cache_sync((unsigned int)sg_virt(sg), sg->length, dir); -} - -static inline void -dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, - int nelems, enum dma_data_direction dir) -{ - int i; - struct scatterlist *sg; - - for_each_sg(sglist, sg, nelems, i) - _dma_cache_sync((unsigned int)sg_virt(sg), sg->length, dir); -} - -static inline int dma_supported(struct device *dev, u64 dma_mask) -{ - /* Support 32 bit DMA mask exclusively */ - return dma_mask == DMA_BIT_MASK(32); -} - -static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - return 0; -} - -static inline int dma_set_mask(struct device *dev, u64 dma_mask) -{ - if (!dev->dma_mask || !dma_supported(dev, dma_mask)) - 
return -EIO; - - *dev->dma_mask = dma_mask; - - return 0; -} +#include #endif diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c index 29a46bb..01eaf88 100644 --- a/arch/arc/mm/dma.c +++ b/arch/arc/mm/dma.c @@ -17,18 +17,14 @@ */ #include -#include -#include #include #include -/* - * Helpers for Coherent DMA API. - */ -void *dma_alloc_noncoherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp) + +static void *arc_dma_alloc(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, struct dma_attrs *attrs) { - void *paddr; + void *paddr, *kvaddr; /* This is linear addr (0x8000_0000 based) */ paddr = alloc_pages_exact(size, gfp); @@ -38,22 +34,6 @@ void *dma_alloc_noncoherent(struct device *dev, size_t size, /* This is bus address, platform dependent */ *dma_handle = (dma_addr_t)paddr; - return paddr; -} -EXPORT_SYMBOL(dma_alloc_noncoherent); - -void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle) -{ - free_pages_exact((void *)dma_handle, size); -} -EXPORT_SYMBOL(dma_free_noncoherent); - -void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp) -{ - void *paddr, *kvaddr; - /* * IOC relies on all data (even coherent DMA data) being in cache * Thus allocate normal cached memory @@ -65,22 +45,15 @@ void *dma_alloc_coherent(struct device *dev, size_t size, * -For coherent data, Read/Write to buffers terminate early in cache * (vs. 
always going to memory - thus are faster) */ - if (is_isa_arcv2() && ioc_exists) - return dma_alloc_noncoherent(dev, size, dma_handle, gfp); - - /* This is linear addr (0x8000_0000 based) */ - paddr = alloc_pages_exact(size, gfp); - if (!paddr) - return NULL; + if ((is_isa_arcv2() && ioc_exists) || + dma_get_attr(DMA_ATTR_NON_CONSISTENT, attrs)) + return paddr; /* This is kernel Virtual address (0x7000_0000 based) */ kvaddr = ioremap_nocache((unsigned long)paddr, size); if (kvaddr == NULL) return NULL; - /* This is bus address, platform dependent */ - *dma_handle = (dma_addr_t)paddr; - /* * Evict any existing L1 and/or L2 lines for the backing page * in case it was used earlier as a normal "cached" page. @@ -95,26 +68,111 @@ void *dma_alloc_coherent(struct device *dev, size_t size, return kvaddr; } -EXPORT_SYMBOL(dma_alloc_coherent); -void dma_free_coherent(struct device *dev, size_t size, void *kvaddr, - dma_addr_t dma_handle) +static void arc_dma_free(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle, struct dma_attrs *attrs) { - if (is_isa_arcv2() && ioc_exists) - return dma_free_noncoherent(dev, size, kvaddr, dma_handle); - - iounmap((void __force __iomem *)kvaddr); + if (!dma_get_attr(DMA_ATTR_NON_CONSISTENT, attrs) && + !(is_isa_arcv2() && ioc_exists)) + iounmap((void __force __iomem *)vaddr); free_pages_exact((void *)dma_handle, size); } -EXPORT_SYMBOL(dma_free_coherent); /* - * Helper for streaming DMA... + * streaming DMA Mapping API... 
+ * CPU accesses page via normal paddr, thus needs to explicitly made + * consistent before each use */ -void __arc_dma_cache_sync(unsigned long paddr, size_t size, - enum dma_data_direction dir) +static void _dma_cache_sync(unsigned long paddr, size_t size, + enum dma_data_direction dir) +{ + switch (dir) { + case DMA_FROM_DEVICE: + dma_cache_inv(paddr, size); + break; + case DMA_TO_DEVICE: + dma_cache_wback(paddr, size); + break; + case DMA_BIDIRECTIONAL: + dma_cache_wback_inv(paddr, size); + break; + default: + pr_err("Invalid DMA dir [%d] for OP @ %lx\n", dir, paddr); + } +} + +static dma_addr_t arc_dma_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + unsigned long paddr = page_to_phys(page) + offset; + _dma_cache_sync(paddr, size, dir); + return (dma_addr_t)paddr; +} + +static int arc_dma_map_sg(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, struct dma_attrs *attrs) +{ + struct scatterlist *s; + int i; + + for_each_sg(sg, s, nents, i) + s->dma_address = dma_map_page(dev, sg_page(s), s->offset, + s->length, dir); + + return nents; +} + +static void arc_dma_sync_single_for_cpu(struct device *dev, + dma_addr_t dma_handle, size_t size, enum dma_data_direction dir) +{ + _dma_cache_sync(dma_handle, size, DMA_FROM_DEVICE); +} + +static void arc_dma_sync_single_for_device(struct device *dev, + dma_addr_t dma_handle, size_t size, enum dma_data_direction dir) { - __inline_dma_cache_sync(paddr, size, dir); + _dma_cache_sync(dma_handle, size, DMA_TO_DEVICE); } -EXPORT_SYMBOL(__arc_dma_cache_sync); + +static void arc_dma_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sglist, int nelems, + enum dma_data_direction dir) +{ + int i; + struct scatterlist *sg; + + for_each_sg(sglist, sg, nelems, i) + _dma_cache_sync((unsigned int)sg_virt(sg), sg->length, dir); +} + +static void arc_dma_sync_sg_for_device(struct device *dev, + struct 
scatterlist *sglist, int nelems, + enum dma_data_direction dir) +{ + int i; + struct scatterlist *sg; + + for_each_sg(sglist, sg, nelems, i) + _dma_cache_sync((unsigned int)sg_virt(sg), sg->length, dir); +} + +static int arc_dma_supported(struct device *dev, u64 dma_mask) +{ + /* Support 32 bit DMA mask exclusively */ + return dma_mask == DMA_BIT_MASK(32); +} + +struct dma_map_ops arc_dma_ops = { + .alloc = arc_dma_alloc, + .free = arc_dma_free, + .map_page = arc_dma_map_page, + .map_sg = arc_dma_map_sg, + .sync_single_for_device = arc_dma_sync_single_for_device, + .sync_single_for_cpu = arc_dma_sync_single_for_cpu, + .sync_sg_for_cpu = arc_dma_sync_sg_for_cpu, + .sync_sg_for_device = arc_dma_sync_sg_for_device, + .dma_supported = arc_dma_supported, +}; +EXPORT_SYMBOL(arc_dma_ops); -- cgit v0.10.2 From a34a517ac96c6910a3a0aab9513035bfbed0020c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jan 2016 15:01:29 -0800 Subject: avr32: convert to dma_map_ops Signed-off-by: Christoph Hellwig Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Christian Borntraeger Cc: Joerg Roedel Cc: Sebastian Ott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index b6878eb..aac3d69 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -7,6 +7,7 @@ config AVR32 select HAVE_OPROFILE select HAVE_KPROBES select VIRT_TO_BUS + select HAVE_DMA_ATTRS select GENERIC_IRQ_PROBE select GENERIC_ATOMIC64 select HARDIRQS_SW_RESEND diff --git a/arch/avr32/include/asm/dma-mapping.h b/arch/avr32/include/asm/dma-mapping.h index ae7ac92..0239ca8 100644 --- a/arch/avr32/include/asm/dma-mapping.h +++ b/arch/avr32/include/asm/dma-mapping.h @@ -1,350 +1,16 @@ #ifndef __ASM_AVR32_DMA_MAPPING_H #define __ASM_AVR32_DMA_MAPPING_H -#include -#include -#include -#include -#include -#include - extern void dma_cache_sync(struct device *dev, void *vaddr, size_t size, int direction); -/* - * Return whether the given device DMA address 
mask can be supported - * properly. For example, if your device can only drive the low 24-bits - * during bus mastering, then you would pass 0x00ffffff as the mask - * to this function. - */ -static inline int dma_supported(struct device *dev, u64 mask) -{ - /* Fix when needed. I really don't know of any limitations */ - return 1; -} - -static inline int dma_set_mask(struct device *dev, u64 dma_mask) -{ - if (!dev->dma_mask || !dma_supported(dev, dma_mask)) - return -EIO; - - *dev->dma_mask = dma_mask; - return 0; -} - -/* - * dma_map_single can't fail as it is implemented now. - */ -static inline int dma_mapping_error(struct device *dev, dma_addr_t addr) -{ - return 0; -} - -/** - * dma_alloc_coherent - allocate consistent memory for DMA - * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices - * @size: required memory size - * @handle: bus-specific DMA address - * - * Allocate some uncached, unbuffered memory for a device for - * performing DMA. This function allocates pages, and will - * return the CPU-viewed address, and sets @handle to be the - * device-viewed address. - */ -extern void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *handle, gfp_t gfp); - -/** - * dma_free_coherent - free memory allocated by dma_alloc_coherent - * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices - * @size: size of memory originally requested in dma_alloc_coherent - * @cpu_addr: CPU-view address returned from dma_alloc_coherent - * @handle: device-view address returned from dma_alloc_coherent - * - * Free (and unmap) a DMA buffer previously allocated by - * dma_alloc_coherent(). - * - * References to memory and mappings associated with cpu_addr/handle - * during and after this call executing are illegal. 
- */ -extern void dma_free_coherent(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t handle); - -/** - * dma_alloc_writecombine - allocate write-combining memory for DMA - * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices - * @size: required memory size - * @handle: bus-specific DMA address - * - * Allocate some uncached, buffered memory for a device for - * performing DMA. This function allocates pages, and will - * return the CPU-viewed address, and sets @handle to be the - * device-viewed address. - */ -extern void *dma_alloc_writecombine(struct device *dev, size_t size, - dma_addr_t *handle, gfp_t gfp); - -/** - * dma_free_coherent - free memory allocated by dma_alloc_writecombine - * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices - * @size: size of memory originally requested in dma_alloc_writecombine - * @cpu_addr: CPU-view address returned from dma_alloc_writecombine - * @handle: device-view address returned from dma_alloc_writecombine - * - * Free (and unmap) a DMA buffer previously allocated by - * dma_alloc_writecombine(). - * - * References to memory and mappings associated with cpu_addr/handle - * during and after this call executing are illegal. - */ -extern void dma_free_writecombine(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t handle); - -/** - * dma_map_single - map a single buffer for streaming DMA - * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices - * @cpu_addr: CPU direct mapped address of buffer - * @size: size of buffer to map - * @dir: DMA transfer direction - * - * Ensure that any data held in the cache is appropriately discarded - * or written back. - * - * The device owns this memory once this call has completed. The CPU - * can regain ownership by calling dma_unmap_single() or dma_sync_single(). 
- */ -static inline dma_addr_t -dma_map_single(struct device *dev, void *cpu_addr, size_t size, - enum dma_data_direction direction) -{ - dma_cache_sync(dev, cpu_addr, size, direction); - return virt_to_bus(cpu_addr); -} - -/** - * dma_unmap_single - unmap a single buffer previously mapped - * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices - * @handle: DMA address of buffer - * @size: size of buffer to map - * @dir: DMA transfer direction - * - * Unmap a single streaming mode DMA translation. The handle and size - * must match what was provided in the previous dma_map_single() call. - * All other usages are undefined. - * - * After this call, reads by the CPU to the buffer are guaranteed to see - * whatever the device wrote there. - */ -static inline void -dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, - enum dma_data_direction direction) -{ - -} - -/** - * dma_map_page - map a portion of a page for streaming DMA - * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices - * @page: page that buffer resides in - * @offset: offset into page for start of buffer - * @size: size of buffer to map - * @dir: DMA transfer direction - * - * Ensure that any data held in the cache is appropriately discarded - * or written back. - * - * The device owns this memory once this call has completed. The CPU - * can regain ownership by calling dma_unmap_page() or dma_sync_single(). 
- */ -static inline dma_addr_t -dma_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - return dma_map_single(dev, page_address(page) + offset, - size, direction); -} - -/** - * dma_unmap_page - unmap a buffer previously mapped through dma_map_page() - * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices - * @handle: DMA address of buffer - * @size: size of buffer to map - * @dir: DMA transfer direction - * - * Unmap a single streaming mode DMA translation. The handle and size - * must match what was provided in the previous dma_map_single() call. - * All other usages are undefined. - * - * After this call, reads by the CPU to the buffer are guaranteed to see - * whatever the device wrote there. - */ -static inline void -dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, - enum dma_data_direction direction) -{ - dma_unmap_single(dev, dma_address, size, direction); -} - -/** - * dma_map_sg - map a set of SG buffers for streaming mode DMA - * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices - * @sg: list of buffers - * @nents: number of buffers to map - * @dir: DMA transfer direction - * - * Map a set of buffers described by scatterlist in streaming - * mode for DMA. This is the scatter-gather version of the - * above pci_map_single interface. Here the scatter gather list - * elements are each tagged with the appropriate dma address - * and length. They are obtained via sg_dma_{address,length}(SG). - * - * NOTE: An implementation may be able to use a smaller number of - * DMA address/length pairs than there are SG table elements. - * (for example via virtual mapping capabilities) - * The routine returns the number of addr/length pairs actually - * used, at most nents. - * - * Device ownership issues as mentioned above for pci_map_single are - * the same here. 
- */ -static inline int -dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, - enum dma_data_direction direction) -{ - int i; - struct scatterlist *sg; - - for_each_sg(sglist, sg, nents, i) { - char *virt; - - sg->dma_address = page_to_bus(sg_page(sg)) + sg->offset; - virt = sg_virt(sg); - dma_cache_sync(dev, virt, sg->length, direction); - } - - return nents; -} - -/** - * dma_unmap_sg - unmap a set of SG buffers mapped by dma_map_sg - * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices - * @sg: list of buffers - * @nents: number of buffers to map - * @dir: DMA transfer direction - * - * Unmap a set of streaming mode DMA translations. - * Again, CPU read rules concerning calls here are the same as for - * pci_unmap_single() above. - */ -static inline void -dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries, - enum dma_data_direction direction) -{ - -} - -/** - * dma_sync_single_for_cpu - * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices - * @handle: DMA address of buffer - * @size: size of buffer to map - * @dir: DMA transfer direction - * - * Make physical memory consistent for a single streaming mode DMA - * translation after a transfer. - * - * If you perform a dma_map_single() but wish to interrogate the - * buffer using the cpu, yet do not wish to teardown the DMA mapping, - * you must call this function before doing so. At the next point you - * give the DMA address back to the card, you must first perform a - * dma_sync_single_for_device, and then the device again owns the - * buffer. - */ -static inline void -dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, - size_t size, enum dma_data_direction direction) -{ - /* - * No need to do anything since the CPU isn't supposed to - * touch this memory after we flushed it at mapping- or - * sync-for-device time. 
- */ -} - -static inline void -dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, - size_t size, enum dma_data_direction direction) -{ - dma_cache_sync(dev, bus_to_virt(dma_handle), size, direction); -} - -static inline void -dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - /* just sync everything, that's all the pci API can do */ - dma_sync_single_for_cpu(dev, dma_handle, offset+size, direction); -} - -static inline void -dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - /* just sync everything, that's all the pci API can do */ - dma_sync_single_for_device(dev, dma_handle, offset+size, direction); -} +extern struct dma_map_ops avr32_dma_ops; -/** - * dma_sync_sg_for_cpu - * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices - * @sg: list of buffers - * @nents: number of buffers to map - * @dir: DMA transfer direction - * - * Make physical memory consistent for a set of streaming - * mode DMA translations after a transfer. - * - * The same as dma_sync_single_for_* but for a scatter-gather list, - * same rules and usage. - */ -static inline void -dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction direction) +static inline struct dma_map_ops *get_dma_ops(struct device *dev) { - /* - * No need to do anything since the CPU isn't supposed to - * touch this memory after we flushed it at mapping- or - * sync-for-device time. 
- */ + return &avr32_dma_ops; } -static inline void -dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, - int nents, enum dma_data_direction direction) -{ - int i; - struct scatterlist *sg; - - for_each_sg(sglist, sg, nents, i) - dma_cache_sync(dev, sg_virt(sg), sg->length, direction); -} - -/* Now for the API extensions over the pci_ one */ - -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) - -/* drivers/base/dma-mapping.c */ -extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size); -extern int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, - void *cpu_addr, dma_addr_t dma_addr, - size_t size); - -#define dma_mmap_coherent(d, v, c, h, s) dma_common_mmap(d, v, c, h, s) -#define dma_get_sgtable(d, t, v, h, s) dma_common_get_sgtable(d, t, v, h, s) +#include #endif /* __ASM_AVR32_DMA_MAPPING_H */ diff --git a/arch/avr32/mm/dma-coherent.c b/arch/avr32/mm/dma-coherent.c index 50cdb5b..92cf1fb 100644 --- a/arch/avr32/mm/dma-coherent.c +++ b/arch/avr32/mm/dma-coherent.c @@ -9,9 +9,14 @@ #include #include #include +#include +#include +#include -#include +#include #include +#include +#include void dma_cache_sync(struct device *dev, void *vaddr, size_t size, int direction) { @@ -93,60 +98,100 @@ static void __dma_free(struct device *dev, size_t size, __free_page(page++); } -void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *handle, gfp_t gfp) +static void *avr32_dma_alloc(struct device *dev, size_t size, + dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs) { struct page *page; - void *ret = NULL; + dma_addr_t phys; page = __dma_alloc(dev, size, handle, gfp); - if (page) - ret = phys_to_uncached(page_to_phys(page)); + if (!page) + return NULL; + phys = page_to_phys(page); - return ret; + if (dma_get_attr(DMA_ATTR_WRITE_COMBINE, attrs)) { + /* Now, 
map the page into P3 with write-combining turned on */ + *handle = phys; + return __ioremap(phys, size, _PAGE_BUFFER); + } else { + return phys_to_uncached(phys); + } } -EXPORT_SYMBOL(dma_alloc_coherent); -void dma_free_coherent(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t handle) +static void avr32_dma_free(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t handle, struct dma_attrs *attrs) { - void *addr = phys_to_cached(uncached_to_phys(cpu_addr)); struct page *page; - pr_debug("dma_free_coherent addr %p (phys %08lx) size %u\n", - cpu_addr, (unsigned long)handle, (unsigned)size); - BUG_ON(!virt_addr_valid(addr)); - page = virt_to_page(addr); + if (dma_get_attr(DMA_ATTR_WRITE_COMBINE, attrs)) { + iounmap(cpu_addr); + + page = phys_to_page(handle); + } else { + void *addr = phys_to_cached(uncached_to_phys(cpu_addr)); + + pr_debug("avr32_dma_free addr %p (phys %08lx) size %u\n", + cpu_addr, (unsigned long)handle, (unsigned)size); + + BUG_ON(!virt_addr_valid(addr)); + page = virt_to_page(addr); + } + __dma_free(dev, size, page, handle); } -EXPORT_SYMBOL(dma_free_coherent); -void *dma_alloc_writecombine(struct device *dev, size_t size, - dma_addr_t *handle, gfp_t gfp) +static dma_addr_t avr32_dma_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction, struct dma_attrs *attrs) { - struct page *page; - dma_addr_t phys; + void *cpu_addr = page_address(page) + offset; - page = __dma_alloc(dev, size, handle, gfp); - if (!page) - return NULL; + dma_cache_sync(dev, cpu_addr, size, direction); + return virt_to_bus(cpu_addr); +} - phys = page_to_phys(page); - *handle = phys; +static int avr32_dma_map_sg(struct device *dev, struct scatterlist *sglist, + int nents, enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + int i; + struct scatterlist *sg; + + for_each_sg(sglist, sg, nents, i) { + char *virt; - /* Now, map the page into P3 with write-combining turned on */ - 
return __ioremap(phys, size, _PAGE_BUFFER); + sg->dma_address = page_to_bus(sg_page(sg)) + sg->offset; + virt = sg_virt(sg); + dma_cache_sync(dev, virt, sg->length, direction); + } + + return nents; } -EXPORT_SYMBOL(dma_alloc_writecombine); -void dma_free_writecombine(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t handle) +static void avr32_dma_sync_single_for_device(struct device *dev, + dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction) { - struct page *page; + dma_cache_sync(dev, bus_to_virt(dma_handle), size, direction); +} - iounmap(cpu_addr); +static void avr32_dma_sync_sg_for_device(struct device *dev, + struct scatterlist *sglist, int nents, + enum dma_data_direction direction) +{ + int i; + struct scatterlist *sg; - page = phys_to_page(handle); - __dma_free(dev, size, page, handle); + for_each_sg(sglist, sg, nents, i) + dma_cache_sync(dev, sg_virt(sg), sg->length, direction); } -EXPORT_SYMBOL(dma_free_writecombine); + +struct dma_map_ops avr32_dma_ops = { + .alloc = avr32_dma_alloc, + .free = avr32_dma_free, + .map_page = avr32_dma_map_page, + .map_sg = avr32_dma_map_sg, + .sync_single_for_device = avr32_dma_sync_single_for_device, + .sync_sg_for_device = avr32_dma_sync_sg_for_device, +}; +EXPORT_SYMBOL(avr32_dma_ops); -- cgit v0.10.2 From 6f62097583e799040d6d18909b670b1e4dbb614d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jan 2016 15:01:32 -0800 Subject: blackfin: convert to dma_map_ops Signed-off-by: Christoph Hellwig Cc: Steven Miao Cc: Christian Borntraeger Cc: Joerg Roedel Cc: Sebastian Ott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index af76634..4be2f90 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -14,6 +14,7 @@ config BLACKFIN def_bool y select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK + select HAVE_DMA_ATTRS select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD select 
HAVE_FUNCTION_GRAPH_TRACER diff --git a/arch/blackfin/include/asm/dma-mapping.h b/arch/blackfin/include/asm/dma-mapping.h index 054d9ec..ea5a2e8 100644 --- a/arch/blackfin/include/asm/dma-mapping.h +++ b/arch/blackfin/include/asm/dma-mapping.h @@ -8,36 +8,6 @@ #define _BLACKFIN_DMA_MAPPING_H #include -struct scatterlist; - -void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp); -void dma_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle); - -/* - * Now for the API extensions over the pci_ one - */ -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) -#define dma_supported(d, m) (1) - -static inline int -dma_set_mask(struct device *dev, u64 dma_mask) -{ - if (!dev->dma_mask || !dma_supported(dev, dma_mask)) - return -EIO; - - *dev->dma_mask = dma_mask; - - return 0; -} - -static inline int -dma_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - return 0; -} extern void __dma_sync(dma_addr_t addr, size_t size, enum dma_data_direction dir); @@ -66,102 +36,13 @@ _dma_sync(dma_addr_t addr, size_t size, enum dma_data_direction dir) __dma_sync(addr, size, dir); } -static inline dma_addr_t -dma_map_single(struct device *dev, void *ptr, size_t size, - enum dma_data_direction dir) -{ - _dma_sync((dma_addr_t)ptr, size, dir); - return (dma_addr_t) ptr; -} - -static inline dma_addr_t -dma_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir) -{ - return dma_map_single(dev, page_address(page) + offset, size, dir); -} - -static inline void -dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, - enum dma_data_direction dir) -{ - BUG_ON(!valid_dma_direction(dir)); -} - -static inline void -dma_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, - enum dma_data_direction dir) -{ - dma_unmap_single(dev, dma_addr, size, 
dir); -} - -extern int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir); - -static inline void -dma_unmap_sg(struct device *dev, struct scatterlist *sg, - int nhwentries, enum dma_data_direction dir) -{ - BUG_ON(!valid_dma_direction(dir)); -} - -static inline void -dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t handle, - unsigned long offset, size_t size, - enum dma_data_direction dir) -{ - BUG_ON(!valid_dma_direction(dir)); -} - -static inline void -dma_sync_single_range_for_device(struct device *dev, dma_addr_t handle, - unsigned long offset, size_t size, - enum dma_data_direction dir) -{ - _dma_sync(handle + offset, size, dir); -} +extern struct dma_map_ops bfin_dma_ops; -static inline void -dma_sync_single_for_cpu(struct device *dev, dma_addr_t handle, size_t size, - enum dma_data_direction dir) +static inline struct dma_map_ops *get_dma_ops(struct device *dev) { - dma_sync_single_range_for_cpu(dev, handle, 0, size, dir); + return &bfin_dma_ops; } -static inline void -dma_sync_single_for_device(struct device *dev, dma_addr_t handle, size_t size, - enum dma_data_direction dir) -{ - dma_sync_single_range_for_device(dev, handle, 0, size, dir); -} - -static inline void -dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir) -{ - BUG_ON(!valid_dma_direction(dir)); -} - -extern void -dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir); - -static inline void -dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction dir) -{ - _dma_sync((dma_addr_t)vaddr, size, dir); -} - -/* drivers/base/dma-mapping.c */ -extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size); -extern int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, - void *cpu_addr, dma_addr_t dma_addr, - size_t size); - -#define 
dma_mmap_coherent(d, v, c, h, s) dma_common_mmap(d, v, c, h, s) -#define dma_get_sgtable(d, t, v, h, s) dma_common_get_sgtable(d, t, v, h, s) +#include #endif /* _BLACKFIN_DMA_MAPPING_H */ diff --git a/arch/blackfin/kernel/dma-mapping.c b/arch/blackfin/kernel/dma-mapping.c index df437e5..771afe6 100644 --- a/arch/blackfin/kernel/dma-mapping.c +++ b/arch/blackfin/kernel/dma-mapping.c @@ -78,8 +78,8 @@ static void __free_dma_pages(unsigned long addr, unsigned int pages) spin_unlock_irqrestore(&dma_page_lock, flags); } -void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp) +static void *bfin_dma_alloc(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, struct dma_attrs *attrs) { void *ret; @@ -92,15 +92,12 @@ void *dma_alloc_coherent(struct device *dev, size_t size, return ret; } -EXPORT_SYMBOL(dma_alloc_coherent); -void -dma_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle) +static void bfin_dma_free(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle, struct dma_attrs *attrs) { __free_dma_pages((unsigned long)vaddr, get_pages(size)); } -EXPORT_SYMBOL(dma_free_coherent); /* * Streaming DMA mappings @@ -112,9 +109,9 @@ void __dma_sync(dma_addr_t addr, size_t size, } EXPORT_SYMBOL(__dma_sync); -int -dma_map_sg(struct device *dev, struct scatterlist *sg_list, int nents, - enum dma_data_direction direction) +static int bfin_dma_map_sg(struct device *dev, struct scatterlist *sg_list, + int nents, enum dma_data_direction direction, + struct dma_attrs *attrs) { struct scatterlist *sg; int i; @@ -126,10 +123,10 @@ dma_map_sg(struct device *dev, struct scatterlist *sg_list, int nents, return nents; } -EXPORT_SYMBOL(dma_map_sg); -void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg_list, - int nelems, enum dma_data_direction direction) +static void bfin_dma_sync_sg_for_device(struct device *dev, + struct scatterlist *sg_list, int nelems, + enum 
dma_data_direction direction) { struct scatterlist *sg; int i; @@ -139,4 +136,31 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg_list, __dma_sync(sg_dma_address(sg), sg_dma_len(sg), direction); } } -EXPORT_SYMBOL(dma_sync_sg_for_device); + +static dma_addr_t bfin_dma_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + dma_addr_t handle = (dma_addr_t)(page_address(page) + offset); + + _dma_sync(handle, size, dir); + return handle; +} + +static inline void bfin_dma_sync_single_for_device(struct device *dev, + dma_addr_t handle, size_t size, enum dma_data_direction dir) +{ + _dma_sync(handle, size, dir); +} + +struct dma_map_ops bfin_dma_ops = { + .alloc = bfin_dma_alloc, + .free = bfin_dma_free, + + .map_page = bfin_dma_map_page, + .map_sg = bfin_dma_map_sg, + + .sync_single_for_device = bfin_dma_sync_single_for_device, + .sync_sg_for_device = bfin_dma_sync_sg_for_device, +}; +EXPORT_SYMBOL(bfin_dma_ops); -- cgit v0.10.2 From 4605f04b2893fb5498b31c54e8f21da2fc4cc736 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jan 2016 15:01:35 -0800 Subject: c6x: convert to dma_map_ops [dan.carpenter@oracle.com: C6X: fix build breakage] Signed-off-by: Christoph Hellwig Cc: Mark Salter Cc: Aurelien Jacquiot Cc: Christian Borntraeger Cc: Joerg Roedel Cc: Sebastian Ott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig index 77ea09b..8602f72 100644 --- a/arch/c6x/Kconfig +++ b/arch/c6x/Kconfig @@ -17,6 +17,8 @@ config C6X select OF_EARLY_FLATTREE select GENERIC_CLOCKEVENTS select MODULES_USE_ELF_RELA + select ARCH_NO_COHERENT_DMA_MMAP + select HAVE_DMA_ATTRS config MMU def_bool n diff --git a/arch/c6x/include/asm/dma-mapping.h b/arch/c6x/include/asm/dma-mapping.h index bbd7774..f881e42 100644 --- a/arch/c6x/include/asm/dma-mapping.h +++ b/arch/c6x/include/asm/dma-mapping.h @@ -12,104 +12,24 @@ 
#ifndef _ASM_C6X_DMA_MAPPING_H #define _ASM_C6X_DMA_MAPPING_H -#include -#include - -#define dma_supported(d, m) 1 - -static inline void dma_sync_single_range_for_device(struct device *dev, - dma_addr_t addr, - unsigned long offset, - size_t size, - enum dma_data_direction dir) -{ -} - -static inline int dma_set_mask(struct device *dev, u64 dma_mask) -{ - if (!dev->dma_mask || !dma_supported(dev, dma_mask)) - return -EIO; - - *dev->dma_mask = dma_mask; - - return 0; -} - /* * DMA errors are defined by all-bits-set in the DMA address. */ -static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - debug_dma_mapping_error(dev, dma_addr); - return dma_addr == ~0; -} - -extern dma_addr_t dma_map_single(struct device *dev, void *cpu_addr, - size_t size, enum dma_data_direction dir); - -extern void dma_unmap_single(struct device *dev, dma_addr_t handle, - size_t size, enum dma_data_direction dir); - -extern int dma_map_sg(struct device *dev, struct scatterlist *sglist, - int nents, enum dma_data_direction direction); - -extern void dma_unmap_sg(struct device *dev, struct scatterlist *sglist, - int nents, enum dma_data_direction direction); +#define DMA_ERROR_CODE ~0 -static inline dma_addr_t dma_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir) -{ - dma_addr_t handle; - - handle = dma_map_single(dev, page_address(page) + offset, size, dir); - - debug_dma_map_page(dev, page, offset, size, dir, handle, false); - - return handle; -} +extern struct dma_map_ops c6x_dma_ops; -static inline void dma_unmap_page(struct device *dev, dma_addr_t handle, - size_t size, enum dma_data_direction dir) +static inline struct dma_map_ops *get_dma_ops(struct device *dev) { - dma_unmap_single(dev, handle, size, dir); - - debug_dma_unmap_page(dev, handle, size, dir, false); + return &c6x_dma_ops; } -extern void dma_sync_single_for_cpu(struct device *dev, dma_addr_t handle, - size_t size, enum 
dma_data_direction dir); - -extern void dma_sync_single_for_device(struct device *dev, dma_addr_t handle, - size_t size, - enum dma_data_direction dir); - -extern void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir); - -extern void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir); +#include extern void coherent_mem_init(u32 start, u32 size); -extern void *dma_alloc_coherent(struct device *, size_t, dma_addr_t *, gfp_t); -extern void dma_free_coherent(struct device *, size_t, void *, dma_addr_t); - -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent((d), (s), (h), (f)) -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent((d), (s), (v), (h)) - -/* Not supported for now */ -static inline int dma_mmap_coherent(struct device *dev, - struct vm_area_struct *vma, void *cpu_addr, - dma_addr_t dma_addr, size_t size) -{ - return -EINVAL; -} - -static inline int dma_get_sgtable(struct device *dev, struct sg_table *sgt, - void *cpu_addr, dma_addr_t dma_addr, - size_t size) -{ - return -EINVAL; -} +void *c6x_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, + gfp_t gfp, struct dma_attrs *attrs); +void c6x_dma_free(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle, struct dma_attrs *attrs); #endif /* _ASM_C6X_DMA_MAPPING_H */ diff --git a/arch/c6x/kernel/dma.c b/arch/c6x/kernel/dma.c index ab7b12d..8a80f3a 100644 --- a/arch/c6x/kernel/dma.c +++ b/arch/c6x/kernel/dma.c @@ -36,110 +36,101 @@ static void c6x_dma_sync(dma_addr_t handle, size_t size, } } -dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size, - enum dma_data_direction dir) +static dma_addr_t c6x_dma_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) { - dma_addr_t addr = virt_to_phys(ptr); + dma_addr_t handle = virt_to_phys(page_address(page) + 
offset); - c6x_dma_sync(addr, size, dir); - - debug_dma_map_page(dev, virt_to_page(ptr), - (unsigned long)ptr & ~PAGE_MASK, size, - dir, addr, true); - return addr; + c6x_dma_sync(handle, size, dir); + return handle; } -EXPORT_SYMBOL(dma_map_single); - -void dma_unmap_single(struct device *dev, dma_addr_t handle, - size_t size, enum dma_data_direction dir) +static void c6x_dma_unmap_page(struct device *dev, dma_addr_t handle, + size_t size, enum dma_data_direction dir, struct dma_attrs *attrs) { c6x_dma_sync(handle, size, dir); - - debug_dma_unmap_page(dev, handle, size, dir, true); } -EXPORT_SYMBOL(dma_unmap_single); - -int dma_map_sg(struct device *dev, struct scatterlist *sglist, - int nents, enum dma_data_direction dir) +static int c6x_dma_map_sg(struct device *dev, struct scatterlist *sglist, + int nents, enum dma_data_direction dir, struct dma_attrs *attrs) { struct scatterlist *sg; int i; - for_each_sg(sglist, sg, nents, i) - sg->dma_address = dma_map_single(dev, sg_virt(sg), sg->length, - dir); - - debug_dma_map_sg(dev, sglist, nents, nents, dir); + for_each_sg(sglist, sg, nents, i) { + sg->dma_address = sg_phys(sg); + c6x_dma_sync(sg->dma_address, sg->length, dir); + } return nents; } -EXPORT_SYMBOL(dma_map_sg); - -void dma_unmap_sg(struct device *dev, struct scatterlist *sglist, - int nents, enum dma_data_direction dir) +static void c6x_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, + int nents, enum dma_data_direction dir, + struct dma_attrs *attrs) { struct scatterlist *sg; int i; for_each_sg(sglist, sg, nents, i) - dma_unmap_single(dev, sg_dma_address(sg), sg->length, dir); + c6x_dma_sync(sg_dma_address(sg), sg->length, dir); - debug_dma_unmap_sg(dev, sglist, nents, dir); } -EXPORT_SYMBOL(dma_unmap_sg); -void dma_sync_single_for_cpu(struct device *dev, dma_addr_t handle, - size_t size, enum dma_data_direction dir) +static void c6x_dma_sync_single_for_cpu(struct device *dev, dma_addr_t handle, + size_t size, enum dma_data_direction dir) { 
c6x_dma_sync(handle, size, dir); - debug_dma_sync_single_for_cpu(dev, handle, size, dir); } -EXPORT_SYMBOL(dma_sync_single_for_cpu); - -void dma_sync_single_for_device(struct device *dev, dma_addr_t handle, - size_t size, enum dma_data_direction dir) +static void c6x_dma_sync_single_for_device(struct device *dev, + dma_addr_t handle, size_t size, enum dma_data_direction dir) { c6x_dma_sync(handle, size, dir); - debug_dma_sync_single_for_device(dev, handle, size, dir); } -EXPORT_SYMBOL(dma_sync_single_for_device); - -void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, - int nents, enum dma_data_direction dir) +static void c6x_dma_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sglist, int nents, + enum dma_data_direction dir) { struct scatterlist *sg; int i; for_each_sg(sglist, sg, nents, i) - dma_sync_single_for_cpu(dev, sg_dma_address(sg), + c6x_dma_sync_single_for_cpu(dev, sg_dma_address(sg), sg->length, dir); - debug_dma_sync_sg_for_cpu(dev, sglist, nents, dir); } -EXPORT_SYMBOL(dma_sync_sg_for_cpu); - -void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, - int nents, enum dma_data_direction dir) +static void c6x_dma_sync_sg_for_device(struct device *dev, + struct scatterlist *sglist, int nents, + enum dma_data_direction dir) { struct scatterlist *sg; int i; for_each_sg(sglist, sg, nents, i) - dma_sync_single_for_device(dev, sg_dma_address(sg), + c6x_dma_sync_single_for_device(dev, sg_dma_address(sg), sg->length, dir); - debug_dma_sync_sg_for_device(dev, sglist, nents, dir); } -EXPORT_SYMBOL(dma_sync_sg_for_device); +struct dma_map_ops c6x_dma_ops = { + .alloc = c6x_dma_alloc, + .free = c6x_dma_free, + .map_page = c6x_dma_map_page, + .unmap_page = c6x_dma_unmap_page, + .map_sg = c6x_dma_map_sg, + .unmap_sg = c6x_dma_unmap_sg, + .sync_single_for_device = c6x_dma_sync_single_for_device, + .sync_single_for_cpu = c6x_dma_sync_single_for_cpu, + .sync_sg_for_device = c6x_dma_sync_sg_for_device, + .sync_sg_for_cpu = 
c6x_dma_sync_sg_for_cpu, +}; +EXPORT_SYMBOL(c6x_dma_ops); /* Number of entries preallocated for DMA-API debugging */ #define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16) diff --git a/arch/c6x/mm/dma-coherent.c b/arch/c6x/mm/dma-coherent.c index 4187e51..f7ee63a 100644 --- a/arch/c6x/mm/dma-coherent.c +++ b/arch/c6x/mm/dma-coherent.c @@ -73,8 +73,8 @@ static void __free_dma_pages(u32 addr, int order) * Allocate DMA coherent memory space and return both the kernel * virtual and DMA address for that space. */ -void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *handle, gfp_t gfp) +void *c6x_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, + gfp_t gfp, struct dma_attrs *attrs) { u32 paddr; int order; @@ -94,13 +94,12 @@ void *dma_alloc_coherent(struct device *dev, size_t size, return phys_to_virt(paddr); } -EXPORT_SYMBOL(dma_alloc_coherent); /* * Free DMA coherent memory as defined by the above mapping. */ -void dma_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle) +void c6x_dma_free(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle, struct dma_attrs *attrs) { int order; @@ -111,7 +110,6 @@ void dma_free_coherent(struct device *dev, size_t size, void *vaddr, __free_dma_pages(virt_to_phys(vaddr), order); } -EXPORT_SYMBOL(dma_free_coherent); /* * Initialise the coherent DMA memory allocator using the given uncached region. 
-- cgit v0.10.2 From e20dd88995dffe262934f355b3e96daa2458b331 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jan 2016 15:01:38 -0800 Subject: cris: convert to dma_map_ops Signed-off-by: Christoph Hellwig Cc: Mikael Starvik Cc: Jesper Nilsson Cc: Christian Borntraeger Cc: Joerg Roedel Cc: Sebastian Ott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index e086f9e..20d919c 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -54,6 +54,7 @@ config CRIS select GENERIC_ATOMIC64 select HAVE_UID16 select VIRT_TO_BUS + select HAVE_DMA_ATTRS select ARCH_WANT_IPC_PARSE_VERSION select GENERIC_IRQ_SHOW select GENERIC_IOMAP diff --git a/arch/cris/arch-v32/drivers/pci/dma.c b/arch/cris/arch-v32/drivers/pci/dma.c index ee55578..8d5efa5 100644 --- a/arch/cris/arch-v32/drivers/pci/dma.c +++ b/arch/cris/arch-v32/drivers/pci/dma.c @@ -16,21 +16,18 @@ #include #include -void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp) +static void *v32_dma_alloc(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, struct dma_attrs *attrs) { void *ret; - int order = get_order(size); + /* ignore region specifiers */ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM); - if (dma_alloc_from_coherent(dev, size, dma_handle, &ret)) - return ret; - if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff)) gfp |= GFP_DMA; - ret = (void *)__get_free_pages(gfp, order); + ret = (void *)__get_free_pages(gfp, get_order(size)); if (ret != NULL) { memset(ret, 0, size); @@ -39,12 +36,45 @@ void *dma_alloc_coherent(struct device *dev, size_t size, return ret; } -void dma_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle) +static void v32_dma_free(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle, struct dma_attrs *attrs) +{ + free_pages((unsigned long)vaddr, get_order(size)); +} + +static inline dma_addr_t v32_dma_map_page(struct device 
*dev, + struct page *page, unsigned long offset, size_t size, + enum dma_data_direction direction, + struct dma_attrs *attrs) { - int order = get_order(size); + return page_to_phys(page) + offset; +} - if (!dma_release_from_coherent(dev, order, vaddr)) - free_pages((unsigned long)vaddr, order); +static inline int v32_dma_map_sg(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + printk("Map sg\n"); + return nents; +} + +static inline int v32_dma_supported(struct device *dev, u64 mask) +{ + /* + * we fall back to GFP_DMA when the mask isn't all 1s, + * so we can't guarantee allocations that must be + * within a tighter range than GFP_DMA.. + */ + if (mask < 0x00ffffff) + return 0; + return 1; } +struct dma_map_ops v32_dma_ops = { + .alloc = v32_dma_alloc, + .free = v32_dma_free, + .map_page = v32_dma_map_page, + .map_sg = v32_dma_map_sg, + .dma_supported = v32_dma_supported, +}; +EXPORT_SYMBOL(v32_dma_ops); diff --git a/arch/cris/include/asm/dma-mapping.h b/arch/cris/include/asm/dma-mapping.h index 57f794e..34e7c7c7 100644 --- a/arch/cris/include/asm/dma-mapping.h +++ b/arch/cris/include/asm/dma-mapping.h @@ -1,156 +1,22 @@ -/* DMA mapping. 
Nothing tricky here, just virt_to_phys */ - #ifndef _ASM_CRIS_DMA_MAPPING_H #define _ASM_CRIS_DMA_MAPPING_H -#include -#include -#include - -#include -#include - -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) - #ifdef CONFIG_PCI -#include - -void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag); +extern struct dma_map_ops v32_dma_ops; -void dma_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle); -#else -static inline void * -dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t flag) +static inline struct dma_map_ops *get_dma_ops(struct device *dev) { - BUG(); - return NULL; + return &v32_dma_ops; } - -static inline void -dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t dma_handle) +#else +static inline struct dma_map_ops *get_dma_ops(struct device *dev) { - BUG(); + BUG(); + return NULL; } #endif -static inline dma_addr_t -dma_map_single(struct device *dev, void *ptr, size_t size, - enum dma_data_direction direction) -{ - BUG_ON(direction == DMA_NONE); - return virt_to_phys(ptr); -} - -static inline void -dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, - enum dma_data_direction direction) -{ - BUG_ON(direction == DMA_NONE); -} - -static inline int -dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction direction) -{ - printk("Map sg\n"); - return nents; -} - -static inline dma_addr_t -dma_map_page(struct device *dev, struct page *page, unsigned long offset, - size_t size, enum dma_data_direction direction) -{ - BUG_ON(direction == DMA_NONE); - return page_to_phys(page) + offset; -} - -static inline void -dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, - enum dma_data_direction direction) -{ - BUG_ON(direction == DMA_NONE); -} - - -static inline void 
-dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries, - enum dma_data_direction direction) -{ - BUG_ON(direction == DMA_NONE); -} - -static inline void -dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ -} - -static inline void -dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ -} - -static inline void -dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction direction) -{ -} -static inline void -dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction direction) -{ -} - -static inline void -dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, - enum dma_data_direction direction) -{ -} - -static inline void -dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, - enum dma_data_direction direction) -{ -} - -static inline int -dma_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - return 0; -} - -static inline int -dma_supported(struct device *dev, u64 mask) -{ - /* - * we fall back to GFP_DMA when the mask isn't all 1s, - * so we can't guarantee allocations that must be - * within a tighter range than GFP_DMA.. 
- */ - if(mask < 0x00ffffff) - return 0; - - return 1; -} - -static inline int -dma_set_mask(struct device *dev, u64 mask) -{ - if(!dev->dma_mask || !dma_supported(dev, mask)) - return -EIO; - - *dev->dma_mask = mask; - - return 0; -} +#include static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, @@ -158,15 +24,4 @@ dma_cache_sync(struct device *dev, void *vaddr, size_t size, { } -/* drivers/base/dma-mapping.c */ -extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size); -extern int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, - void *cpu_addr, dma_addr_t dma_addr, - size_t size); - -#define dma_mmap_coherent(d, v, c, h, s) dma_common_mmap(d, v, c, h, s) -#define dma_get_sgtable(d, t, v, h, s) dma_common_get_sgtable(d, t, v, h, s) - - #endif -- cgit v0.10.2 From 5a1a67f1d7fef42eaa5a4cc3d48094fbec75d685 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jan 2016 15:01:41 -0800 Subject: nios2: convert to dma_map_ops Signed-off-by: Christoph Hellwig Cc: Ley Foon Tan Cc: Christian Borntraeger Cc: Joerg Roedel Cc: Sebastian Ott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index 4375554..4b2504d 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -16,6 +16,7 @@ config NIOS2 select SOC_BUS select SPARSE_IRQ select USB_ARCH_HAS_HCD if USB_SUPPORT + select HAVE_DMA_ATTRS config GENERIC_CSUM def_bool y diff --git a/arch/nios2/include/asm/dma-mapping.h b/arch/nios2/include/asm/dma-mapping.h index b556723..bec8ac8 100644 --- a/arch/nios2/include/asm/dma-mapping.h +++ b/arch/nios2/include/asm/dma-mapping.h @@ -10,131 +10,20 @@ #ifndef _ASM_NIOS2_DMA_MAPPING_H #define _ASM_NIOS2_DMA_MAPPING_H -#include -#include -#include +extern struct dma_map_ops nios2_dma_ops; -static inline void __dma_sync_for_device(void *vaddr, size_t size, - enum dma_data_direction direction) +static 
inline struct dma_map_ops *get_dma_ops(struct device *dev) { - switch (direction) { - case DMA_FROM_DEVICE: - invalidate_dcache_range((unsigned long)vaddr, - (unsigned long)(vaddr + size)); - break; - case DMA_TO_DEVICE: - /* - * We just need to flush the caches here , but Nios2 flush - * instruction will do both writeback and invalidate. - */ - case DMA_BIDIRECTIONAL: /* flush and invalidate */ - flush_dcache_range((unsigned long)vaddr, - (unsigned long)(vaddr + size)); - break; - default: - BUG(); - } -} - -static inline void __dma_sync_for_cpu(void *vaddr, size_t size, - enum dma_data_direction direction) -{ - switch (direction) { - case DMA_BIDIRECTIONAL: - case DMA_FROM_DEVICE: - invalidate_dcache_range((unsigned long)vaddr, - (unsigned long)(vaddr + size)); - break; - case DMA_TO_DEVICE: - break; - default: - BUG(); - } -} - -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) - -void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag); - -void dma_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle); - -static inline dma_addr_t dma_map_single(struct device *dev, void *ptr, - size_t size, - enum dma_data_direction direction) -{ - BUG_ON(!valid_dma_direction(direction)); - __dma_sync_for_device(ptr, size, direction); - return virt_to_phys(ptr); -} - -static inline void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, - size_t size, enum dma_data_direction direction) -{ -} - -extern int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction direction); -extern dma_addr_t dma_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction direction); -extern void dma_unmap_page(struct device *dev, dma_addr_t dma_address, - size_t size, enum dma_data_direction direction); -extern void dma_unmap_sg(struct device 
*dev, struct scatterlist *sg, - int nhwentries, enum dma_data_direction direction); -extern void dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, - size_t size, enum dma_data_direction direction); -extern void dma_sync_single_for_device(struct device *dev, - dma_addr_t dma_handle, size_t size, enum dma_data_direction direction); -extern void dma_sync_single_range_for_cpu(struct device *dev, - dma_addr_t dma_handle, unsigned long offset, size_t size, - enum dma_data_direction direction); -extern void dma_sync_single_range_for_device(struct device *dev, - dma_addr_t dma_handle, unsigned long offset, size_t size, - enum dma_data_direction direction); -extern void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, - int nelems, enum dma_data_direction direction); -extern void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, - int nelems, enum dma_data_direction direction); - -static inline int dma_supported(struct device *dev, u64 mask) -{ - return 1; -} - -static inline int dma_set_mask(struct device *dev, u64 mask) -{ - if (!dev->dma_mask || !dma_supported(dev, mask)) - return -EIO; - - *dev->dma_mask = mask; - - return 0; -} - -static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - return 0; + return &nios2_dma_ops; } /* -* dma_alloc_noncoherent() returns non-cacheable memory, so there's no need to -* do any flushing here. -*/ + * dma_alloc_noncoherent() returns non-cacheable memory, so there's no need to + * do any flushing here. 
+ */ static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction direction) { } -/* drivers/base/dma-mapping.c */ -extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size); -extern int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, - void *cpu_addr, dma_addr_t dma_addr, - size_t size); - -#define dma_mmap_coherent(d, v, c, h, s) dma_common_mmap(d, v, c, h, s) -#define dma_get_sgtable(d, t, v, h, s) dma_common_get_sgtable(d, t, v, h, s) - #endif /* _ASM_NIOS2_DMA_MAPPING_H */ diff --git a/arch/nios2/mm/dma-mapping.c b/arch/nios2/mm/dma-mapping.c index ac5da75..90422c3 100644 --- a/arch/nios2/mm/dma-mapping.c +++ b/arch/nios2/mm/dma-mapping.c @@ -20,9 +20,46 @@ #include #include +static inline void __dma_sync_for_device(void *vaddr, size_t size, + enum dma_data_direction direction) +{ + switch (direction) { + case DMA_FROM_DEVICE: + invalidate_dcache_range((unsigned long)vaddr, + (unsigned long)(vaddr + size)); + break; + case DMA_TO_DEVICE: + /* + * We just need to flush the caches here , but Nios2 flush + * instruction will do both writeback and invalidate. 
+ */ + case DMA_BIDIRECTIONAL: /* flush and invalidate */ + flush_dcache_range((unsigned long)vaddr, + (unsigned long)(vaddr + size)); + break; + default: + BUG(); + } +} -void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp) +static inline void __dma_sync_for_cpu(void *vaddr, size_t size, + enum dma_data_direction direction) +{ + switch (direction) { + case DMA_BIDIRECTIONAL: + case DMA_FROM_DEVICE: + invalidate_dcache_range((unsigned long)vaddr, + (unsigned long)(vaddr + size)); + break; + case DMA_TO_DEVICE: + break; + default: + BUG(); + } +} + +static void *nios2_dma_alloc(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, struct dma_attrs *attrs) { void *ret; @@ -45,24 +82,21 @@ void *dma_alloc_coherent(struct device *dev, size_t size, return ret; } -EXPORT_SYMBOL(dma_alloc_coherent); -void dma_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle) +static void nios2_dma_free(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle, struct dma_attrs *attrs) { unsigned long addr = (unsigned long) CAC_ADDR((unsigned long) vaddr); free_pages(addr, get_order(size)); } -EXPORT_SYMBOL(dma_free_coherent); -int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction direction) +static int nios2_dma_map_sg(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction direction, + struct dma_attrs *attrs) { int i; - BUG_ON(!valid_dma_direction(direction)); - for_each_sg(sg, sg, nents, i) { void *addr; @@ -75,40 +109,32 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, return nents; } -EXPORT_SYMBOL(dma_map_sg); -dma_addr_t dma_map_page(struct device *dev, struct page *page, +static dma_addr_t nios2_dma_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, - enum dma_data_direction direction) + enum dma_data_direction direction, + struct dma_attrs *attrs) { - void 
*addr; - - BUG_ON(!valid_dma_direction(direction)); + void *addr = page_address(page) + offset; - addr = page_address(page) + offset; __dma_sync_for_device(addr, size, direction); - return page_to_phys(page) + offset; } -EXPORT_SYMBOL(dma_map_page); -void dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, - enum dma_data_direction direction) +static void nios2_dma_unmap_page(struct device *dev, dma_addr_t dma_address, + size_t size, enum dma_data_direction direction, + struct dma_attrs *attrs) { - BUG_ON(!valid_dma_direction(direction)); - __dma_sync_for_cpu(phys_to_virt(dma_address), size, direction); } -EXPORT_SYMBOL(dma_unmap_page); -void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries, - enum dma_data_direction direction) +static void nios2_dma_unmap_sg(struct device *dev, struct scatterlist *sg, + int nhwentries, enum dma_data_direction direction, + struct dma_attrs *attrs) { void *addr; int i; - BUG_ON(!valid_dma_direction(direction)); - if (direction == DMA_TO_DEVICE) return; @@ -118,69 +144,54 @@ void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries, __dma_sync_for_cpu(addr, sg->length, direction); } } -EXPORT_SYMBOL(dma_unmap_sg); - -void dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, - size_t size, enum dma_data_direction direction) -{ - BUG_ON(!valid_dma_direction(direction)); - __dma_sync_for_cpu(phys_to_virt(dma_handle), size, direction); -} -EXPORT_SYMBOL(dma_sync_single_for_cpu); - -void dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, - size_t size, enum dma_data_direction direction) -{ - BUG_ON(!valid_dma_direction(direction)); - - __dma_sync_for_device(phys_to_virt(dma_handle), size, direction); -} -EXPORT_SYMBOL(dma_sync_single_for_device); - -void dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction direction) +static void 
nios2_dma_sync_single_for_cpu(struct device *dev, + dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction) { - BUG_ON(!valid_dma_direction(direction)); - __dma_sync_for_cpu(phys_to_virt(dma_handle), size, direction); } -EXPORT_SYMBOL(dma_sync_single_range_for_cpu); -void dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction direction) +static void nios2_dma_sync_single_for_device(struct device *dev, + dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction) { - BUG_ON(!valid_dma_direction(direction)); - __dma_sync_for_device(phys_to_virt(dma_handle), size, direction); } -EXPORT_SYMBOL(dma_sync_single_range_for_device); -void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, - enum dma_data_direction direction) +static void nios2_dma_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sg, int nelems, + enum dma_data_direction direction) { int i; - BUG_ON(!valid_dma_direction(direction)); - /* Make sure that gcc doesn't leave the empty loop body. */ for_each_sg(sg, sg, nelems, i) __dma_sync_for_cpu(sg_virt(sg), sg->length, direction); } -EXPORT_SYMBOL(dma_sync_sg_for_cpu); -void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, - int nelems, enum dma_data_direction direction) +static void nios2_dma_sync_sg_for_device(struct device *dev, + struct scatterlist *sg, int nelems, + enum dma_data_direction direction) { int i; - BUG_ON(!valid_dma_direction(direction)); - /* Make sure that gcc doesn't leave the empty loop body. 
*/ for_each_sg(sg, sg, nelems, i) __dma_sync_for_device(sg_virt(sg), sg->length, direction); } -EXPORT_SYMBOL(dma_sync_sg_for_device); + +struct dma_map_ops nios2_dma_ops = { + .alloc = nios2_dma_alloc, + .free = nios2_dma_free, + .map_page = nios2_dma_map_page, + .unmap_page = nios2_dma_unmap_page, + .map_sg = nios2_dma_map_sg, + .unmap_sg = nios2_dma_unmap_sg, + .sync_single_for_device = nios2_dma_sync_single_for_device, + .sync_single_for_cpu = nios2_dma_sync_single_for_cpu, + .sync_sg_for_cpu = nios2_dma_sync_sg_for_cpu, + .sync_sg_for_device = nios2_dma_sync_sg_for_device, +}; +EXPORT_SYMBOL(nios2_dma_ops); -- cgit v0.10.2 From eae075196305549513335c2fc7d5d63712246bfd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jan 2016 15:01:44 -0800 Subject: frv: convert to dma_map_ops Signed-off-by: Christoph Hellwig Cc: David Howells Cc: Christian Borntraeger Cc: Joerg Roedel Cc: Sebastian Ott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index 03bfd6b..e383781 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -15,6 +15,8 @@ config FRV select OLD_SIGSUSPEND3 select OLD_SIGACTION select HAVE_DEBUG_STACKOVERFLOW + select ARCH_NO_COHERENT_DMA_MMAP + select HAVE_DMA_ATTRS config ZONE_DMA bool diff --git a/arch/frv/include/asm/dma-mapping.h b/arch/frv/include/asm/dma-mapping.h index 2840adc..750951c 100644 --- a/arch/frv/include/asm/dma-mapping.h +++ b/arch/frv/include/asm/dma-mapping.h @@ -1,128 +1,17 @@ #ifndef _ASM_DMA_MAPPING_H #define _ASM_DMA_MAPPING_H -#include -#include #include #include -#include - -/* - * See Documentation/DMA-API.txt for the description of how the - * following DMA API should work. 
- */ - -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) extern unsigned long __nongprelbss dma_coherent_mem_start; extern unsigned long __nongprelbss dma_coherent_mem_end; -void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp); -void dma_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle); - -extern dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size, - enum dma_data_direction direction); - -static inline -void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, - enum dma_data_direction direction) -{ - BUG_ON(direction == DMA_NONE); -} - -extern int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction direction); - -static inline -void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries, - enum dma_data_direction direction) -{ - BUG_ON(direction == DMA_NONE); -} - -extern -dma_addr_t dma_map_page(struct device *dev, struct page *page, unsigned long offset, - size_t size, enum dma_data_direction direction); - -static inline -void dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, - enum dma_data_direction direction) -{ - BUG_ON(direction == DMA_NONE); -} - - -static inline -void dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ -} - -static inline -void dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ - flush_write_buffers(); -} +extern struct dma_map_ops frv_dma_ops; -static inline -void dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction direction) +static inline struct dma_map_ops *get_dma_ops(struct device *dev) { -} - -static inline -void 
dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - flush_write_buffers(); -} - -static inline -void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, - enum dma_data_direction direction) -{ -} - -static inline -void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, - enum dma_data_direction direction) -{ - flush_write_buffers(); -} - -static inline -int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - return 0; -} - -static inline -int dma_supported(struct device *dev, u64 mask) -{ - /* - * we fall back to GFP_DMA when the mask isn't all 1s, - * so we can't guarantee allocations that must be - * within a tighter range than GFP_DMA.. - */ - if (mask < 0x00ffffff) - return 0; - - return 1; -} - -static inline -int dma_set_mask(struct device *dev, u64 mask) -{ - if (!dev->dma_mask || !dma_supported(dev, mask)) - return -EIO; - - *dev->dma_mask = mask; - - return 0; + return &frv_dma_ops; } static inline @@ -132,19 +21,6 @@ void dma_cache_sync(struct device *dev, void *vaddr, size_t size, flush_write_buffers(); } -/* Not supported for now */ -static inline int dma_mmap_coherent(struct device *dev, - struct vm_area_struct *vma, void *cpu_addr, - dma_addr_t dma_addr, size_t size) -{ - return -EINVAL; -} - -static inline int dma_get_sgtable(struct device *dev, struct sg_table *sgt, - void *cpu_addr, dma_addr_t dma_addr, - size_t size) -{ - return -EINVAL; -} +#include #endif /* _ASM_DMA_MAPPING_H */ diff --git a/arch/frv/mb93090-mb00/pci-dma-nommu.c b/arch/frv/mb93090-mb00/pci-dma-nommu.c index 8eeea0d..082be49 100644 --- a/arch/frv/mb93090-mb00/pci-dma-nommu.c +++ b/arch/frv/mb93090-mb00/pci-dma-nommu.c @@ -34,7 +34,8 @@ struct dma_alloc_record { static DEFINE_SPINLOCK(dma_alloc_lock); static LIST_HEAD(dma_alloc_list); -void *dma_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t 
*dma_handle, gfp_t gfp) +static void *frv_dma_alloc(struct device *hwdev, size_t size, dma_addr_t *dma_handle, + gfp_t gfp, struct dma_attrs *attrs) { struct dma_alloc_record *new; struct list_head *this = &dma_alloc_list; @@ -84,9 +85,8 @@ void *dma_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_hand return NULL; } -EXPORT_SYMBOL(dma_alloc_coherent); - -void dma_free_coherent(struct device *hwdev, size_t size, void *vaddr, dma_addr_t dma_handle) +static void frv_dma_free(struct device *hwdev, size_t size, void *vaddr, + dma_addr_t dma_handle, struct dma_attrs *attrs) { struct dma_alloc_record *rec; unsigned long flags; @@ -105,22 +105,9 @@ void dma_free_coherent(struct device *hwdev, size_t size, void *vaddr, dma_addr_ BUG(); } -EXPORT_SYMBOL(dma_free_coherent); - -dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size, - enum dma_data_direction direction) -{ - BUG_ON(direction == DMA_NONE); - - frv_cache_wback_inv((unsigned long) ptr, (unsigned long) ptr + size); - - return virt_to_bus(ptr); -} - -EXPORT_SYMBOL(dma_map_single); - -int dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, - enum dma_data_direction direction) +static int frv_dma_map_sg(struct device *dev, struct scatterlist *sglist, + int nents, enum dma_data_direction direction, + struct dma_attrs *attrs) { int i; struct scatterlist *sg; @@ -135,14 +122,49 @@ int dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, return nents; } -EXPORT_SYMBOL(dma_map_sg); - -dma_addr_t dma_map_page(struct device *dev, struct page *page, unsigned long offset, - size_t size, enum dma_data_direction direction) +static dma_addr_t frv_dma_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction, struct dma_attrs *attrs) { BUG_ON(direction == DMA_NONE); flush_dcache_page(page); return (dma_addr_t) page_to_phys(page) + offset; } -EXPORT_SYMBOL(dma_map_page); +static void 
frv_dma_sync_single_for_device(struct device *dev, + dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction) +{ + flush_write_buffers(); +} + +static void frv_dma_sync_sg_for_device(struct device *dev, + struct scatterlist *sg, int nelems, + enum dma_data_direction direction) +{ + flush_write_buffers(); +} + + +static int frv_dma_supported(struct device *dev, u64 mask) +{ + /* + * we fall back to GFP_DMA when the mask isn't all 1s, + * so we can't guarantee allocations that must be + * within a tighter range than GFP_DMA.. + */ + if (mask < 0x00ffffff) + return 0; + return 1; +} + +struct dma_map_ops frv_dma_ops = { + .alloc = frv_dma_alloc, + .free = frv_dma_free, + .map_page = frv_dma_map_page, + .map_sg = frv_dma_map_sg, + .sync_single_for_device = frv_dma_sync_single_for_device, + .sync_sg_for_device = frv_dma_sync_sg_for_device, + .dma_supported = frv_dma_supported, +}; +EXPORT_SYMBOL(frv_dma_ops); diff --git a/arch/frv/mb93090-mb00/pci-dma.c b/arch/frv/mb93090-mb00/pci-dma.c index 4d1f01d..316b7b6 100644 --- a/arch/frv/mb93090-mb00/pci-dma.c +++ b/arch/frv/mb93090-mb00/pci-dma.c @@ -18,7 +18,9 @@ #include #include -void *dma_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) +static void *frv_dma_alloc(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, + struct dma_attrs *attrs) { void *ret; @@ -29,29 +31,15 @@ void *dma_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_hand return ret; } -EXPORT_SYMBOL(dma_alloc_coherent); - -void dma_free_coherent(struct device *hwdev, size_t size, void *vaddr, dma_addr_t dma_handle) +static void frv_dma_free(struct device *hwdev, size_t size, void *vaddr, + dma_addr_t dma_handle, struct dma_attrs *attrs) { consistent_free(vaddr); } -EXPORT_SYMBOL(dma_free_coherent); - -dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size, - enum dma_data_direction direction) -{ - BUG_ON(direction == DMA_NONE); - - 
frv_cache_wback_inv((unsigned long) ptr, (unsigned long) ptr + size); - - return virt_to_bus(ptr); -} - -EXPORT_SYMBOL(dma_map_single); - -int dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, - enum dma_data_direction direction) +static int frv_dma_map_sg(struct device *dev, struct scatterlist *sglist, + int nents, enum dma_data_direction direction, + struct dma_attrs *attrs) { unsigned long dampr2; void *vaddr; @@ -79,14 +67,48 @@ int dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, return nents; } -EXPORT_SYMBOL(dma_map_sg); - -dma_addr_t dma_map_page(struct device *dev, struct page *page, unsigned long offset, - size_t size, enum dma_data_direction direction) +static dma_addr_t frv_dma_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction, struct dma_attrs *attrs) { - BUG_ON(direction == DMA_NONE); flush_dcache_page(page); return (dma_addr_t) page_to_phys(page) + offset; } -EXPORT_SYMBOL(dma_map_page); +static void frv_dma_sync_single_for_device(struct device *dev, + dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction) +{ + flush_write_buffers(); +} + +static void frv_dma_sync_sg_for_device(struct device *dev, + struct scatterlist *sg, int nelems, + enum dma_data_direction direction) +{ + flush_write_buffers(); +} + + +static int frv_dma_supported(struct device *dev, u64 mask) +{ + /* + * we fall back to GFP_DMA when the mask isn't all 1s, + * so we can't guarantee allocations that must be + * within a tighter range than GFP_DMA.. 
+ */ + if (mask < 0x00ffffff) + return 0; + return 1; +} + +struct dma_map_ops frv_dma_ops = { + .alloc = frv_dma_alloc, + .free = frv_dma_free, + .map_page = frv_dma_map_page, + .map_sg = frv_dma_map_sg, + .sync_single_for_device = frv_dma_sync_single_for_device, + .sync_sg_for_device = frv_dma_sync_sg_for_device, + .dma_supported = frv_dma_supported, +}; +EXPORT_SYMBOL(frv_dma_ops); -- cgit v0.10.2 From 79387179e2e4fede52326e4c4e26145dbd6b505c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jan 2016 15:01:47 -0800 Subject: parisc: convert to dma_map_ops Signed-off-by: Christoph Hellwig Tested-by: Helge Deller Acked-by: Helge Deller Cc: Christian Borntraeger Cc: Joerg Roedel Cc: Sebastian Ott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 7c34caf..1489351 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -29,6 +29,8 @@ config PARISC select TTY # Needed for pdc_cons.c select HAVE_DEBUG_STACKOVERFLOW select HAVE_ARCH_AUDITSYSCALL + select ARCH_NO_COHERENT_DMA_MMAP + select HAVE_DMA_ATTRS help The PA-RISC microprocessor is designed by Hewlett-Packard and used diff --git a/arch/parisc/include/asm/dma-mapping.h b/arch/parisc/include/asm/dma-mapping.h index d8d60a5..4de5186 100644 --- a/arch/parisc/include/asm/dma-mapping.h +++ b/arch/parisc/include/asm/dma-mapping.h @@ -1,30 +1,11 @@ #ifndef _PARISC_DMA_MAPPING_H #define _PARISC_DMA_MAPPING_H -#include -#include #include -/* See Documentation/DMA-API-HOWTO.txt */ -struct hppa_dma_ops { - int (*dma_supported)(struct device *dev, u64 mask); - void *(*alloc_consistent)(struct device *dev, size_t size, dma_addr_t *iova, gfp_t flag); - void *(*alloc_noncoherent)(struct device *dev, size_t size, dma_addr_t *iova, gfp_t flag); - void (*free_consistent)(struct device *dev, size_t size, void *vaddr, dma_addr_t iova); - dma_addr_t (*map_single)(struct device *dev, void *addr, size_t size, enum dma_data_direction direction); - void 
(*unmap_single)(struct device *dev, dma_addr_t iova, size_t size, enum dma_data_direction direction); - int (*map_sg)(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction); - void (*unmap_sg)(struct device *dev, struct scatterlist *sg, int nhwents, enum dma_data_direction direction); - void (*dma_sync_single_for_cpu)(struct device *dev, dma_addr_t iova, unsigned long offset, size_t size, enum dma_data_direction direction); - void (*dma_sync_single_for_device)(struct device *dev, dma_addr_t iova, unsigned long offset, size_t size, enum dma_data_direction direction); - void (*dma_sync_sg_for_cpu)(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction direction); - void (*dma_sync_sg_for_device)(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction direction); -}; - /* -** We could live without the hppa_dma_ops indirection if we didn't want -** to support 4 different coherent dma models with one binary (they will -** someday be loadable modules): +** We need to support 4 different coherent dma models with one binary: +** ** I/O MMU consistent method dma_sync behavior ** ============= ====================== ======================= ** a) PA-7x00LC uncachable host memory flush/purge @@ -40,158 +21,22 @@ struct hppa_dma_ops { */ #ifdef CONFIG_PA11 -extern struct hppa_dma_ops pcxl_dma_ops; -extern struct hppa_dma_ops pcx_dma_ops; +extern struct dma_map_ops pcxl_dma_ops; +extern struct dma_map_ops pcx_dma_ops; #endif -extern struct hppa_dma_ops *hppa_dma_ops; - -#define dma_alloc_attrs(d, s, h, f, a) dma_alloc_coherent(d, s, h, f) -#define dma_free_attrs(d, s, h, f, a) dma_free_coherent(d, s, h, f) - -static inline void * -dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t flag) -{ - return hppa_dma_ops->alloc_consistent(dev, size, dma_handle, flag); -} - -static inline void * -dma_alloc_noncoherent(struct device *dev, size_t size, dma_addr_t *dma_handle, 
- gfp_t flag) -{ - return hppa_dma_ops->alloc_noncoherent(dev, size, dma_handle, flag); -} - -static inline void -dma_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle) -{ - hppa_dma_ops->free_consistent(dev, size, vaddr, dma_handle); -} - -static inline void -dma_free_noncoherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle) -{ - hppa_dma_ops->free_consistent(dev, size, vaddr, dma_handle); -} - -static inline dma_addr_t -dma_map_single(struct device *dev, void *ptr, size_t size, - enum dma_data_direction direction) -{ - return hppa_dma_ops->map_single(dev, ptr, size, direction); -} - -static inline void -dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, - enum dma_data_direction direction) -{ - hppa_dma_ops->unmap_single(dev, dma_addr, size, direction); -} - -static inline int -dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction direction) -{ - return hppa_dma_ops->map_sg(dev, sg, nents, direction); -} - -static inline void -dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries, - enum dma_data_direction direction) -{ - hppa_dma_ops->unmap_sg(dev, sg, nhwentries, direction); -} - -static inline dma_addr_t -dma_map_page(struct device *dev, struct page *page, unsigned long offset, - size_t size, enum dma_data_direction direction) -{ - return dma_map_single(dev, (page_address(page) + (offset)), size, direction); -} - -static inline void -dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, - enum dma_data_direction direction) -{ - dma_unmap_single(dev, dma_address, size, direction); -} - - -static inline void -dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ - if(hppa_dma_ops->dma_sync_single_for_cpu) - hppa_dma_ops->dma_sync_single_for_cpu(dev, dma_handle, 0, size, direction); -} - -static inline void -dma_sync_single_for_device(struct device 
*dev, dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ - if(hppa_dma_ops->dma_sync_single_for_device) - hppa_dma_ops->dma_sync_single_for_device(dev, dma_handle, 0, size, direction); -} - -static inline void -dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - if(hppa_dma_ops->dma_sync_single_for_cpu) - hppa_dma_ops->dma_sync_single_for_cpu(dev, dma_handle, offset, size, direction); -} +extern struct dma_map_ops *hppa_dma_ops; -static inline void -dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction direction) +static inline struct dma_map_ops *get_dma_ops(struct device *dev) { - if(hppa_dma_ops->dma_sync_single_for_device) - hppa_dma_ops->dma_sync_single_for_device(dev, dma_handle, offset, size, direction); -} - -static inline void -dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, - enum dma_data_direction direction) -{ - if(hppa_dma_ops->dma_sync_sg_for_cpu) - hppa_dma_ops->dma_sync_sg_for_cpu(dev, sg, nelems, direction); -} - -static inline void -dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, - enum dma_data_direction direction) -{ - if(hppa_dma_ops->dma_sync_sg_for_device) - hppa_dma_ops->dma_sync_sg_for_device(dev, sg, nelems, direction); -} - -static inline int -dma_supported(struct device *dev, u64 mask) -{ - return hppa_dma_ops->dma_supported(dev, mask); -} - -static inline int -dma_set_mask(struct device *dev, u64 mask) -{ - if(!dev->dma_mask || !dma_supported(dev, mask)) - return -EIO; - - *dev->dma_mask = mask; - - return 0; + return hppa_dma_ops; } static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction direction) { - if(hppa_dma_ops->dma_sync_single_for_cpu) + if (hppa_dma_ops->sync_single_for_cpu) flush_kernel_dcache_range((unsigned 
long)vaddr, size); } @@ -238,22 +83,6 @@ struct parisc_device; void * sba_get_iommu(struct parisc_device *dev); #endif -/* At the moment, we panic on error for IOMMU resource exaustion */ -#define dma_mapping_error(dev, x) 0 - -/* This API cannot be supported on PA-RISC */ -static inline int dma_mmap_coherent(struct device *dev, - struct vm_area_struct *vma, void *cpu_addr, - dma_addr_t dma_addr, size_t size) -{ - return -EINVAL; -} - -static inline int dma_get_sgtable(struct device *dev, struct sg_table *sgt, - void *cpu_addr, dma_addr_t dma_addr, - size_t size) -{ - return -EINVAL; -} +#include #endif diff --git a/arch/parisc/kernel/drivers.c b/arch/parisc/kernel/drivers.c index dba508f..f815066 100644 --- a/arch/parisc/kernel/drivers.c +++ b/arch/parisc/kernel/drivers.c @@ -40,7 +40,7 @@ #include /* See comments in include/asm-parisc/pci.h */ -struct hppa_dma_ops *hppa_dma_ops __read_mostly; +struct dma_map_ops *hppa_dma_ops __read_mostly; EXPORT_SYMBOL(hppa_dma_ops); static struct device root = { diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c index b9402c9..a27e492 100644 --- a/arch/parisc/kernel/pci-dma.c +++ b/arch/parisc/kernel/pci-dma.c @@ -413,7 +413,8 @@ pcxl_dma_init(void) __initcall(pcxl_dma_init); -static void * pa11_dma_alloc_consistent (struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag) +static void *pa11_dma_alloc(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag, struct dma_attrs *attrs) { unsigned long vaddr; unsigned long paddr; @@ -439,7 +440,8 @@ static void * pa11_dma_alloc_consistent (struct device *dev, size_t size, dma_ad return (void *)vaddr; } -static void pa11_dma_free_consistent (struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle) +static void pa11_dma_free(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle, struct dma_attrs *attrs) { int order; @@ -450,15 +452,20 @@ static void pa11_dma_free_consistent (struct device *dev, size_t size, 
void *vad free_pages((unsigned long)__va(dma_handle), order); } -static dma_addr_t pa11_dma_map_single(struct device *dev, void *addr, size_t size, enum dma_data_direction direction) +static dma_addr_t pa11_dma_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction, struct dma_attrs *attrs) { + void *addr = page_address(page) + offset; BUG_ON(direction == DMA_NONE); flush_kernel_dcache_range((unsigned long) addr, size); return virt_to_phys(addr); } -static void pa11_dma_unmap_single(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction direction) +static void pa11_dma_unmap_page(struct device *dev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction direction, + struct dma_attrs *attrs) { BUG_ON(direction == DMA_NONE); @@ -475,7 +482,9 @@ static void pa11_dma_unmap_single(struct device *dev, dma_addr_t dma_handle, siz return; } -static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) +static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist, + int nents, enum dma_data_direction direction, + struct dma_attrs *attrs) { int i; struct scatterlist *sg; @@ -492,7 +501,9 @@ static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist, int n return nents; } -static void pa11_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) +static void pa11_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, + int nents, enum dma_data_direction direction, + struct dma_attrs *attrs) { int i; struct scatterlist *sg; @@ -509,18 +520,24 @@ static void pa11_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, in return; } -static void pa11_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, unsigned long offset, size_t size, enum dma_data_direction direction) +static void pa11_dma_sync_single_for_cpu(struct 
device *dev, + dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction) { BUG_ON(direction == DMA_NONE); - flush_kernel_dcache_range((unsigned long) phys_to_virt(dma_handle) + offset, size); + flush_kernel_dcache_range((unsigned long) phys_to_virt(dma_handle), + size); } -static void pa11_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, unsigned long offset, size_t size, enum dma_data_direction direction) +static void pa11_dma_sync_single_for_device(struct device *dev, + dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction) { BUG_ON(direction == DMA_NONE); - flush_kernel_dcache_range((unsigned long) phys_to_virt(dma_handle) + offset, size); + flush_kernel_dcache_range((unsigned long) phys_to_virt(dma_handle), + size); } static void pa11_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction) @@ -545,32 +562,28 @@ static void pa11_dma_sync_sg_for_device(struct device *dev, struct scatterlist * flush_kernel_vmap_range(sg_virt(sg), sg->length); } -struct hppa_dma_ops pcxl_dma_ops = { +struct dma_map_ops pcxl_dma_ops = { .dma_supported = pa11_dma_supported, - .alloc_consistent = pa11_dma_alloc_consistent, - .alloc_noncoherent = pa11_dma_alloc_consistent, - .free_consistent = pa11_dma_free_consistent, - .map_single = pa11_dma_map_single, - .unmap_single = pa11_dma_unmap_single, + .alloc = pa11_dma_alloc, + .free = pa11_dma_free, + .map_page = pa11_dma_map_page, + .unmap_page = pa11_dma_unmap_page, .map_sg = pa11_dma_map_sg, .unmap_sg = pa11_dma_unmap_sg, - .dma_sync_single_for_cpu = pa11_dma_sync_single_for_cpu, - .dma_sync_single_for_device = pa11_dma_sync_single_for_device, - .dma_sync_sg_for_cpu = pa11_dma_sync_sg_for_cpu, - .dma_sync_sg_for_device = pa11_dma_sync_sg_for_device, + .sync_single_for_cpu = pa11_dma_sync_single_for_cpu, + .sync_single_for_device = pa11_dma_sync_single_for_device, + .sync_sg_for_cpu = pa11_dma_sync_sg_for_cpu, + 
.sync_sg_for_device = pa11_dma_sync_sg_for_device, }; -static void *fail_alloc_consistent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag) -{ - return NULL; -} - -static void *pa11_dma_alloc_noncoherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag) +static void *pcx_dma_alloc(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag, struct dma_attrs *attrs) { void *addr; + if (!dma_get_attr(DMA_ATTR_NON_CONSISTENT, attrs)) + return NULL; + addr = (void *)__get_free_pages(flag, get_order(size)); if (addr) *dma_handle = (dma_addr_t)virt_to_phys(addr); @@ -578,24 +591,23 @@ static void *pa11_dma_alloc_noncoherent(struct device *dev, size_t size, return addr; } -static void pa11_dma_free_noncoherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t iova) +static void pcx_dma_free(struct device *dev, size_t size, void *vaddr, + dma_addr_t iova, struct dma_attrs *attrs) { free_pages((unsigned long)vaddr, get_order(size)); return; } -struct hppa_dma_ops pcx_dma_ops = { +struct dma_map_ops pcx_dma_ops = { .dma_supported = pa11_dma_supported, - .alloc_consistent = fail_alloc_consistent, - .alloc_noncoherent = pa11_dma_alloc_noncoherent, - .free_consistent = pa11_dma_free_noncoherent, - .map_single = pa11_dma_map_single, - .unmap_single = pa11_dma_unmap_single, + .alloc = pcx_dma_alloc, + .free = pcx_dma_free, + .map_page = pa11_dma_map_page, + .unmap_page = pa11_dma_unmap_page, .map_sg = pa11_dma_map_sg, .unmap_sg = pa11_dma_unmap_sg, - .dma_sync_single_for_cpu = pa11_dma_sync_single_for_cpu, - .dma_sync_single_for_device = pa11_dma_sync_single_for_device, - .dma_sync_sg_for_cpu = pa11_dma_sync_sg_for_cpu, - .dma_sync_sg_for_device = pa11_dma_sync_sg_for_device, + .sync_single_for_cpu = pa11_dma_sync_single_for_cpu, + .sync_single_for_device = pa11_dma_sync_single_for_device, + .sync_sg_for_cpu = pa11_dma_sync_sg_for_cpu, + .sync_sg_for_device = pa11_dma_sync_sg_for_device, }; diff --git 
a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c index 8e11fb2..e24b059 100644 --- a/drivers/parisc/ccio-dma.c +++ b/drivers/parisc/ccio-dma.c @@ -786,18 +786,27 @@ ccio_map_single(struct device *dev, void *addr, size_t size, return CCIO_IOVA(iovp, offset); } + +static dma_addr_t +ccio_map_page(struct device *dev, struct page *page, unsigned long offset, + size_t size, enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + return ccio_map_single(dev, page_address(page) + offset, size, + direction); +} + + /** - * ccio_unmap_single - Unmap an address range from the IOMMU. + * ccio_unmap_page - Unmap an address range from the IOMMU. * @dev: The PCI device. * @addr: The start address of the DMA region. * @size: The length of the DMA region. * @direction: The direction of the DMA transaction (to/from device). - * - * This function implements the pci_unmap_single function. */ static void -ccio_unmap_single(struct device *dev, dma_addr_t iova, size_t size, - enum dma_data_direction direction) +ccio_unmap_page(struct device *dev, dma_addr_t iova, size_t size, + enum dma_data_direction direction, struct dma_attrs *attrs) { struct ioc *ioc; unsigned long flags; @@ -826,7 +835,7 @@ ccio_unmap_single(struct device *dev, dma_addr_t iova, size_t size, } /** - * ccio_alloc_consistent - Allocate a consistent DMA mapping. + * ccio_alloc - Allocate a consistent DMA mapping. * @dev: The PCI device. * @size: The length of the DMA region. * @dma_handle: The DMA address handed back to the device (not the cpu). @@ -834,7 +843,8 @@ ccio_unmap_single(struct device *dev, dma_addr_t iova, size_t size, * This function implements the pci_alloc_consistent function. 
*/ static void * -ccio_alloc_consistent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag) +ccio_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, + struct dma_attrs *attrs) { void *ret; #if 0 @@ -858,7 +868,7 @@ ccio_alloc_consistent(struct device *dev, size_t size, dma_addr_t *dma_handle, g } /** - * ccio_free_consistent - Free a consistent DMA mapping. + * ccio_free - Free a consistent DMA mapping. * @dev: The PCI device. * @size: The length of the DMA region. * @cpu_addr: The cpu address returned from the ccio_alloc_consistent. @@ -867,10 +877,10 @@ ccio_alloc_consistent(struct device *dev, size_t size, dma_addr_t *dma_handle, g * This function implements the pci_free_consistent function. */ static void -ccio_free_consistent(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t dma_handle) +ccio_free(struct device *dev, size_t size, void *cpu_addr, + dma_addr_t dma_handle, struct dma_attrs *attrs) { - ccio_unmap_single(dev, dma_handle, size, 0); + ccio_unmap_page(dev, dma_handle, size, 0, NULL); free_pages((unsigned long)cpu_addr, get_order(size)); } @@ -897,7 +907,7 @@ ccio_free_consistent(struct device *dev, size_t size, void *cpu_addr, */ static int ccio_map_sg(struct device *dev, struct scatterlist *sglist, int nents, - enum dma_data_direction direction) + enum dma_data_direction direction, struct dma_attrs *attrs) { struct ioc *ioc; int coalesced, filled = 0; @@ -974,7 +984,7 @@ ccio_map_sg(struct device *dev, struct scatterlist *sglist, int nents, */ static void ccio_unmap_sg(struct device *dev, struct scatterlist *sglist, int nents, - enum dma_data_direction direction) + enum dma_data_direction direction, struct dma_attrs *attrs) { struct ioc *ioc; @@ -993,27 +1003,22 @@ ccio_unmap_sg(struct device *dev, struct scatterlist *sglist, int nents, #ifdef CCIO_COLLECT_STATS ioc->usg_pages += sg_dma_len(sglist) >> PAGE_SHIFT; #endif - ccio_unmap_single(dev, sg_dma_address(sglist), - sg_dma_len(sglist), 
direction); + ccio_unmap_page(dev, sg_dma_address(sglist), + sg_dma_len(sglist), direction, NULL); ++sglist; } DBG_RUN_SG("%s() DONE (nents %d)\n", __func__, nents); } -static struct hppa_dma_ops ccio_ops = { +static struct dma_map_ops ccio_ops = { .dma_supported = ccio_dma_supported, - .alloc_consistent = ccio_alloc_consistent, - .alloc_noncoherent = ccio_alloc_consistent, - .free_consistent = ccio_free_consistent, - .map_single = ccio_map_single, - .unmap_single = ccio_unmap_single, + .alloc = ccio_alloc, + .free = ccio_free, + .map_page = ccio_map_page, + .unmap_page = ccio_unmap_page, .map_sg = ccio_map_sg, .unmap_sg = ccio_unmap_sg, - .dma_sync_single_for_cpu = NULL, /* NOP for U2/Uturn */ - .dma_sync_single_for_device = NULL, /* NOP for U2/Uturn */ - .dma_sync_sg_for_cpu = NULL, /* ditto */ - .dma_sync_sg_for_device = NULL, /* ditto */ }; #ifdef CONFIG_PROC_FS @@ -1062,7 +1067,7 @@ static int ccio_proc_info(struct seq_file *m, void *p) ioc->msingle_calls, ioc->msingle_pages, (int)((ioc->msingle_pages * 1000)/ioc->msingle_calls)); - /* KLUGE - unmap_sg calls unmap_single for each mapped page */ + /* KLUGE - unmap_sg calls unmap_page for each mapped page */ min = ioc->usingle_calls - ioc->usg_calls; max = ioc->usingle_pages - ioc->usg_pages; seq_printf(m, "pci_unmap_single: %8ld calls %8ld pages (avg %d/1000)\n", diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c index 225049b..42ec460 100644 --- a/drivers/parisc/sba_iommu.c +++ b/drivers/parisc/sba_iommu.c @@ -780,8 +780,18 @@ sba_map_single(struct device *dev, void *addr, size_t size, } +static dma_addr_t +sba_map_page(struct device *dev, struct page *page, unsigned long offset, + size_t size, enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + return sba_map_single(dev, page_address(page) + offset, size, + direction); +} + + /** - * sba_unmap_single - unmap one IOVA and free resources + * sba_unmap_page - unmap one IOVA and free resources * @dev: instance of PCI owned by the 
driver that's asking. * @iova: IOVA of driver buffer previously mapped. * @size: number of bytes mapped in driver buffer. @@ -790,8 +800,8 @@ sba_map_single(struct device *dev, void *addr, size_t size, * See Documentation/DMA-API-HOWTO.txt */ static void -sba_unmap_single(struct device *dev, dma_addr_t iova, size_t size, - enum dma_data_direction direction) +sba_unmap_page(struct device *dev, dma_addr_t iova, size_t size, + enum dma_data_direction direction, struct dma_attrs *attrs) { struct ioc *ioc; #if DELAYED_RESOURCE_CNT > 0 @@ -858,15 +868,15 @@ sba_unmap_single(struct device *dev, dma_addr_t iova, size_t size, /** - * sba_alloc_consistent - allocate/map shared mem for DMA + * sba_alloc - allocate/map shared mem for DMA * @hwdev: instance of PCI owned by the driver that's asking. * @size: number of bytes mapped in driver buffer. * @dma_handle: IOVA of new buffer. * * See Documentation/DMA-API-HOWTO.txt */ -static void *sba_alloc_consistent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp) +static void *sba_alloc(struct device *hwdev, size_t size, dma_addr_t *dma_handle, + gfp_t gfp, struct dma_attrs *attrs) { void *ret; @@ -888,7 +898,7 @@ static void *sba_alloc_consistent(struct device *hwdev, size_t size, /** - * sba_free_consistent - free/unmap shared mem for DMA + * sba_free - free/unmap shared mem for DMA * @hwdev: instance of PCI owned by the driver that's asking. * @size: number of bytes mapped in driver buffer. * @vaddr: virtual address IOVA of "consistent" buffer. 
@@ -897,10 +907,10 @@ static void *sba_alloc_consistent(struct device *hwdev, size_t size, * See Documentation/DMA-API-HOWTO.txt */ static void -sba_free_consistent(struct device *hwdev, size_t size, void *vaddr, - dma_addr_t dma_handle) +sba_free(struct device *hwdev, size_t size, void *vaddr, + dma_addr_t dma_handle, struct dma_attrs *attrs) { - sba_unmap_single(hwdev, dma_handle, size, 0); + sba_unmap_page(hwdev, dma_handle, size, 0, NULL); free_pages((unsigned long) vaddr, get_order(size)); } @@ -933,7 +943,7 @@ int dump_run_sg = 0; */ static int sba_map_sg(struct device *dev, struct scatterlist *sglist, int nents, - enum dma_data_direction direction) + enum dma_data_direction direction, struct dma_attrs *attrs) { struct ioc *ioc; int coalesced, filled = 0; @@ -1016,7 +1026,7 @@ sba_map_sg(struct device *dev, struct scatterlist *sglist, int nents, */ static void sba_unmap_sg(struct device *dev, struct scatterlist *sglist, int nents, - enum dma_data_direction direction) + enum dma_data_direction direction, struct dma_attrs *attrs) { struct ioc *ioc; #ifdef ASSERT_PDIR_SANITY @@ -1040,7 +1050,8 @@ sba_unmap_sg(struct device *dev, struct scatterlist *sglist, int nents, while (sg_dma_len(sglist) && nents--) { - sba_unmap_single(dev, sg_dma_address(sglist), sg_dma_len(sglist), direction); + sba_unmap_page(dev, sg_dma_address(sglist), sg_dma_len(sglist), + direction, NULL); #ifdef SBA_COLLECT_STATS ioc->usg_pages += ((sg_dma_address(sglist) & ~IOVP_MASK) + sg_dma_len(sglist) + IOVP_SIZE - 1) >> PAGE_SHIFT; ioc->usingle_calls--; /* kluge since call is unmap_sg() */ @@ -1058,19 +1069,14 @@ sba_unmap_sg(struct device *dev, struct scatterlist *sglist, int nents, } -static struct hppa_dma_ops sba_ops = { +static struct dma_map_ops sba_ops = { .dma_supported = sba_dma_supported, - .alloc_consistent = sba_alloc_consistent, - .alloc_noncoherent = sba_alloc_consistent, - .free_consistent = sba_free_consistent, - .map_single = sba_map_single, - .unmap_single = 
sba_unmap_single, + .alloc = sba_alloc, + .free = sba_free, + .map_page = sba_map_page, + .unmap_page = sba_unmap_page, .map_sg = sba_map_sg, .unmap_sg = sba_unmap_sg, - .dma_sync_single_for_cpu = NULL, - .dma_sync_single_for_device = NULL, - .dma_sync_sg_for_cpu = NULL, - .dma_sync_sg_for_device = NULL, }; -- cgit v0.10.2 From f151341ca00e0418f98a5131e1a4a2a3ec219653 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jan 2016 15:01:50 -0800 Subject: mn10300: convert to dma_map_ops Signed-off-by: Christoph Hellwig Cc: David Howells Cc: Christian Borntraeger Cc: Joerg Roedel Cc: Sebastian Ott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig index 78ae555..e8ebf78 100644 --- a/arch/mn10300/Kconfig +++ b/arch/mn10300/Kconfig @@ -14,6 +14,8 @@ config MN10300 select OLD_SIGSUSPEND3 select OLD_SIGACTION select HAVE_DEBUG_STACKOVERFLOW + select ARCH_NO_COHERENT_DMA_MMAP + select HAVE_DMA_ATTRS config AM33_2 def_bool n diff --git a/arch/mn10300/include/asm/dma-mapping.h b/arch/mn10300/include/asm/dma-mapping.h index a18abfc..e69b013 100644 --- a/arch/mn10300/include/asm/dma-mapping.h +++ b/arch/mn10300/include/asm/dma-mapping.h @@ -11,154 +11,14 @@ #ifndef _ASM_DMA_MAPPING_H #define _ASM_DMA_MAPPING_H -#include -#include - #include #include -/* - * See Documentation/DMA-API.txt for the description of how the - * following DMA API should work. 
- */ - -extern void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, int flag); - -extern void dma_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle); - -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent((d), (s), (h), (f)) -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent((d), (s), (v), (h)) - -static inline -dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size, - enum dma_data_direction direction) -{ - BUG_ON(direction == DMA_NONE); - mn10300_dcache_flush_inv(); - return virt_to_bus(ptr); -} - -static inline -void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, - enum dma_data_direction direction) -{ - BUG_ON(direction == DMA_NONE); -} - -static inline -int dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, - enum dma_data_direction direction) -{ - struct scatterlist *sg; - int i; - - BUG_ON(!valid_dma_direction(direction)); - WARN_ON(nents == 0 || sglist[0].length == 0); - - for_each_sg(sglist, sg, nents, i) { - BUG_ON(!sg_page(sg)); - - sg->dma_address = sg_phys(sg); - } - - mn10300_dcache_flush_inv(); - return nents; -} - -static inline -void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries, - enum dma_data_direction direction) -{ - BUG_ON(!valid_dma_direction(direction)); -} - -static inline -dma_addr_t dma_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - BUG_ON(direction == DMA_NONE); - return page_to_bus(page) + offset; -} - -static inline -void dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, - enum dma_data_direction direction) -{ - BUG_ON(direction == DMA_NONE); -} - -static inline -void dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, - size_t size, enum dma_data_direction direction) -{ -} - -static inline -void dma_sync_single_for_device(struct device *dev, dma_addr_t 
dma_handle, - size_t size, enum dma_data_direction direction) -{ - mn10300_dcache_flush_inv(); -} - -static inline -void dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction direction) -{ -} - -static inline void -dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - mn10300_dcache_flush_inv(); -} - +extern struct dma_map_ops mn10300_dma_ops; -static inline -void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, - int nelems, enum dma_data_direction direction) +static inline struct dma_map_ops *get_dma_ops(struct device *dev) { -} - -static inline -void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, - int nelems, enum dma_data_direction direction) -{ - mn10300_dcache_flush_inv(); -} - -static inline -int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - return 0; -} - -static inline -int dma_supported(struct device *dev, u64 mask) -{ - /* - * we fall back to GFP_DMA when the mask isn't all 1s, so we can't - * guarantee allocations that must be within a tighter range than - * GFP_DMA - */ - if (mask < 0x00ffffff) - return 0; - return 1; -} - -static inline -int dma_set_mask(struct device *dev, u64 mask) -{ - if (!dev->dma_mask || !dma_supported(dev, mask)) - return -EIO; - - *dev->dma_mask = mask; - return 0; + return &mn10300_dma_ops; } static inline @@ -168,19 +28,6 @@ void dma_cache_sync(void *vaddr, size_t size, mn10300_dcache_flush_inv(); } -/* Not supported for now */ -static inline int dma_mmap_coherent(struct device *dev, - struct vm_area_struct *vma, void *cpu_addr, - dma_addr_t dma_addr, size_t size) -{ - return -EINVAL; -} - -static inline int dma_get_sgtable(struct device *dev, struct sg_table *sgt, - void *cpu_addr, dma_addr_t dma_addr, - size_t size) -{ - return -EINVAL; -} +#include #endif diff --git 
a/arch/mn10300/mm/dma-alloc.c b/arch/mn10300/mm/dma-alloc.c index e244ebe..8842394 100644 --- a/arch/mn10300/mm/dma-alloc.c +++ b/arch/mn10300/mm/dma-alloc.c @@ -20,8 +20,8 @@ static unsigned long pci_sram_allocated = 0xbc000000; -void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, int gfp) +static void *mn10300_dma_alloc(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, struct dma_attrs *attrs) { unsigned long addr; void *ret; @@ -61,10 +61,9 @@ done: printk("dma_alloc_coherent() = %p [%x]\n", ret, *dma_handle); return ret; } -EXPORT_SYMBOL(dma_alloc_coherent); -void dma_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle) +static void mn10300_dma_free(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle, struct dma_attrs *attrs) { unsigned long addr = (unsigned long) vaddr & ~0x20000000; @@ -73,4 +72,60 @@ void dma_free_coherent(struct device *dev, size_t size, void *vaddr, free_pages(addr, get_order(size)); } -EXPORT_SYMBOL(dma_free_coherent); + +static int mn10300_dma_map_sg(struct device *dev, struct scatterlist *sglist, + int nents, enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + struct scatterlist *sg; + int i; + + for_each_sg(sglist, sg, nents, i) { + BUG_ON(!sg_page(sg)); + + sg->dma_address = sg_phys(sg); + } + + mn10300_dcache_flush_inv(); + return nents; +} + +static dma_addr_t mn10300_dma_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction, struct dma_attrs *attrs) +{ + return page_to_bus(page) + offset; +} + +static void mn10300_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction direction) +{ + mn10300_dcache_flush_inv(); +} + +static void mn10300_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction direction) +{ + mn10300_dcache_flush_inv(); +} + +static 
int mn10300_dma_supported(struct device *dev, u64 mask) +{ + /* + * we fall back to GFP_DMA when the mask isn't all 1s, so we can't + * guarantee allocations that must be within a tighter range than + * GFP_DMA + */ + if (mask < 0x00ffffff) + return 0; + return 1; +} + +struct dma_map_ops mn10300_dma_ops = { + .alloc = mn10300_dma_alloc, + .free = mn10300_dma_free, + .map_page = mn10300_dma_map_page, + .map_sg = mn10300_dma_map_sg, + .sync_single_for_device = mn10300_dma_sync_single_for_device, + .sync_sg_for_device = mn10300_dma_sync_sg_for_device, +}; -- cgit v0.10.2 From 340f3039acd67ec7750e36bd327caadadaacaaf4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jan 2016 15:01:53 -0800 Subject: m68k: convert to dma_map_ops Signed-off-by: Christoph Hellwig Cc: Geert Uytterhoeven Cc: Christian Borntraeger Cc: Joerg Roedel Cc: Sebastian Ott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 498b567..d5d75b3 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -23,6 +23,7 @@ config M68K select MODULES_USE_ELF_RELA select OLD_SIGSUSPEND3 select OLD_SIGACTION + select HAVE_DMA_ATTRS config RWSEM_GENERIC_SPINLOCK bool diff --git a/arch/m68k/include/asm/dma-mapping.h b/arch/m68k/include/asm/dma-mapping.h index 05aa535..2c082a6 100644 --- a/arch/m68k/include/asm/dma-mapping.h +++ b/arch/m68k/include/asm/dma-mapping.h @@ -1,123 +1,19 @@ #ifndef _M68K_DMA_MAPPING_H #define _M68K_DMA_MAPPING_H -#include +extern struct dma_map_ops m68k_dma_ops; -struct scatterlist; - -static inline int dma_supported(struct device *dev, u64 mask) -{ - return 1; -} - -static inline int dma_set_mask(struct device *dev, u64 mask) +static inline struct dma_map_ops *get_dma_ops(struct device *dev) { - return 0; + return &m68k_dma_ops; } -extern void *dma_alloc_coherent(struct device *, size_t, - dma_addr_t *, gfp_t); -extern void dma_free_coherent(struct device *, size_t, - void *, dma_addr_t); +#include -static 
inline void *dma_alloc_attrs(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, - struct dma_attrs *attrs) -{ - /* attrs is not supported and ignored */ - return dma_alloc_coherent(dev, size, dma_handle, flag); -} - -static inline void dma_free_attrs(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t dma_handle, - struct dma_attrs *attrs) -{ - /* attrs is not supported and ignored */ - dma_free_coherent(dev, size, cpu_addr, dma_handle); -} - -static inline void *dma_alloc_noncoherent(struct device *dev, size_t size, - dma_addr_t *handle, gfp_t flag) -{ - return dma_alloc_coherent(dev, size, handle, flag); -} -static inline void dma_free_noncoherent(struct device *dev, size_t size, - void *addr, dma_addr_t handle) -{ - dma_free_coherent(dev, size, addr, handle); -} static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction dir) { /* we use coherent allocation, so not much to do here. */ } -extern dma_addr_t dma_map_single(struct device *, void *, size_t, - enum dma_data_direction); -static inline void dma_unmap_single(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir) -{ -} - -extern dma_addr_t dma_map_page(struct device *, struct page *, - unsigned long, size_t size, - enum dma_data_direction); -static inline void dma_unmap_page(struct device *dev, dma_addr_t address, - size_t size, enum dma_data_direction dir) -{ -} - -extern int dma_map_sg(struct device *, struct scatterlist *, int, - enum dma_data_direction); -static inline void dma_unmap_sg(struct device *dev, struct scatterlist *sg, - int nhwentries, enum dma_data_direction dir) -{ -} - -extern void dma_sync_single_for_device(struct device *, dma_addr_t, size_t, - enum dma_data_direction); -extern void dma_sync_sg_for_device(struct device *, struct scatterlist *, int, - enum dma_data_direction); - -static inline void dma_sync_single_range_for_device(struct device *dev, - dma_addr_t dma_handle, unsigned 
long offset, size_t size, - enum dma_data_direction direction) -{ - /* just sync everything for now */ - dma_sync_single_for_device(dev, dma_handle, offset + size, direction); -} - -static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t handle, - size_t size, enum dma_data_direction dir) -{ -} - -static inline void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir) -{ -} - -static inline void dma_sync_single_range_for_cpu(struct device *dev, - dma_addr_t dma_handle, unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - /* just sync everything for now */ - dma_sync_single_for_cpu(dev, dma_handle, offset + size, direction); -} - -static inline int dma_mapping_error(struct device *dev, dma_addr_t handle) -{ - return 0; -} - -/* drivers/base/dma-mapping.c */ -extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size); -extern int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, - void *cpu_addr, dma_addr_t dma_addr, - size_t size); - -#define dma_mmap_coherent(d, v, c, h, s) dma_common_mmap(d, v, c, h, s) -#define dma_get_sgtable(d, t, v, h, s) dma_common_get_sgtable(d, t, v, h, s) - #endif /* _M68K_DMA_MAPPING_H */ diff --git a/arch/m68k/kernel/dma.c b/arch/m68k/kernel/dma.c index 564665f..cbc78b4 100644 --- a/arch/m68k/kernel/dma.c +++ b/arch/m68k/kernel/dma.c @@ -18,8 +18,8 @@ #if defined(CONFIG_MMU) && !defined(CONFIG_COLDFIRE) -void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *handle, gfp_t flag) +static void *m68k_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, + gfp_t flag, struct dma_attrs *attrs) { struct page *page, **map; pgprot_t pgprot; @@ -61,8 +61,8 @@ void *dma_alloc_coherent(struct device *dev, size_t size, return addr; } -void dma_free_coherent(struct device *dev, size_t size, - void *addr, dma_addr_t handle) +static void 
m68k_dma_free(struct device *dev, size_t size, void *addr, + dma_addr_t handle, struct dma_attrs *attrs) { pr_debug("dma_free_coherent: %p, %x\n", addr, handle); vfree(addr); @@ -72,8 +72,8 @@ void dma_free_coherent(struct device *dev, size_t size, #include -void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp) +static void *m68k_dma_alloc(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, struct dma_attrs *attrs) { void *ret; /* ignore region specifiers */ @@ -90,19 +90,16 @@ void *dma_alloc_coherent(struct device *dev, size_t size, return ret; } -void dma_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle) +static void m68k_dma_free(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle, struct dma_attrs *attrs) { free_pages((unsigned long)vaddr, get_order(size)); } #endif /* CONFIG_MMU && !CONFIG_COLDFIRE */ -EXPORT_SYMBOL(dma_alloc_coherent); -EXPORT_SYMBOL(dma_free_coherent); - -void dma_sync_single_for_device(struct device *dev, dma_addr_t handle, - size_t size, enum dma_data_direction dir) +static void m68k_dma_sync_single_for_device(struct device *dev, + dma_addr_t handle, size_t size, enum dma_data_direction dir) { switch (dir) { case DMA_BIDIRECTIONAL: @@ -118,10 +115,9 @@ void dma_sync_single_for_device(struct device *dev, dma_addr_t handle, break; } } -EXPORT_SYMBOL(dma_sync_single_for_device); -void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, - int nents, enum dma_data_direction dir) +static void m68k_dma_sync_sg_for_device(struct device *dev, + struct scatterlist *sglist, int nents, enum dma_data_direction dir) { int i; struct scatterlist *sg; @@ -131,31 +127,19 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, dir); } } -EXPORT_SYMBOL(dma_sync_sg_for_device); - -dma_addr_t dma_map_single(struct device *dev, void *addr, size_t size, - enum dma_data_direction dir) -{ - dma_addr_t handle = 
virt_to_bus(addr); - - dma_sync_single_for_device(dev, handle, size, dir); - return handle; -} -EXPORT_SYMBOL(dma_map_single); -dma_addr_t dma_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir) +static dma_addr_t m68k_dma_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) { dma_addr_t handle = page_to_phys(page) + offset; dma_sync_single_for_device(dev, handle, size, dir); return handle; } -EXPORT_SYMBOL(dma_map_page); -int dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, - enum dma_data_direction dir) +static int m68k_dma_map_sg(struct device *dev, struct scatterlist *sglist, + int nents, enum dma_data_direction dir, struct dma_attrs *attrs) { int i; struct scatterlist *sg; @@ -167,4 +151,13 @@ int dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, } return nents; } -EXPORT_SYMBOL(dma_map_sg); + +struct dma_map_ops m68k_dma_ops = { + .alloc = m68k_dma_alloc, + .free = m68k_dma_free, + .map_page = m68k_dma_map_page, + .map_sg = m68k_dma_map_sg, + .sync_single_for_device = m68k_dma_sync_single_for_device, + .sync_sg_for_device = m68k_dma_sync_sg_for_device, +}; +EXPORT_SYMBOL(m68k_dma_ops); -- cgit v0.10.2 From 5348c1e9e0dc2b62a484c4b74a8d1d59aa9620a4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jan 2016 15:01:56 -0800 Subject: metag: convert to dma_map_ops Signed-off-by: Christoph Hellwig Cc: James Hogan Cc: Christian Borntraeger Cc: Joerg Roedel Cc: Sebastian Ott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig index a0fa88d..ad8604c 100644 --- a/arch/metag/Kconfig +++ b/arch/metag/Kconfig @@ -29,6 +29,7 @@ config METAG select OF select OF_EARLY_FLATTREE select SPARSE_IRQ + select HAVE_DMA_ATTRS config STACKTRACE_SUPPORT def_bool y diff --git a/arch/metag/include/asm/dma-mapping.h 
b/arch/metag/include/asm/dma-mapping.h index eb5cdec..768f2e3 100644 --- a/arch/metag/include/asm/dma-mapping.h +++ b/arch/metag/include/asm/dma-mapping.h @@ -1,178 +1,14 @@ #ifndef _ASM_METAG_DMA_MAPPING_H #define _ASM_METAG_DMA_MAPPING_H -#include +extern struct dma_map_ops metag_dma_ops; -#include -#include -#include -#include - -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) - -void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag); - -void dma_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle); - -void dma_sync_for_device(void *vaddr, size_t size, int dma_direction); -void dma_sync_for_cpu(void *vaddr, size_t size, int dma_direction); - -int dma_mmap_coherent(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size); - -int dma_mmap_writecombine(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size); - -static inline dma_addr_t -dma_map_single(struct device *dev, void *ptr, size_t size, - enum dma_data_direction direction) -{ - BUG_ON(!valid_dma_direction(direction)); - WARN_ON(size == 0); - dma_sync_for_device(ptr, size, direction); - return virt_to_phys(ptr); -} - -static inline void -dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, - enum dma_data_direction direction) -{ - BUG_ON(!valid_dma_direction(direction)); - dma_sync_for_cpu(phys_to_virt(dma_addr), size, direction); -} - -static inline int -dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, - enum dma_data_direction direction) -{ - struct scatterlist *sg; - int i; - - BUG_ON(!valid_dma_direction(direction)); - WARN_ON(nents == 0 || sglist[0].length == 0); - - for_each_sg(sglist, sg, nents, i) { - BUG_ON(!sg_page(sg)); - - sg->dma_address = sg_phys(sg); - dma_sync_for_device(sg_virt(sg), sg->length, 
direction); - } - - return nents; -} - -static inline dma_addr_t -dma_map_page(struct device *dev, struct page *page, unsigned long offset, - size_t size, enum dma_data_direction direction) -{ - BUG_ON(!valid_dma_direction(direction)); - dma_sync_for_device((void *)(page_to_phys(page) + offset), size, - direction); - return page_to_phys(page) + offset; -} - -static inline void -dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, - enum dma_data_direction direction) +static inline struct dma_map_ops *get_dma_ops(struct device *dev) { - BUG_ON(!valid_dma_direction(direction)); - dma_sync_for_cpu(phys_to_virt(dma_address), size, direction); + return &metag_dma_ops; } - -static inline void -dma_unmap_sg(struct device *dev, struct scatterlist *sglist, int nhwentries, - enum dma_data_direction direction) -{ - struct scatterlist *sg; - int i; - - BUG_ON(!valid_dma_direction(direction)); - WARN_ON(nhwentries == 0 || sglist[0].length == 0); - - for_each_sg(sglist, sg, nhwentries, i) { - BUG_ON(!sg_page(sg)); - - sg->dma_address = sg_phys(sg); - dma_sync_for_cpu(sg_virt(sg), sg->length, direction); - } -} - -static inline void -dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ - dma_sync_for_cpu(phys_to_virt(dma_handle), size, direction); -} - -static inline void -dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, - size_t size, enum dma_data_direction direction) -{ - dma_sync_for_device(phys_to_virt(dma_handle), size, direction); -} - -static inline void -dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - dma_sync_for_cpu(phys_to_virt(dma_handle)+offset, size, - direction); -} - -static inline void -dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - 
dma_sync_for_device(phys_to_virt(dma_handle)+offset, size, - direction); -} - -static inline void -dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nelems, - enum dma_data_direction direction) -{ - int i; - struct scatterlist *sg; - - for_each_sg(sglist, sg, nelems, i) - dma_sync_for_cpu(sg_virt(sg), sg->length, direction); -} - -static inline void -dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, - int nelems, enum dma_data_direction direction) -{ - int i; - struct scatterlist *sg; - - for_each_sg(sglist, sg, nelems, i) - dma_sync_for_device(sg_virt(sg), sg->length, direction); -} - -static inline int -dma_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - return 0; -} - -#define dma_supported(dev, mask) (1) - -static inline int -dma_set_mask(struct device *dev, u64 mask) -{ - if (!dev->dma_mask || !dma_supported(dev, mask)) - return -EIO; - - *dev->dma_mask = mask; - - return 0; -} +#include /* * dma_alloc_noncoherent() returns non-cacheable memory, so there's no need to @@ -184,11 +20,4 @@ dma_cache_sync(struct device *dev, void *vaddr, size_t size, { } -/* drivers/base/dma-mapping.c */ -extern int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, - void *cpu_addr, dma_addr_t dma_addr, - size_t size); - -#define dma_get_sgtable(d, t, v, h, s) dma_common_get_sgtable(d, t, v, h, s) - #endif diff --git a/arch/metag/kernel/dma.c b/arch/metag/kernel/dma.c index c700d62..e12368d 100644 --- a/arch/metag/kernel/dma.c +++ b/arch/metag/kernel/dma.c @@ -171,8 +171,8 @@ out: * Allocate DMA-coherent memory space and return both the kernel remapped * virtual and bus address for that space. 
*/ -void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *handle, gfp_t gfp) +static void *metag_dma_alloc(struct device *dev, size_t size, + dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs) { struct page *page; struct metag_vm_region *c; @@ -263,13 +263,12 @@ void *dma_alloc_coherent(struct device *dev, size_t size, no_page: return NULL; } -EXPORT_SYMBOL(dma_alloc_coherent); /* * free a page as defined by the above mapping. */ -void dma_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle) +static void metag_dma_free(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle, struct dma_attrs *attrs) { struct metag_vm_region *c; unsigned long flags, addr; @@ -329,16 +328,19 @@ no_area: __func__, vaddr); dump_stack(); } -EXPORT_SYMBOL(dma_free_coherent); - -static int dma_mmap(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size) +static int metag_dma_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + struct dma_attrs *attrs) { - int ret = -ENXIO; - unsigned long flags, user_size, kern_size; struct metag_vm_region *c; + int ret = -ENXIO; + + if (dma_get_attr(DMA_ATTR_WRITE_COMBINE, attrs)) + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + else + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); user_size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; @@ -364,25 +366,6 @@ static int dma_mmap(struct device *dev, struct vm_area_struct *vma, return ret; } -int dma_mmap_coherent(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size) -{ - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - return dma_mmap(dev, vma, cpu_addr, dma_addr, size); -} -EXPORT_SYMBOL(dma_mmap_coherent); - -int dma_mmap_writecombine(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size) -{ - vma->vm_page_prot = 
pgprot_writecombine(vma->vm_page_prot); - return dma_mmap(dev, vma, cpu_addr, dma_addr, size); -} -EXPORT_SYMBOL(dma_mmap_writecombine); - - - - /* * Initialise the consistent memory allocation. */ @@ -423,7 +406,7 @@ early_initcall(dma_alloc_init); /* * make an area consistent to devices. */ -void dma_sync_for_device(void *vaddr, size_t size, int dma_direction) +static void dma_sync_for_device(void *vaddr, size_t size, int dma_direction) { /* * Ensure any writes get through the write combiner. This is necessary @@ -465,12 +448,11 @@ void dma_sync_for_device(void *vaddr, size_t size, int dma_direction) wmb(); } -EXPORT_SYMBOL(dma_sync_for_device); /* * make an area consistent to the core. */ -void dma_sync_for_cpu(void *vaddr, size_t size, int dma_direction) +static void dma_sync_for_cpu(void *vaddr, size_t size, int dma_direction) { /* * Hardware L2 cache prefetch doesn't occur across 4K physical @@ -497,4 +479,100 @@ void dma_sync_for_cpu(void *vaddr, size_t size, int dma_direction) rmb(); } -EXPORT_SYMBOL(dma_sync_for_cpu); + +static dma_addr_t metag_dma_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction, struct dma_attrs *attrs) +{ + dma_sync_for_device((void *)(page_to_phys(page) + offset), size, + direction); + return page_to_phys(page) + offset; +} + +static void metag_dma_unmap_page(struct device *dev, dma_addr_t dma_address, + size_t size, enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + dma_sync_for_cpu(phys_to_virt(dma_address), size, direction); +} + +static int metag_dma_map_sg(struct device *dev, struct scatterlist *sglist, + int nents, enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + struct scatterlist *sg; + int i; + + for_each_sg(sglist, sg, nents, i) { + BUG_ON(!sg_page(sg)); + + sg->dma_address = sg_phys(sg); + dma_sync_for_device(sg_virt(sg), sg->length, direction); + } + + return nents; +} + + +static void metag_dma_unmap_sg(struct device 
*dev, struct scatterlist *sglist, + int nhwentries, enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + struct scatterlist *sg; + int i; + + for_each_sg(sglist, sg, nhwentries, i) { + BUG_ON(!sg_page(sg)); + + sg->dma_address = sg_phys(sg); + dma_sync_for_cpu(sg_virt(sg), sg->length, direction); + } +} + +static void metag_dma_sync_single_for_cpu(struct device *dev, + dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction) +{ + dma_sync_for_cpu(phys_to_virt(dma_handle), size, direction); +} + +static void metag_dma_sync_single_for_device(struct device *dev, + dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction) +{ + dma_sync_for_device(phys_to_virt(dma_handle), size, direction); +} + +static void metag_dma_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sglist, int nelems, + enum dma_data_direction direction) +{ + int i; + struct scatterlist *sg; + + for_each_sg(sglist, sg, nelems, i) + dma_sync_for_cpu(sg_virt(sg), sg->length, direction); +} + +static void metag_dma_sync_sg_for_device(struct device *dev, + struct scatterlist *sglist, int nelems, + enum dma_data_direction direction) +{ + int i; + struct scatterlist *sg; + + for_each_sg(sglist, sg, nelems, i) + dma_sync_for_device(sg_virt(sg), sg->length, direction); +} + +struct dma_map_ops metag_dma_ops = { + .alloc = metag_dma_alloc, + .free = metag_dma_free, + .map_page = metag_dma_map_page, + .map_sg = metag_dma_map_sg, + .sync_single_for_device = metag_dma_sync_single_for_device, + .sync_single_for_cpu = metag_dma_sync_single_for_cpu, + .sync_sg_for_cpu = metag_dma_sync_sg_for_cpu, + .mmap = metag_dma_mmap, +}; +EXPORT_SYMBOL(metag_dma_ops); -- cgit v0.10.2 From 30081d8ea47d521e8804398b25f59b8e49a2ed0b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jan 2016 15:01:59 -0800 Subject: sparc: use generic dma_set_mask Sparc already uses the same code as the generic code for the PCI implementation but just fails the call sbus. 
This moves to the generic implementation which eventually returns -EIO due to the NULL dma_mask pointer in the device. Signed-off-by: Christoph Hellwig Cc: "David S. Miller" Cc: Christian Borntraeger Cc: Joerg Roedel Cc: Sebastian Ott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h index a21da59..2777092 100644 --- a/arch/sparc/include/asm/dma-mapping.h +++ b/arch/sparc/include/asm/dma-mapping.h @@ -37,21 +37,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) return dma_ops; } -#define HAVE_ARCH_DMA_SET_MASK 1 - -static inline int dma_set_mask(struct device *dev, u64 mask) -{ -#ifdef CONFIG_PCI - if (dev->bus == &pci_bus_type) { - if (!dev->dma_mask || !dma_supported(dev, mask)) - return -EINVAL; - *dev->dma_mask = mask; - return 0; - } -#endif - return -EINVAL; -} - #include #endif -- cgit v0.10.2 From bd38118f9c57b22f57f9c2fccca4a82aef15cc5f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jan 2016 15:02:02 -0800 Subject: tile: uninline dma_set_mask We'll soon merge <asm-generic/dma-mapping-common.h> into <linux/dma-mapping.h> and the reference to dma_capable in the tile dma_set_mask would create a circular dependency. Fix this by moving the implementation out of line. 
Signed-off-by: Christoph Hellwig Cc: Chris Metcalf Cc: Christian Borntraeger Cc: Joerg Roedel Cc: Sebastian Ott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h index 96ac6cc..c342736 100644 --- a/arch/tile/include/asm/dma-mapping.h +++ b/arch/tile/include/asm/dma-mapping.h @@ -76,34 +76,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) #include -static inline int -dma_set_mask(struct device *dev, u64 mask) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - /* - * For PCI devices with 64-bit DMA addressing capability, promote - * the dma_ops to hybrid, with the consistent memory DMA space limited - * to 32-bit. For 32-bit capable devices, limit the streaming DMA - * address range to max_direct_dma_addr. - */ - if (dma_ops == gx_pci_dma_map_ops || - dma_ops == gx_hybrid_pci_dma_map_ops || - dma_ops == gx_legacy_pci_dma_map_ops) { - if (mask == DMA_BIT_MASK(64) && - dma_ops == gx_legacy_pci_dma_map_ops) - set_dma_ops(dev, gx_hybrid_pci_dma_map_ops); - else if (mask > dev->archdata.max_direct_dma_addr) - mask = dev->archdata.max_direct_dma_addr; - } - - if (!dev->dma_mask || !dma_supported(dev, mask)) - return -EIO; - - *dev->dma_mask = mask; - - return 0; -} +int dma_set_mask(struct device *dev, u64 mask); /* * dma_alloc_noncoherent() is #defined to return coherent memory, diff --git a/arch/tile/kernel/pci-dma.c b/arch/tile/kernel/pci-dma.c index 09b5870..b6bc054 100644 --- a/arch/tile/kernel/pci-dma.c +++ b/arch/tile/kernel/pci-dma.c @@ -583,6 +583,35 @@ struct dma_map_ops *gx_hybrid_pci_dma_map_ops; EXPORT_SYMBOL(gx_legacy_pci_dma_map_ops); EXPORT_SYMBOL(gx_hybrid_pci_dma_map_ops); +int dma_set_mask(struct device *dev, u64 mask) +{ + struct dma_map_ops *dma_ops = get_dma_ops(dev); + + /* + * For PCI devices with 64-bit DMA addressing capability, promote + * the dma_ops to hybrid, with the consistent memory DMA space limited + * 
to 32-bit. For 32-bit capable devices, limit the streaming DMA + * address range to max_direct_dma_addr. + */ + if (dma_ops == gx_pci_dma_map_ops || + dma_ops == gx_hybrid_pci_dma_map_ops || + dma_ops == gx_legacy_pci_dma_map_ops) { + if (mask == DMA_BIT_MASK(64) && + dma_ops == gx_legacy_pci_dma_map_ops) + set_dma_ops(dev, gx_hybrid_pci_dma_map_ops); + else if (mask > dev->archdata.max_direct_dma_addr) + mask = dev->archdata.max_direct_dma_addr; + } + + if (!dev->dma_mask || !dma_supported(dev, mask)) + return -EIO; + + *dev->dma_mask = mask; + + return 0; +} +EXPORT_SYMBOL(dma_set_mask); + #ifdef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK int dma_set_coherent_mask(struct device *dev, u64 mask) { -- cgit v0.10.2 From e1c7e324539ada3b2b13ca2898bcb4948a9ef9db Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jan 2016 15:02:05 -0800 Subject: dma-mapping: always provide the dma_map_ops based implementation Move the generic implementation to <linux/dma-mapping.h> now that all architectures support it and remove the HAVE_DMA_ATTRS Kconfig symbol now that everyone supports them. [valentinrothberg@gmail.com: remove leftovers in Kconfig] Signed-off-by: Christoph Hellwig Cc: "David S. Miller" Cc: Aurelien Jacquiot Cc: Chris Metcalf Cc: David Howells Cc: Geert Uytterhoeven Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Helge Deller Cc: James Hogan Cc: Jesper Nilsson Cc: Koichi Yasutake Cc: Ley Foon Tan Cc: Mark Salter Cc: Mikael Starvik Cc: Steven Miao Cc: Vineet Gupta Cc: Christian Borntraeger Cc: Joerg Roedel Cc: Sebastian Ott Signed-off-by: Valentin Rothberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/DMA-API-HOWTO.txt b/Documentation/DMA-API-HOWTO.txt index d69b3fc..781024e 100644 --- a/Documentation/DMA-API-HOWTO.txt +++ b/Documentation/DMA-API-HOWTO.txt @@ -951,16 +951,6 @@ to "Closing". alignment constraints (e.g. the alignment constraints about 64-bit objects). 
-3) Supporting multiple types of IOMMUs - - If your architecture needs to support multiple types of IOMMUs, you - can use include/linux/asm-generic/dma-mapping-common.h. It's a - library to support the DMA API with multiple types of IOMMUs. Lots - of architectures (x86, powerpc, sh, alpha, ia64, microblaze and - sparc) use it. Choose one to see how it can be used. If you need to - support multiple types of IOMMUs in a single system, the example of - x86 or powerpc helps. - Closing This document, and the API itself, would not be in its current diff --git a/Documentation/features/io/dma_map_attrs/arch-support.txt b/Documentation/features/io/dma_map_attrs/arch-support.txt deleted file mode 100644 index 51d0f1c..0000000 --- a/Documentation/features/io/dma_map_attrs/arch-support.txt +++ /dev/null @@ -1,40 +0,0 @@ -# -# Feature name: dma_map_attrs -# Kconfig: HAVE_DMA_ATTRS -# description: arch provides dma_*map*_attrs() APIs -# - ----------------------- - | arch |status| - ----------------------- - | alpha: | ok | - | arc: | TODO | - | arm: | ok | - | arm64: | ok | - | avr32: | TODO | - | blackfin: | TODO | - | c6x: | TODO | - | cris: | TODO | - | frv: | TODO | - | h8300: | ok | - | hexagon: | ok | - | ia64: | ok | - | m32r: | TODO | - | m68k: | TODO | - | metag: | TODO | - | microblaze: | ok | - | mips: | ok | - | mn10300: | TODO | - | nios2: | TODO | - | openrisc: | ok | - | parisc: | TODO | - | powerpc: | ok | - | s390: | ok | - | score: | TODO | - | sh: | ok | - | sparc: | ok | - | tile: | ok | - | um: | TODO | - | unicore32: | ok | - | x86: | ok | - | xtensa: | TODO | - ----------------------- diff --git a/arch/Kconfig b/arch/Kconfig index 51c03ef..f6b649d 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -205,9 +205,6 @@ config HAVE_NMI_WATCHDOG config HAVE_ARCH_TRACEHOOK bool -config HAVE_DMA_ATTRS - bool - config HAVE_DMA_CONTIGUOUS bool diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index f515a4d..9d8a858 100644 --- a/arch/alpha/Kconfig +++ 
b/arch/alpha/Kconfig @@ -9,7 +9,6 @@ config ALPHA select HAVE_OPROFILE select HAVE_PCSPKR_PLATFORM select HAVE_PERF_EVENTS - select HAVE_DMA_ATTRS select VIRT_TO_BUS select GENERIC_IRQ_PROBE select AUTO_IRQ_AFFINITY if SMP diff --git a/arch/alpha/include/asm/dma-mapping.h b/arch/alpha/include/asm/dma-mapping.h index 72a8ca7..3c3451f 100644 --- a/arch/alpha/include/asm/dma-mapping.h +++ b/arch/alpha/include/asm/dma-mapping.h @@ -10,8 +10,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) return dma_ops; } -#include - #define dma_cache_sync(dev, va, size, dir) ((void)0) #endif /* _ALPHA_DMA_MAPPING_H */ diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 8150c27..76dde9d 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -38,7 +38,6 @@ config ARC select OF_EARLY_FLATTREE select PERF_USE_VMALLOC select HAVE_DEBUG_STACKOVERFLOW - select HAVE_DMA_ATTRS config TRACE_IRQFLAGS_SUPPORT def_bool y diff --git a/arch/arc/include/asm/dma-mapping.h b/arch/arc/include/asm/dma-mapping.h index 2a617f9..6602054 100644 --- a/arch/arc/include/asm/dma-mapping.h +++ b/arch/arc/include/asm/dma-mapping.h @@ -18,6 +18,4 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) return &arc_dma_ops; } -#include - #endif diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 6a889af..5231177 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -47,7 +47,6 @@ config ARM select HAVE_C_RECORDMCOUNT select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_API_DEBUG - select HAVE_DMA_ATTRS select HAVE_DMA_CONTIGUOUS if MMU select HAVE_DYNAMIC_FTRACE if (!XIP_KERNEL) && !CPU_ENDIAN_BE32 && MMU select HAVE_EFFICIENT_UNALIGNED_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && MMU diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h index ccb3aa6..6ad1ced 100644 --- a/arch/arm/include/asm/dma-mapping.h +++ b/arch/arm/include/asm/dma-mapping.h @@ -41,13 +41,6 @@ static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops) #define 
HAVE_ARCH_DMA_SUPPORTED 1 extern int dma_supported(struct device *dev, u64 mask); -/* - * Note that while the generic code provides dummy dma_{alloc,free}_noncoherent - * implementations, we don't provide a dma_cache_sync function so drivers using - * this API are highlighted with build warnings. - */ -#include - #ifdef __arch_page_to_dma #error Please update to __arch_pfn_to_dma #endif diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 6be3fa2..8cc6228 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -64,7 +64,6 @@ config ARM64 select HAVE_DEBUG_BUGVERBOSE select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_API_DEBUG - select HAVE_DMA_ATTRS select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_EFFICIENT_UNALIGNED_ACCESS diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h index 61e08f3..ba437f0 100644 --- a/arch/arm64/include/asm/dma-mapping.h +++ b/arch/arm64/include/asm/dma-mapping.h @@ -64,8 +64,6 @@ static inline bool is_device_dma_coherent(struct device *dev) return dev->archdata.dma_coherent; } -#include - static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { return (dma_addr_t)paddr; diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index aac3d69..b6878eb 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -7,7 +7,6 @@ config AVR32 select HAVE_OPROFILE select HAVE_KPROBES select VIRT_TO_BUS - select HAVE_DMA_ATTRS select GENERIC_IRQ_PROBE select GENERIC_ATOMIC64 select HARDIRQS_SW_RESEND diff --git a/arch/avr32/include/asm/dma-mapping.h b/arch/avr32/include/asm/dma-mapping.h index 0239ca8..1115f2a 100644 --- a/arch/avr32/include/asm/dma-mapping.h +++ b/arch/avr32/include/asm/dma-mapping.h @@ -11,6 +11,4 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) return &avr32_dma_ops; } -#include - #endif /* __ASM_AVR32_DMA_MAPPING_H */ diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index 4be2f90..af76634 100644 --- a/arch/blackfin/Kconfig +++ 
b/arch/blackfin/Kconfig @@ -14,7 +14,6 @@ config BLACKFIN def_bool y select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK - select HAVE_DMA_ATTRS select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_TRACER diff --git a/arch/blackfin/include/asm/dma-mapping.h b/arch/blackfin/include/asm/dma-mapping.h index ea5a2e8..3490570 100644 --- a/arch/blackfin/include/asm/dma-mapping.h +++ b/arch/blackfin/include/asm/dma-mapping.h @@ -43,6 +43,4 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) return &bfin_dma_ops; } -#include - #endif /* _BLACKFIN_DMA_MAPPING_H */ diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig index 8602f72..79049d4 100644 --- a/arch/c6x/Kconfig +++ b/arch/c6x/Kconfig @@ -18,7 +18,6 @@ config C6X select GENERIC_CLOCKEVENTS select MODULES_USE_ELF_RELA select ARCH_NO_COHERENT_DMA_MMAP - select HAVE_DMA_ATTRS config MMU def_bool n diff --git a/arch/c6x/include/asm/dma-mapping.h b/arch/c6x/include/asm/dma-mapping.h index f881e42..6b5cd7b 100644 --- a/arch/c6x/include/asm/dma-mapping.h +++ b/arch/c6x/include/asm/dma-mapping.h @@ -24,8 +24,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) return &c6x_dma_ops; } -#include - extern void coherent_mem_init(u32 start, u32 size); void *c6x_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs); diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index 20d919c..e086f9e 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -54,7 +54,6 @@ config CRIS select GENERIC_ATOMIC64 select HAVE_UID16 select VIRT_TO_BUS - select HAVE_DMA_ATTRS select ARCH_WANT_IPC_PARSE_VERSION select GENERIC_IRQ_SHOW select GENERIC_IOMAP diff --git a/arch/cris/include/asm/dma-mapping.h b/arch/cris/include/asm/dma-mapping.h index 34e7c7c7..5a37017 100644 --- a/arch/cris/include/asm/dma-mapping.h +++ b/arch/cris/include/asm/dma-mapping.h @@ -16,8 +16,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) } #endif 
-#include - static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction direction) diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index e383781..eefd9a4 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -16,7 +16,6 @@ config FRV select OLD_SIGACTION select HAVE_DEBUG_STACKOVERFLOW select ARCH_NO_COHERENT_DMA_MMAP - select HAVE_DMA_ATTRS config ZONE_DMA bool diff --git a/arch/frv/include/asm/dma-mapping.h b/arch/frv/include/asm/dma-mapping.h index 750951c..9a82bfa 100644 --- a/arch/frv/include/asm/dma-mapping.h +++ b/arch/frv/include/asm/dma-mapping.h @@ -21,6 +21,4 @@ void dma_cache_sync(struct device *dev, void *vaddr, size_t size, flush_write_buffers(); } -#include - #endif /* _ASM_DMA_MAPPING_H */ diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index 2e20333..8c7c825 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -15,7 +15,6 @@ config H8300 select OF_IRQ select OF_EARLY_FLATTREE select HAVE_MEMBLOCK - select HAVE_DMA_ATTRS select CLKSRC_OF select H8300_TMR8 diff --git a/arch/h8300/include/asm/dma-mapping.h b/arch/h8300/include/asm/dma-mapping.h index d9b5b80..7ac7fad 100644 --- a/arch/h8300/include/asm/dma-mapping.h +++ b/arch/h8300/include/asm/dma-mapping.h @@ -8,6 +8,4 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) return &h8300_dma_map_ops; } -#include - #endif diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index 4dc89d1..57298e7 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -27,7 +27,6 @@ config HEXAGON select GENERIC_CLOCKEVENTS_BROADCAST select MODULES_USE_ELF_RELA select GENERIC_CPU_DEVICES - select HAVE_DMA_ATTRS ---help--- Qualcomm Hexagon is a processor architecture designed for high performance and low power across a wide variety of applications. 
diff --git a/arch/hexagon/include/asm/dma-mapping.h b/arch/hexagon/include/asm/dma-mapping.h index 268fde8..aa62034 100644 --- a/arch/hexagon/include/asm/dma-mapping.h +++ b/arch/hexagon/include/asm/dma-mapping.h @@ -49,8 +49,6 @@ extern int dma_is_consistent(struct device *dev, dma_addr_t dma_handle); extern void dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction direction); -#include - static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) { if (!dev->dma_mask) diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index eb0249e..fb0515e 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -25,7 +25,6 @@ config IA64 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE if (!ITANIUM) select HAVE_FUNCTION_TRACER - select HAVE_DMA_ATTRS select TTY select HAVE_ARCH_TRACEHOOK select HAVE_DMA_API_DEBUG diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h index 9beccf8..d472805 100644 --- a/arch/ia64/include/asm/dma-mapping.h +++ b/arch/ia64/include/asm/dma-mapping.h @@ -25,8 +25,6 @@ extern void machvec_dma_sync_sg(struct device *, struct scatterlist *, int, #define get_dma_ops(dev) platform_dma_get_ops(dev) -#include - static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) { if (!dev->dma_mask) diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index d5d75b3..498b567 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -23,7 +23,6 @@ config M68K select MODULES_USE_ELF_RELA select OLD_SIGSUSPEND3 select OLD_SIGACTION - select HAVE_DMA_ATTRS config RWSEM_GENERIC_SPINLOCK bool diff --git a/arch/m68k/include/asm/dma-mapping.h b/arch/m68k/include/asm/dma-mapping.h index 2c082a6..96c5361 100644 --- a/arch/m68k/include/asm/dma-mapping.h +++ b/arch/m68k/include/asm/dma-mapping.h @@ -8,8 +8,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) return &m68k_dma_ops; } -#include - static inline void dma_cache_sync(struct device 
*dev, void *vaddr, size_t size, enum dma_data_direction dir) { diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig index ad8604c..a0fa88d 100644 --- a/arch/metag/Kconfig +++ b/arch/metag/Kconfig @@ -29,7 +29,6 @@ config METAG select OF select OF_EARLY_FLATTREE select SPARSE_IRQ - select HAVE_DMA_ATTRS config STACKTRACE_SUPPORT def_bool y diff --git a/arch/metag/include/asm/dma-mapping.h b/arch/metag/include/asm/dma-mapping.h index 768f2e3..27af5d47 100644 --- a/arch/metag/include/asm/dma-mapping.h +++ b/arch/metag/include/asm/dma-mapping.h @@ -8,8 +8,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) return &metag_dma_ops; } -#include - /* * dma_alloc_noncoherent() returns non-cacheable memory, so there's no need to * do any flushing here. diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 5ecd028..53b69de 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -19,7 +19,6 @@ config MICROBLAZE select HAVE_ARCH_KGDB select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_API_DEBUG - select HAVE_DMA_ATTRS select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_TRACER diff --git a/arch/microblaze/include/asm/dma-mapping.h b/arch/microblaze/include/asm/dma-mapping.h index 24b1297..1884783 100644 --- a/arch/microblaze/include/asm/dma-mapping.h +++ b/arch/microblaze/include/asm/dma-mapping.h @@ -44,8 +44,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) return &dma_direct_ops; } -#include - static inline void __dma_sync(unsigned long paddr, size_t size, enum dma_data_direction direction) { diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 71683a8..fbf3f66 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -31,7 +31,6 @@ config MIPS select RTC_LIB if !MACH_LOONGSON64 select GENERIC_ATOMIC64 if !64BIT select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE - select HAVE_DMA_ATTRS select HAVE_DMA_CONTIGUOUS select HAVE_DMA_API_DEBUG select GENERIC_IRQ_PROBE diff --git 
a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h index e604f76..12fa79e 100644 --- a/arch/mips/include/asm/dma-mapping.h +++ b/arch/mips/include/asm/dma-mapping.h @@ -29,8 +29,6 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) static inline void dma_mark_clean(void *addr, size_t size) {} -#include - extern void dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction direction); diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig index e8ebf78..10607f0 100644 --- a/arch/mn10300/Kconfig +++ b/arch/mn10300/Kconfig @@ -15,7 +15,6 @@ config MN10300 select OLD_SIGACTION select HAVE_DEBUG_STACKOVERFLOW select ARCH_NO_COHERENT_DMA_MMAP - select HAVE_DMA_ATTRS config AM33_2 def_bool n diff --git a/arch/mn10300/include/asm/dma-mapping.h b/arch/mn10300/include/asm/dma-mapping.h index e69b013..1dcd447 100644 --- a/arch/mn10300/include/asm/dma-mapping.h +++ b/arch/mn10300/include/asm/dma-mapping.h @@ -28,6 +28,4 @@ void dma_cache_sync(void *vaddr, size_t size, mn10300_dcache_flush_inv(); } -#include - #endif diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index 4b2504d..4375554 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -16,7 +16,6 @@ config NIOS2 select SOC_BUS select SPARSE_IRQ select USB_ARCH_HAS_HCD if USB_SUPPORT - select HAVE_DMA_ATTRS config GENERIC_CSUM def_bool y diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index 443f44d..e118c02 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -29,9 +29,6 @@ config OPENRISC config MMU def_bool y -config HAVE_DMA_ATTRS - def_bool y - config RWSEM_GENERIC_SPINLOCK def_bool y diff --git a/arch/openrisc/include/asm/dma-mapping.h b/arch/openrisc/include/asm/dma-mapping.h index 413bfcf..1f260bc 100644 --- a/arch/openrisc/include/asm/dma-mapping.h +++ b/arch/openrisc/include/asm/dma-mapping.h @@ -42,6 +42,4 @@ static inline int dma_supported(struct device *dev, u64 dma_mask) return dma_mask == 
DMA_BIT_MASK(32); } -#include - #endif /* __ASM_OPENRISC_DMA_MAPPING_H */ diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 1489351..14f655c 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -30,7 +30,6 @@ config PARISC select HAVE_DEBUG_STACKOVERFLOW select HAVE_ARCH_AUDITSYSCALL select ARCH_NO_COHERENT_DMA_MMAP - select HAVE_DMA_ATTRS help The PA-RISC microprocessor is designed by Hewlett-Packard and used diff --git a/arch/parisc/include/asm/dma-mapping.h b/arch/parisc/include/asm/dma-mapping.h index 4de5186..16e0246 100644 --- a/arch/parisc/include/asm/dma-mapping.h +++ b/arch/parisc/include/asm/dma-mapping.h @@ -83,6 +83,4 @@ struct parisc_device; void * sba_get_iommu(struct parisc_device *dev); #endif -#include - #endif diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 8310be4..e4824fd 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -108,7 +108,6 @@ config PPC select HAVE_ARCH_TRACEHOOK select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP - select HAVE_DMA_ATTRS select HAVE_DMA_API_DEBUG select HAVE_OPROFILE select HAVE_DEBUG_KMEMLEAK diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index 7f522c0..77816ac 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -125,8 +125,6 @@ static inline void set_dma_offset(struct device *dev, dma_addr_t off) #define HAVE_ARCH_DMA_SET_MASK 1 extern int dma_set_mask(struct device *dev, u64 dma_mask); -#include - extern int __dma_set_mask(struct device *dev, u64 dma_mask); extern u64 __dma_get_required_mask(struct device *dev); diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index dbeeb3a..3be9c83 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -579,7 +579,6 @@ config QDIO menuconfig PCI bool "PCI support" - select HAVE_DMA_ATTRS select PCI_MSI select IOMMU_SUPPORT help diff --git a/arch/s390/include/asm/dma-mapping.h b/arch/s390/include/asm/dma-mapping.h index b3fd54d..e64bfcb 
100644 --- a/arch/s390/include/asm/dma-mapping.h +++ b/arch/s390/include/asm/dma-mapping.h @@ -23,8 +23,6 @@ static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, { } -#include - static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) { if (!dev->dma_mask) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 6c391a5..e13da05 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -11,7 +11,6 @@ config SUPERH select HAVE_GENERIC_DMA_COHERENT select HAVE_ARCH_TRACEHOOK select HAVE_DMA_API_DEBUG - select HAVE_DMA_ATTRS select HAVE_PERF_EVENTS select HAVE_DEBUG_BUGVERBOSE select ARCH_HAVE_CUSTOM_GPIO_H diff --git a/arch/sh/include/asm/dma-mapping.h b/arch/sh/include/asm/dma-mapping.h index a3745a3..e11cf0c 100644 --- a/arch/sh/include/asm/dma-mapping.h +++ b/arch/sh/include/asm/dma-mapping.h @@ -11,8 +11,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) #define DMA_ERROR_CODE 0 -#include - void dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction dir); diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 3203e42..57ffaf2 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -26,7 +26,6 @@ config SPARC select RTC_CLASS select RTC_DRV_M48T59 select RTC_SYSTOHC - select HAVE_DMA_ATTRS select HAVE_DMA_API_DEBUG select HAVE_ARCH_JUMP_LABEL if SPARC64 select GENERIC_IRQ_SHOW diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h index 2777092..1180ae2 100644 --- a/arch/sparc/include/asm/dma-mapping.h +++ b/arch/sparc/include/asm/dma-mapping.h @@ -37,6 +37,4 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) return dma_ops; } -#include - #endif diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 6bfbe8b..de4a4ff 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -5,7 +5,6 @@ config TILE def_bool y select HAVE_PERF_EVENTS select USE_PMC if PERF_EVENTS - select HAVE_DMA_ATTRS select HAVE_DMA_API_DEBUG 
select HAVE_KVM if !TILEGX select GENERIC_FIND_FIRST_BIT diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h index c342736..01ceb4a 100644 --- a/arch/tile/include/asm/dma-mapping.h +++ b/arch/tile/include/asm/dma-mapping.h @@ -73,9 +73,6 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) } #define HAVE_ARCH_DMA_SET_MASK 1 - -#include - int dma_set_mask(struct device *dev, u64 mask); /* diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index 8773426..e5602ee 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig @@ -5,7 +5,6 @@ config UNICORE32 select ARCH_MIGHT_HAVE_PC_SERIO select HAVE_MEMBLOCK select HAVE_GENERIC_DMA_COHERENT - select HAVE_DMA_ATTRS select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 select GENERIC_ATOMIC64 diff --git a/arch/unicore32/include/asm/dma-mapping.h b/arch/unicore32/include/asm/dma-mapping.h index 8140e05..4749854 100644 --- a/arch/unicore32/include/asm/dma-mapping.h +++ b/arch/unicore32/include/asm/dma-mapping.h @@ -28,8 +28,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) return &swiotlb_dma_map_ops; } -#include - static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) { if (dev && dev->dma_mask) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 92b2a73..89159a6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -100,7 +100,6 @@ config X86 select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_STACKOVERFLOW select HAVE_DMA_API_DEBUG - select HAVE_DMA_ATTRS select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 953b726..3a27b93 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -46,8 +46,6 @@ bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp); #define HAVE_ARCH_DMA_SUPPORTED 1 extern int dma_supported(struct device *hwdev, u64 
mask); -#include - extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag, struct dma_attrs *attrs); diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 82044f7..e9df156 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -15,7 +15,6 @@ config XTENSA select GENERIC_PCI_IOMAP select GENERIC_SCHED_CLOCK select HAVE_DMA_API_DEBUG - select HAVE_DMA_ATTRS select HAVE_FUNCTION_TRACER select HAVE_FUTEX_CMPXCHG if !MMU select HAVE_IRQ_TIME_ACCOUNTING diff --git a/arch/xtensa/include/asm/dma-mapping.h b/arch/xtensa/include/asm/dma-mapping.h index 66c9ba2..87b7a7d 100644 --- a/arch/xtensa/include/asm/dma-mapping.h +++ b/arch/xtensa/include/asm/dma-mapping.h @@ -30,8 +30,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) return &xtensa_dma_map_ops; } -#include - void dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction direction); diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 59babd5..8ae7ab6 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -82,13 +82,13 @@ config DRM_TTM config DRM_GEM_CMA_HELPER bool - depends on DRM && HAVE_DMA_ATTRS + depends on DRM help Choose this if you need the GEM CMA helper functions config DRM_KMS_CMA_HELPER bool - depends on DRM && HAVE_DMA_ATTRS + depends on DRM select DRM_GEM_CMA_HELPER select DRM_KMS_FB_HELPER select FB_SYS_FILLRECT diff --git a/drivers/gpu/drm/imx/Kconfig b/drivers/gpu/drm/imx/Kconfig index 35ca4f0..a1844b5 100644 --- a/drivers/gpu/drm/imx/Kconfig +++ b/drivers/gpu/drm/imx/Kconfig @@ -5,7 +5,7 @@ config DRM_IMX select VIDEOMODE_HELPERS select DRM_GEM_CMA_HELPER select DRM_KMS_CMA_HELPER - depends on DRM && (ARCH_MXC || ARCH_MULTIPLATFORM) && HAVE_DMA_ATTRS + depends on DRM && (ARCH_MXC || ARCH_MULTIPLATFORM) depends on IMX_IPUV3_CORE help enable i.MX graphics support diff --git a/drivers/gpu/drm/rcar-du/Kconfig b/drivers/gpu/drm/rcar-du/Kconfig index 
d4e0a39..96dcd4a7 100644 --- a/drivers/gpu/drm/rcar-du/Kconfig +++ b/drivers/gpu/drm/rcar-du/Kconfig @@ -1,6 +1,6 @@ config DRM_RCAR_DU tristate "DRM Support for R-Car Display Unit" - depends on DRM && ARM && HAVE_DMA_ATTRS && OF + depends on DRM && ARM && OF depends on ARCH_SHMOBILE || COMPILE_TEST select DRM_KMS_HELPER select DRM_KMS_CMA_HELPER diff --git a/drivers/gpu/drm/shmobile/Kconfig b/drivers/gpu/drm/shmobile/Kconfig index b9202aa..8d17d00 100644 --- a/drivers/gpu/drm/shmobile/Kconfig +++ b/drivers/gpu/drm/shmobile/Kconfig @@ -1,6 +1,6 @@ config DRM_SHMOBILE tristate "DRM Support for SH Mobile" - depends on DRM && ARM && HAVE_DMA_ATTRS + depends on DRM && ARM depends on ARCH_SHMOBILE || COMPILE_TEST depends on FB_SH_MOBILE_MERAM || !FB_SH_MOBILE_MERAM select BACKLIGHT_CLASS_DEVICE diff --git a/drivers/gpu/drm/sti/Kconfig b/drivers/gpu/drm/sti/Kconfig index 10c1b19..5ad43a1 100644 --- a/drivers/gpu/drm/sti/Kconfig +++ b/drivers/gpu/drm/sti/Kconfig @@ -1,6 +1,6 @@ config DRM_STI tristate "DRM Support for STMicroelectronics SoC stiH41x Series" - depends on DRM && (SOC_STIH415 || SOC_STIH416 || ARCH_MULTIPLATFORM) && HAVE_DMA_ATTRS + depends on DRM && (SOC_STIH415 || SOC_STIH416 || ARCH_MULTIPLATFORM) select RESET_CONTROLLER select DRM_KMS_HELPER select DRM_GEM_CMA_HELPER diff --git a/drivers/gpu/drm/tilcdc/Kconfig b/drivers/gpu/drm/tilcdc/Kconfig index 78beafb..f60a1ec 100644 --- a/drivers/gpu/drm/tilcdc/Kconfig +++ b/drivers/gpu/drm/tilcdc/Kconfig @@ -1,6 +1,6 @@ config DRM_TILCDC tristate "DRM Support for TI LCDC Display Controller" - depends on DRM && OF && ARM && HAVE_DMA_ATTRS + depends on DRM && OF && ARM select DRM_KMS_HELPER select DRM_KMS_FB_HELPER select DRM_KMS_CMA_HELPER diff --git a/drivers/gpu/drm/vc4/Kconfig b/drivers/gpu/drm/vc4/Kconfig index 2d7d115..5848104 100644 --- a/drivers/gpu/drm/vc4/Kconfig +++ b/drivers/gpu/drm/vc4/Kconfig @@ -1,7 +1,7 @@ config DRM_VC4 tristate "Broadcom VC4 Graphics" depends on ARCH_BCM2835 || COMPILE_TEST - 
depends on DRM && HAVE_DMA_ATTRS + depends on DRM select DRM_KMS_HELPER select DRM_KMS_CMA_HELPER select DRM_GEM_CMA_HELPER diff --git a/drivers/media/platform/Kconfig b/drivers/media/platform/Kconfig index 0c53805..5263594 100644 --- a/drivers/media/platform/Kconfig +++ b/drivers/media/platform/Kconfig @@ -216,7 +216,6 @@ config VIDEO_STI_BDISP tristate "STMicroelectronics BDISP 2D blitter driver" depends on VIDEO_DEV && VIDEO_V4L2 depends on ARCH_STI || COMPILE_TEST - depends on HAVE_DMA_ATTRS select VIDEOBUF2_DMA_CONTIG select V4L2_MEM2MEM_DEV help diff --git a/include/asm-generic/dma-mapping-broken.h b/include/asm-generic/dma-mapping-broken.h deleted file mode 100644 index 6c32af9..0000000 --- a/include/asm-generic/dma-mapping-broken.h +++ /dev/null @@ -1,95 +0,0 @@ -#ifndef _ASM_GENERIC_DMA_MAPPING_H -#define _ASM_GENERIC_DMA_MAPPING_H - -/* define the dma api to allow compilation but not linking of - * dma dependent code. Code that depends on the dma-mapping - * API needs to set 'depends on HAS_DMA' in its Kconfig - */ - -struct scatterlist; - -extern void * -dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t flag); - -extern void -dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t dma_handle); - -static inline void *dma_alloc_attrs(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, - struct dma_attrs *attrs) -{ - /* attrs is not supported and ignored */ - return dma_alloc_coherent(dev, size, dma_handle, flag); -} - -static inline void dma_free_attrs(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t dma_handle, - struct dma_attrs *attrs) -{ - /* attrs is not supported and ignored */ - dma_free_coherent(dev, size, cpu_addr, dma_handle); -} - -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) - -extern dma_addr_t -dma_map_single(struct device *dev, void *ptr, size_t size, - 
enum dma_data_direction direction); - -extern void -dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, - enum dma_data_direction direction); - -extern int -dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction direction); - -extern void -dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries, - enum dma_data_direction direction); - -extern dma_addr_t -dma_map_page(struct device *dev, struct page *page, unsigned long offset, - size_t size, enum dma_data_direction direction); - -extern void -dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, - enum dma_data_direction direction); - -extern void -dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction); - -extern void -dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction direction); - -extern void -dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, - enum dma_data_direction direction); - -#define dma_sync_single_for_device dma_sync_single_for_cpu -#define dma_sync_single_range_for_device dma_sync_single_range_for_cpu -#define dma_sync_sg_for_device dma_sync_sg_for_cpu - -extern int -dma_mapping_error(struct device *dev, dma_addr_t dma_addr); - -extern int -dma_supported(struct device *dev, u64 mask); - -extern int -dma_set_mask(struct device *dev, u64 mask); - -extern int -dma_get_cache_alignment(void); - -extern void -dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction direction); - -#endif /* _ASM_GENERIC_DMA_MAPPING_H */ diff --git a/include/asm-generic/dma-mapping-common.h b/include/asm-generic/dma-mapping-common.h deleted file mode 100644 index b1bc954..0000000 --- a/include/asm-generic/dma-mapping-common.h +++ /dev/null @@ -1,358 +0,0 @@ -#ifndef _ASM_GENERIC_DMA_MAPPING_H -#define _ASM_GENERIC_DMA_MAPPING_H - -#include 
-#include -#include -#include -#include -#include - -static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr, - size_t size, - enum dma_data_direction dir, - struct dma_attrs *attrs) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - dma_addr_t addr; - - kmemcheck_mark_initialized(ptr, size); - BUG_ON(!valid_dma_direction(dir)); - addr = ops->map_page(dev, virt_to_page(ptr), - (unsigned long)ptr & ~PAGE_MASK, size, - dir, attrs); - debug_dma_map_page(dev, virt_to_page(ptr), - (unsigned long)ptr & ~PAGE_MASK, size, - dir, addr, true); - return addr; -} - -static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr, - size_t size, - enum dma_data_direction dir, - struct dma_attrs *attrs) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->unmap_page) - ops->unmap_page(dev, addr, size, dir, attrs); - debug_dma_unmap_page(dev, addr, size, dir, true); -} - -/* - * dma_maps_sg_attrs returns 0 on error and > 0 on success. - * It should never return a value < 0. 
- */ -static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir, - struct dma_attrs *attrs) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - int i, ents; - struct scatterlist *s; - - for_each_sg(sg, s, nents, i) - kmemcheck_mark_initialized(sg_virt(s), s->length); - BUG_ON(!valid_dma_direction(dir)); - ents = ops->map_sg(dev, sg, nents, dir, attrs); - BUG_ON(ents < 0); - debug_dma_map_sg(dev, sg, nents, ents, dir); - - return ents; -} - -static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir, - struct dma_attrs *attrs) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - debug_dma_unmap_sg(dev, sg, nents, dir); - if (ops->unmap_sg) - ops->unmap_sg(dev, sg, nents, dir, attrs); -} - -static inline dma_addr_t dma_map_page(struct device *dev, struct page *page, - size_t offset, size_t size, - enum dma_data_direction dir) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - dma_addr_t addr; - - kmemcheck_mark_initialized(page_address(page) + offset, size); - BUG_ON(!valid_dma_direction(dir)); - addr = ops->map_page(dev, page, offset, size, dir, NULL); - debug_dma_map_page(dev, page, offset, size, dir, addr, false); - - return addr; -} - -static inline void dma_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->unmap_page) - ops->unmap_page(dev, addr, size, dir, NULL); - debug_dma_unmap_page(dev, addr, size, dir, false); -} - -static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, - size_t size, - enum dma_data_direction dir) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->sync_single_for_cpu) - ops->sync_single_for_cpu(dev, addr, size, dir); - debug_dma_sync_single_for_cpu(dev, addr, size, 
dir); -} - -static inline void dma_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, - enum dma_data_direction dir) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->sync_single_for_device) - ops->sync_single_for_device(dev, addr, size, dir); - debug_dma_sync_single_for_device(dev, addr, size, dir); -} - -static inline void dma_sync_single_range_for_cpu(struct device *dev, - dma_addr_t addr, - unsigned long offset, - size_t size, - enum dma_data_direction dir) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->sync_single_for_cpu) - ops->sync_single_for_cpu(dev, addr + offset, size, dir); - debug_dma_sync_single_range_for_cpu(dev, addr, offset, size, dir); -} - -static inline void dma_sync_single_range_for_device(struct device *dev, - dma_addr_t addr, - unsigned long offset, - size_t size, - enum dma_data_direction dir) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->sync_single_for_device) - ops->sync_single_for_device(dev, addr + offset, size, dir); - debug_dma_sync_single_range_for_device(dev, addr, offset, size, dir); -} - -static inline void -dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->sync_sg_for_cpu) - ops->sync_sg_for_cpu(dev, sg, nelems, dir); - debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir); -} - -static inline void -dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (ops->sync_sg_for_device) - ops->sync_sg_for_device(dev, sg, nelems, dir); - debug_dma_sync_sg_for_device(dev, sg, nelems, dir); - -} - -#define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, 
r, NULL) -#define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, NULL) -#define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, NULL) -#define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, NULL) - -extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size); - -void *dma_common_contiguous_remap(struct page *page, size_t size, - unsigned long vm_flags, - pgprot_t prot, const void *caller); - -void *dma_common_pages_remap(struct page **pages, size_t size, - unsigned long vm_flags, pgprot_t prot, - const void *caller); -void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags); - -/** - * dma_mmap_attrs - map a coherent DMA allocation into user space - * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices - * @vma: vm_area_struct describing requested user mapping - * @cpu_addr: kernel CPU-view address returned from dma_alloc_attrs - * @handle: device-view address returned from dma_alloc_attrs - * @size: size of memory originally requested in dma_alloc_attrs - * @attrs: attributes of mapping properties requested in dma_alloc_attrs - * - * Map a coherent DMA buffer previously allocated by dma_alloc_attrs - * into user space. The coherent DMA buffer must not be freed by the - * driver until the user space mapping has been released. 
- */ -static inline int -dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, - dma_addr_t dma_addr, size_t size, struct dma_attrs *attrs) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - BUG_ON(!ops); - if (ops->mmap) - return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs); - return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size); -} - -#define dma_mmap_coherent(d, v, c, h, s) dma_mmap_attrs(d, v, c, h, s, NULL) - -int -dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, - void *cpu_addr, dma_addr_t dma_addr, size_t size); - -static inline int -dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr, - dma_addr_t dma_addr, size_t size, struct dma_attrs *attrs) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - BUG_ON(!ops); - if (ops->get_sgtable) - return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size, - attrs); - return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, size); -} - -#define dma_get_sgtable(d, t, v, h, s) dma_get_sgtable_attrs(d, t, v, h, s, NULL) - -#ifndef arch_dma_alloc_attrs -#define arch_dma_alloc_attrs(dev, flag) (true) -#endif - -static inline void *dma_alloc_attrs(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, - struct dma_attrs *attrs) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - void *cpu_addr; - - BUG_ON(!ops); - - if (dma_alloc_from_coherent(dev, size, dma_handle, &cpu_addr)) - return cpu_addr; - - if (!arch_dma_alloc_attrs(&dev, &flag)) - return NULL; - if (!ops->alloc) - return NULL; - - cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs); - debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr); - return cpu_addr; -} - -static inline void dma_free_attrs(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t dma_handle, - struct dma_attrs *attrs) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!ops); - WARN_ON(irqs_disabled()); - - if (dma_release_from_coherent(dev, get_order(size), cpu_addr)) - 
return; - - if (!ops->free) - return; - - debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); - ops->free(dev, size, cpu_addr, dma_handle, attrs); -} - -static inline void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag) -{ - return dma_alloc_attrs(dev, size, dma_handle, flag, NULL); -} - -static inline void dma_free_coherent(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t dma_handle) -{ - return dma_free_attrs(dev, size, cpu_addr, dma_handle, NULL); -} - -static inline void *dma_alloc_noncoherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp) -{ - DEFINE_DMA_ATTRS(attrs); - - dma_set_attr(DMA_ATTR_NON_CONSISTENT, &attrs); - return dma_alloc_attrs(dev, size, dma_handle, gfp, &attrs); -} - -static inline void dma_free_noncoherent(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t dma_handle) -{ - DEFINE_DMA_ATTRS(attrs); - - dma_set_attr(DMA_ATTR_NON_CONSISTENT, &attrs); - dma_free_attrs(dev, size, cpu_addr, dma_handle, &attrs); -} - -static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - debug_dma_mapping_error(dev, dma_addr); - - if (get_dma_ops(dev)->mapping_error) - return get_dma_ops(dev)->mapping_error(dev, dma_addr); - -#ifdef DMA_ERROR_CODE - return dma_addr == DMA_ERROR_CODE; -#else - return 0; -#endif -} - -#ifndef HAVE_ARCH_DMA_SUPPORTED -static inline int dma_supported(struct device *dev, u64 mask) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - - if (!ops) - return 0; - if (!ops->dma_supported) - return 1; - return ops->dma_supported(dev, mask); -} -#endif - -#ifndef HAVE_ARCH_DMA_SET_MASK -static inline int dma_set_mask(struct device *dev, u64 mask) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - - if (ops->set_dma_mask) - return ops->set_dma_mask(dev, mask); - - if (!dev->dma_mask || !dma_supported(dev, mask)) - return -EIO; - *dev->dma_mask = mask; - return 0; -} -#endif - -#endif diff --git a/include/linux/dma-attrs.h 
b/include/linux/dma-attrs.h index c8e1831..99c0be0 100644 --- a/include/linux/dma-attrs.h +++ b/include/linux/dma-attrs.h @@ -41,7 +41,6 @@ static inline void init_dma_attrs(struct dma_attrs *attrs) bitmap_zero(attrs->flags, __DMA_ATTRS_LONGS); } -#ifdef CONFIG_HAVE_DMA_ATTRS /** * dma_set_attr - set a specific attribute * @attr: attribute to set @@ -67,14 +66,5 @@ static inline int dma_get_attr(enum dma_attr attr, struct dma_attrs *attrs) BUG_ON(attr >= DMA_ATTR_MAX); return test_bit(attr, attrs->flags); } -#else /* !CONFIG_HAVE_DMA_ATTRS */ -static inline void dma_set_attr(enum dma_attr attr, struct dma_attrs *attrs) -{ -} -static inline int dma_get_attr(enum dma_attr attr, struct dma_attrs *attrs) -{ - return 0; -} -#endif /* CONFIG_HAVE_DMA_ATTRS */ #endif /* _DMA_ATTR_H */ diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 2e551e2..cc0517b 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -6,8 +6,12 @@ #include #include #include +#include #include #include +#include +#include +#include /* * A dma_addr_t can hold any valid DMA or bus address for the platform. @@ -86,7 +90,363 @@ static inline int is_device_dma_capable(struct device *dev) #ifdef CONFIG_HAS_DMA #include #else -#include +/* + * Define the dma api to allow compilation but not linking of + * dma dependent code. 
Code that depends on the dma-mapping + * API needs to set 'depends on HAS_DMA' in its Kconfig + */ +extern struct dma_map_ops bad_dma_ops; +static inline struct dma_map_ops *get_dma_ops(struct device *dev) +{ + return &bad_dma_ops; +} +#endif + +static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr, + size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + dma_addr_t addr; + + kmemcheck_mark_initialized(ptr, size); + BUG_ON(!valid_dma_direction(dir)); + addr = ops->map_page(dev, virt_to_page(ptr), + (unsigned long)ptr & ~PAGE_MASK, size, + dir, attrs); + debug_dma_map_page(dev, virt_to_page(ptr), + (unsigned long)ptr & ~PAGE_MASK, size, + dir, addr, true); + return addr; +} + +static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr, + size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->unmap_page) + ops->unmap_page(dev, addr, size, dir, attrs); + debug_dma_unmap_page(dev, addr, size, dir, true); +} + +/* + * dma_maps_sg_attrs returns 0 on error and > 0 on success. + * It should never return a value < 0. 
+ */ +static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + int i, ents; + struct scatterlist *s; + + for_each_sg(sg, s, nents, i) + kmemcheck_mark_initialized(sg_virt(s), s->length); + BUG_ON(!valid_dma_direction(dir)); + ents = ops->map_sg(dev, sg, nents, dir, attrs); + BUG_ON(ents < 0); + debug_dma_map_sg(dev, sg, nents, ents, dir); + + return ents; +} + +static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + debug_dma_unmap_sg(dev, sg, nents, dir); + if (ops->unmap_sg) + ops->unmap_sg(dev, sg, nents, dir, attrs); +} + +static inline dma_addr_t dma_map_page(struct device *dev, struct page *page, + size_t offset, size_t size, + enum dma_data_direction dir) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + dma_addr_t addr; + + kmemcheck_mark_initialized(page_address(page) + offset, size); + BUG_ON(!valid_dma_direction(dir)); + addr = ops->map_page(dev, page, offset, size, dir, NULL); + debug_dma_map_page(dev, page, offset, size, dir, addr, false); + + return addr; +} + +static inline void dma_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->unmap_page) + ops->unmap_page(dev, addr, size, dir, NULL); + debug_dma_unmap_page(dev, addr, size, dir, false); +} + +static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, + size_t size, + enum dma_data_direction dir) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_single_for_cpu) + ops->sync_single_for_cpu(dev, addr, size, dir); + debug_dma_sync_single_for_cpu(dev, addr, size, 
dir); +} + +static inline void dma_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_single_for_device) + ops->sync_single_for_device(dev, addr, size, dir); + debug_dma_sync_single_for_device(dev, addr, size, dir); +} + +static inline void dma_sync_single_range_for_cpu(struct device *dev, + dma_addr_t addr, + unsigned long offset, + size_t size, + enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_single_for_cpu) + ops->sync_single_for_cpu(dev, addr + offset, size, dir); + debug_dma_sync_single_range_for_cpu(dev, addr, offset, size, dir); +} + +static inline void dma_sync_single_range_for_device(struct device *dev, + dma_addr_t addr, + unsigned long offset, + size_t size, + enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_single_for_device) + ops->sync_single_for_device(dev, addr + offset, size, dir); + debug_dma_sync_single_range_for_device(dev, addr, offset, size, dir); +} + +static inline void +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_sg_for_cpu) + ops->sync_sg_for_cpu(dev, sg, nelems, dir); + debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir); +} + +static inline void +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops->sync_sg_for_device) + ops->sync_sg_for_device(dev, sg, nelems, dir); + debug_dma_sync_sg_for_device(dev, sg, nelems, dir); + +} + +#define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, 
r, NULL) +#define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, NULL) +#define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, NULL) +#define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, NULL) + +extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size); + +void *dma_common_contiguous_remap(struct page *page, size_t size, + unsigned long vm_flags, + pgprot_t prot, const void *caller); + +void *dma_common_pages_remap(struct page **pages, size_t size, + unsigned long vm_flags, pgprot_t prot, + const void *caller); +void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags); + +/** + * dma_mmap_attrs - map a coherent DMA allocation into user space + * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices + * @vma: vm_area_struct describing requested user mapping + * @cpu_addr: kernel CPU-view address returned from dma_alloc_attrs + * @handle: device-view address returned from dma_alloc_attrs + * @size: size of memory originally requested in dma_alloc_attrs + * @attrs: attributes of mapping properties requested in dma_alloc_attrs + * + * Map a coherent DMA buffer previously allocated by dma_alloc_attrs + * into user space. The coherent DMA buffer must not be freed by the + * driver until the user space mapping has been released. 
+ */ +static inline int +dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, + dma_addr_t dma_addr, size_t size, struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + BUG_ON(!ops); + if (ops->mmap) + return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs); + return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size); +} + +#define dma_mmap_coherent(d, v, c, h, s) dma_mmap_attrs(d, v, c, h, s, NULL) + +int +dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t dma_addr, size_t size); + +static inline int +dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr, + dma_addr_t dma_addr, size_t size, struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + BUG_ON(!ops); + if (ops->get_sgtable) + return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size, + attrs); + return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, size); +} + +#define dma_get_sgtable(d, t, v, h, s) dma_get_sgtable_attrs(d, t, v, h, s, NULL) + +#ifndef arch_dma_alloc_attrs +#define arch_dma_alloc_attrs(dev, flag) (true) +#endif + +static inline void *dma_alloc_attrs(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + void *cpu_addr; + + BUG_ON(!ops); + + if (dma_alloc_from_coherent(dev, size, dma_handle, &cpu_addr)) + return cpu_addr; + + if (!arch_dma_alloc_attrs(&dev, &flag)) + return NULL; + if (!ops->alloc) + return NULL; + + cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs); + debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr); + return cpu_addr; +} + +static inline void dma_free_attrs(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_handle, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!ops); + WARN_ON(irqs_disabled()); + + if (dma_release_from_coherent(dev, get_order(size), cpu_addr)) + 
return; + + if (!ops->free) + return; + + debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); + ops->free(dev, size, cpu_addr, dma_handle, attrs); +} + +static inline void *dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + return dma_alloc_attrs(dev, size, dma_handle, flag, NULL); +} + +static inline void dma_free_coherent(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_handle) +{ + return dma_free_attrs(dev, size, cpu_addr, dma_handle, NULL); +} + +static inline void *dma_alloc_noncoherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp) +{ + DEFINE_DMA_ATTRS(attrs); + + dma_set_attr(DMA_ATTR_NON_CONSISTENT, &attrs); + return dma_alloc_attrs(dev, size, dma_handle, gfp, &attrs); +} + +static inline void dma_free_noncoherent(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_handle) +{ + DEFINE_DMA_ATTRS(attrs); + + dma_set_attr(DMA_ATTR_NON_CONSISTENT, &attrs); + dma_free_attrs(dev, size, cpu_addr, dma_handle, &attrs); +} + +static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + debug_dma_mapping_error(dev, dma_addr); + + if (get_dma_ops(dev)->mapping_error) + return get_dma_ops(dev)->mapping_error(dev, dma_addr); + +#ifdef DMA_ERROR_CODE + return dma_addr == DMA_ERROR_CODE; +#else + return 0; +#endif +} + +#ifndef HAVE_ARCH_DMA_SUPPORTED +static inline int dma_supported(struct device *dev, u64 mask) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + if (!ops) + return 0; + if (!ops->dma_supported) + return 1; + return ops->dma_supported(dev, mask); +} +#endif + +#ifndef HAVE_ARCH_DMA_SET_MASK +static inline int dma_set_mask(struct device *dev, u64 mask) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + if (ops->set_dma_mask) + return ops->set_dma_mask(dev, mask); + + if (!dev->dma_mask || !dma_supported(dev, mask)) + return -EIO; + *dev->dma_mask = mask; + return 0; +} #endif static inline u64 dma_get_mask(struct device *dev) @@ 
-259,22 +619,6 @@ static inline void dmam_release_declared_memory(struct device *dev) } #endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */ -#ifndef CONFIG_HAVE_DMA_ATTRS -struct dma_attrs; - -#define dma_map_single_attrs(dev, cpu_addr, size, dir, attrs) \ - dma_map_single(dev, cpu_addr, size, dir) - -#define dma_unmap_single_attrs(dev, dma_addr, size, dir, attrs) \ - dma_unmap_single(dev, dma_addr, size, dir) - -#define dma_map_sg_attrs(dev, sgl, nents, dir, attrs) \ - dma_map_sg(dev, sgl, nents, dir) - -#define dma_unmap_sg_attrs(dev, sgl, nents, dir, attrs) \ - dma_unmap_sg(dev, sgl, nents, dir) - -#else static inline void *dma_alloc_writecombine(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t gfp) { @@ -300,7 +644,6 @@ static inline int dma_mmap_writecombine(struct device *dev, dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs); return dma_mmap_attrs(dev, vma, cpu_addr, dma_addr, size, &attrs); } -#endif /* CONFIG_HAVE_DMA_ATTRS */ #ifdef CONFIG_NEED_DMA_MAP_STATE #define DEFINE_DMA_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME -- cgit v0.10.2 From 20d666e41166f8023ff3d960e832d87ded18c5c4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jan 2016 15:02:09 -0800 Subject: dma-mapping: remove <asm-generic/dma-coherent.h> This wasn't an asm-generic header to start with, and can be merged into dma-mapping.h trivially. Signed-off-by: Christoph Hellwig Cc: "David S.
Miller" Cc: Aurelien Jacquiot Cc: Chris Metcalf Cc: David Howells Cc: Geert Uytterhoeven Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Helge Deller Cc: James Hogan Cc: Jesper Nilsson Cc: Koichi Yasutake Cc: Ley Foon Tan Cc: Mark Salter Cc: Mikael Starvik Cc: Steven Miao Cc: Vineet Gupta Cc: Christian Borntraeger Cc: Joerg Roedel Cc: Sebastian Ott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/xtensa/include/asm/dma-mapping.h b/arch/xtensa/include/asm/dma-mapping.h index 87b7a7d..3fc1170 100644 --- a/arch/xtensa/include/asm/dma-mapping.h +++ b/arch/xtensa/include/asm/dma-mapping.h @@ -13,8 +13,6 @@ #include #include -#include - #include #include diff --git a/drivers/base/dma-mapping.c b/drivers/base/dma-mapping.c index 381e39d..d799662 100644 --- a/drivers/base/dma-mapping.c +++ b/drivers/base/dma-mapping.c @@ -12,7 +12,6 @@ #include #include #include -#include /* * Managed DMA API @@ -167,7 +166,7 @@ void dmam_free_noncoherent(struct device *dev, size_t size, void *vaddr, } EXPORT_SYMBOL(dmam_free_noncoherent); -#ifdef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY +#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT static void dmam_coherent_decl_release(struct device *dev, void *res) { diff --git a/include/asm-generic/dma-coherent.h b/include/asm-generic/dma-coherent.h deleted file mode 100644 index 0297e58..0000000 --- a/include/asm-generic/dma-coherent.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef DMA_COHERENT_H -#define DMA_COHERENT_H - -#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT -/* - * These three functions are only for dma allocator. - * Don't use them in device drivers. 
- */ -int dma_alloc_from_coherent(struct device *dev, ssize_t size, - dma_addr_t *dma_handle, void **ret); -int dma_release_from_coherent(struct device *dev, int order, void *vaddr); - -int dma_mmap_from_coherent(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, size_t size, int *ret); -/* - * Standard interface - */ -#define ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY -int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, - dma_addr_t device_addr, size_t size, int flags); - -void dma_release_declared_memory(struct device *dev); - -void *dma_mark_declared_memory_occupied(struct device *dev, - dma_addr_t device_addr, size_t size); -#else -#define dma_alloc_from_coherent(dev, size, handle, ret) (0) -#define dma_release_from_coherent(dev, order, vaddr) (0) -#define dma_mmap_from_coherent(dev, vma, vaddr, order, ret) (0) -#endif - -#endif diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index cc0517b..d6b575b 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -11,7 +11,6 @@ #include #include #include -#include /* * A dma_addr_t can hold any valid DMA or bus address for the platform. @@ -87,6 +86,23 @@ static inline int is_device_dma_capable(struct device *dev) return dev->dma_mask != NULL && *dev->dma_mask != DMA_MASK_NONE; } +#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT +/* + * These three functions are only for dma allocator. + * Don't use them in device drivers. 
+ */ +int dma_alloc_from_coherent(struct device *dev, ssize_t size, + dma_addr_t *dma_handle, void **ret); +int dma_release_from_coherent(struct device *dev, int order, void *vaddr); + +int dma_mmap_from_coherent(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, size_t size, int *ret); +#else +#define dma_alloc_from_coherent(dev, size, handle, ret) (0) +#define dma_release_from_coherent(dev, order, vaddr) (0) +#define dma_mmap_from_coherent(dev, vma, vaddr, order, ret) (0) +#endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ + #ifdef CONFIG_HAS_DMA #include #else @@ -568,7 +584,13 @@ static inline int dma_get_cache_alignment(void) #define DMA_MEMORY_INCLUDES_CHILDREN 0x04 #define DMA_MEMORY_EXCLUSIVE 0x08 -#ifndef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY +#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT +int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, + dma_addr_t device_addr, size_t size, int flags); +void dma_release_declared_memory(struct device *dev); +void *dma_mark_declared_memory_occupied(struct device *dev, + dma_addr_t device_addr, size_t size); +#else static inline int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags) @@ -587,7 +609,7 @@ dma_mark_declared_memory_occupied(struct device *dev, { return ERR_PTR(-EBUSY); } -#endif +#endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ /* * Managed DMA API @@ -600,13 +622,13 @@ extern void *dmam_alloc_noncoherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp); extern void dmam_free_noncoherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle); -#ifdef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY +#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT extern int dmam_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags); extern void dmam_release_declared_memory(struct device *dev); -#else /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */ +#else /* 
CONFIG_HAVE_GENERIC_DMA_COHERENT */ static inline int dmam_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, gfp_t gfp) @@ -617,7 +639,7 @@ static inline int dmam_declare_coherent_memory(struct device *dev, static inline void dmam_release_declared_memory(struct device *dev) { } -#endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */ +#endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ static inline void *dma_alloc_writecombine(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t gfp) -- cgit v0.10.2 From 8e99469ab0f821bea77625cd4775ca529d4ca7d4 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 20 Jan 2016 15:02:12 -0800 Subject: dma-mapping: use offset_in_page macro Use offset_in_page macro instead of (addr & ~PAGE_MASK). Signed-off-by: Geliang Tang Acked-by: Will Deacon Cc: Christian Borntraeger Cc: Joerg Roedel Cc: Sebastian Ott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index d6b575b..75857cd 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -129,10 +129,10 @@ static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr, kmemcheck_mark_initialized(ptr, size); BUG_ON(!valid_dma_direction(dir)); addr = ops->map_page(dev, virt_to_page(ptr), - (unsigned long)ptr & ~PAGE_MASK, size, + offset_in_page(ptr), size, dir, attrs); debug_dma_map_page(dev, virt_to_page(ptr), - (unsigned long)ptr & ~PAGE_MASK, size, + offset_in_page(ptr), size, dir, addr, true); return addr; } -- cgit v0.10.2 From 545b5e2ad4771d23d4c67d0bcc18babd2070df13 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 20 Jan 2016 15:02:15 -0800 Subject: memstick: use sector_div instead of do_div do_div is the wrong way to divide a sector_t, as it is less efficient when sector_t is 32-bit wide. 
With the upcoming do_div optimizations, the kernel starts warning about this: drivers/memstick/core/ms_block.c: In function 'msb_io_work': include/asm-generic/div64.h:207:28: warning: comparison of distinct pointer types lacks a cast This changes the code to use sector_div instead, which always produces optimal code. Signed-off-by: Arnd Bergmann Cc: Maxim Levitsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index 24f2f84..84abf9d 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -1909,7 +1909,7 @@ static void msb_io_work(struct work_struct *work) lba = blk_rq_pos(msb->req); sector_div(lba, msb->page_size / 512); - page = do_div(lba, msb->pages_in_block); + page = sector_div(lba, msb->pages_in_block); if (rq_data_dir(msb->req) == READ) error = msb_do_read_request(msb, lba, page, sg, -- cgit v0.10.2 From 6d378dac7c4905db38f8127c4e618f0f627a4ced Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:18 -0800 Subject: mm: memcontrol: drop unused @css argument in memcg_init_kmem This series adds accounting of the historical "kmem" memory consumers to the cgroup2 memory controller. These consumers include the dentry cache, the inode cache, kernel stack pages, and a few others that are pointed out in patch 7/8. The footprint of these consumers is directly tied to userspace activity in common workloads, and so they have to be part of the minimally viable configuration in order to present a complete feature to our users. The cgroup2 interface of the memory controller is far from complete, but this series, along with the socket memory accounting series, provides the final semantic changes for the existing memory knobs in the cgroup2 interface, which is scheduled for initial release in the next merge window. 
This patch (of 8): Remove unused css argument from memcg_init_kmem() Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Acked-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/net/tcp_memcontrol.h b/include/net/tcp_memcontrol.h index 01ff7c6..020c2de 100644 --- a/include/net/tcp_memcontrol.h +++ b/include/net/tcp_memcontrol.h @@ -4,6 +4,7 @@ struct cgroup_subsys; struct mem_cgroup; -int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss); +int tcp_init_cgroup(struct mem_cgroup *memcg); void tcp_destroy_cgroup(struct mem_cgroup *memcg); + #endif /* _TCP_MEMCG_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0eda673..f21f29c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3583,7 +3583,7 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, } #ifdef CONFIG_MEMCG_KMEM -static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) +static int memcg_init_kmem(struct mem_cgroup *memcg) { int ret; @@ -3591,7 +3591,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) if (ret) return ret; - return tcp_init_cgroup(memcg, ss); + return tcp_init_cgroup(memcg); } static void memcg_deactivate_kmem(struct mem_cgroup *memcg) @@ -4274,7 +4274,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) } mutex_unlock(&memcg_create_mutex); - ret = memcg_init_kmem(memcg, &memory_cgrp_subsys); + ret = memcg_init_kmem(memcg); if (ret) return ret; diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 18bc7f7..133eb5e 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -6,7 +6,7 @@ #include #include -int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) +int tcp_init_cgroup(struct mem_cgroup *memcg) { struct mem_cgroup *parent = parent_mem_cgroup(memcg); struct page_counter *counter_parent = NULL; -- cgit v0.10.2 From b15aac110a45c52d7f47ab8ee2d68f98044cfe6c Mon
Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:21 -0800 Subject: mm: memcontrol: remove double kmem page_counter init The kmem page_counter's limit is initialized to PAGE_COUNTER_MAX inside mem_cgroup_css_online(). There is no need to repeat this from memcg_propagate_kmem(). Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Acked-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f21f29c..71dced1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2861,8 +2861,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, } #ifdef CONFIG_MEMCG_KMEM -static int memcg_activate_kmem(struct mem_cgroup *memcg, - unsigned long nr_pages) +static int memcg_activate_kmem(struct mem_cgroup *memcg) { int err = 0; int memcg_id; @@ -2897,13 +2896,6 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, goto out; } - /* - * We couldn't have accounted to this cgroup, because it hasn't got - * activated yet, so this should succeed. - */ - err = page_counter_limit(&memcg->kmem, nr_pages); - VM_BUG_ON(err); - static_branch_inc(&memcg_kmem_enabled_key); /* * A memory cgroup is considered kmem-active as soon as it gets @@ -2924,10 +2916,14 @@ static int memcg_update_kmem_limit(struct mem_cgroup *memcg, int ret; mutex_lock(&memcg_limit_mutex); - if (!memcg_kmem_is_active(memcg)) - ret = memcg_activate_kmem(memcg, limit); - else - ret = page_counter_limit(&memcg->kmem, limit); + /* Top-level cgroup doesn't propagate from root */ + if (!memcg_kmem_is_active(memcg)) { + ret = memcg_activate_kmem(memcg); + if (ret) + goto out; + } + ret = page_counter_limit(&memcg->kmem, limit); +out: mutex_unlock(&memcg_limit_mutex); return ret; } @@ -2946,7 +2942,7 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) * after this point, because it has at least one child already. 
*/ if (memcg_kmem_is_active(parent)) - ret = memcg_activate_kmem(memcg, PAGE_COUNTER_MAX); + ret = memcg_activate_kmem(memcg); mutex_unlock(&memcg_limit_mutex); return ret; } -- cgit v0.10.2 From 567e9ab2e614e55feca20e8bcb54b629e9cc1a3b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:24 -0800 Subject: mm: memcontrol: give the kmem states more descriptive names On any given memcg, the kmem accounting feature has three separate states: not initialized, structures allocated, and actively accounting slab memory. These are represented through a combination of the kmem_acct_activated and kmem_acct_active flags, which is confusing. Convert to a kmem_state enum with the states NONE, ALLOCATED, and ONLINE. Then rename the functions to modify the state accordingly. This follows the nomenclature of css object states more closely. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Acked-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 189f04d..54dab4d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -152,6 +152,12 @@ struct mem_cgroup_thresholds { struct mem_cgroup_threshold_ary *spare; }; +enum memcg_kmem_state { + KMEM_NONE, + KMEM_ALLOCATED, + KMEM_ONLINE, +}; + /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. 
We would eventually like to provide @@ -233,8 +239,7 @@ struct mem_cgroup { #if defined(CONFIG_MEMCG_KMEM) /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; - bool kmem_acct_activated; - bool kmem_acct_active; + enum memcg_kmem_state kmem_state; #endif int last_scanned_node; @@ -750,9 +755,9 @@ static inline bool memcg_kmem_enabled(void) return static_branch_unlikely(&memcg_kmem_enabled_key); } -static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg) +static inline bool memcg_kmem_online(struct mem_cgroup *memcg) { - return memcg->kmem_acct_active; + return memcg->kmem_state == KMEM_ONLINE; } /* @@ -850,7 +855,7 @@ static inline bool memcg_kmem_enabled(void) return false; } -static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg) +static inline bool memcg_kmem_online(struct mem_cgroup *memcg) { return false; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 71dced1..24b6bde 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2378,7 +2378,7 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, struct page_counter *counter; int ret; - if (!memcg_kmem_is_active(memcg)) + if (!memcg_kmem_online(memcg)) return 0; if (!page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) @@ -2861,14 +2861,13 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, } #ifdef CONFIG_MEMCG_KMEM -static int memcg_activate_kmem(struct mem_cgroup *memcg) +static int memcg_online_kmem(struct mem_cgroup *memcg) { int err = 0; int memcg_id; BUG_ON(memcg->kmemcg_id >= 0); - BUG_ON(memcg->kmem_acct_activated); - BUG_ON(memcg->kmem_acct_active); + BUG_ON(memcg->kmem_state); /* * For simplicity, we won't allow this to be disabled. It also can't @@ -2898,14 +2897,13 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg) static_branch_inc(&memcg_kmem_enabled_key); /* - * A memory cgroup is considered kmem-active as soon as it gets + * A memory cgroup is considered kmem-online as soon as it gets * kmemcg_id. 
Setting the id after enabling static branching will * guarantee no one starts accounting before all call sites are * patched. */ memcg->kmemcg_id = memcg_id; - memcg->kmem_acct_activated = true; - memcg->kmem_acct_active = true; + memcg->kmem_state = KMEM_ONLINE; out: return err; } @@ -2917,8 +2915,8 @@ static int memcg_update_kmem_limit(struct mem_cgroup *memcg, mutex_lock(&memcg_limit_mutex); /* Top-level cgroup doesn't propagate from root */ - if (!memcg_kmem_is_active(memcg)) { - ret = memcg_activate_kmem(memcg); + if (!memcg_kmem_online(memcg)) { + ret = memcg_online_kmem(memcg); if (ret) goto out; } @@ -2938,11 +2936,12 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) mutex_lock(&memcg_limit_mutex); /* - * If the parent cgroup is not kmem-active now, it cannot be activated - * after this point, because it has at least one child already. + * If the parent cgroup is not kmem-online now, it cannot be + * onlined after this point, because it has at least one child + * already. */ - if (memcg_kmem_is_active(parent)) - ret = memcg_activate_kmem(memcg); + if (memcg_kmem_online(parent)) + ret = memcg_online_kmem(memcg); mutex_unlock(&memcg_limit_mutex); return ret; } @@ -3590,22 +3589,21 @@ static int memcg_init_kmem(struct mem_cgroup *memcg) return tcp_init_cgroup(memcg); } -static void memcg_deactivate_kmem(struct mem_cgroup *memcg) +static void memcg_offline_kmem(struct mem_cgroup *memcg) { struct cgroup_subsys_state *css; struct mem_cgroup *parent, *child; int kmemcg_id; - if (!memcg->kmem_acct_active) + if (memcg->kmem_state != KMEM_ONLINE) return; - /* - * Clear the 'active' flag before clearing memcg_caches arrays entries. - * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it - * guarantees no cache will be created for this cgroup after we are - * done (see memcg_create_kmem_cache()). + * Clear the online state before clearing memcg_caches array + * entries. 
The slab_mutex in memcg_deactivate_kmem_caches() + * guarantees that no cache will be created for this cgroup + * after we are done (see memcg_create_kmem_cache()). */ - memcg->kmem_acct_active = false; + memcg->kmem_state = KMEM_ALLOCATED; memcg_deactivate_kmem_caches(memcg); @@ -3636,9 +3634,9 @@ static void memcg_deactivate_kmem(struct mem_cgroup *memcg) memcg_free_cache_id(kmemcg_id); } -static void memcg_destroy_kmem(struct mem_cgroup *memcg) +static void memcg_free_kmem(struct mem_cgroup *memcg) { - if (memcg->kmem_acct_activated) { + if (memcg->kmem_state == KMEM_ALLOCATED) { memcg_destroy_kmem_caches(memcg); static_branch_dec(&memcg_kmem_enabled_key); WARN_ON(page_counter_read(&memcg->kmem)); @@ -3651,11 +3649,11 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) return 0; } -static void memcg_deactivate_kmem(struct mem_cgroup *memcg) +static void memcg_offline_kmem(struct mem_cgroup *memcg) { } -static void memcg_destroy_kmem(struct mem_cgroup *memcg) +static void memcg_free_kmem(struct mem_cgroup *memcg) { } #endif @@ -4308,7 +4306,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) vmpressure_cleanup(&memcg->vmpressure); - memcg_deactivate_kmem(memcg); + memcg_offline_kmem(memcg); wb_memcg_offline(memcg); } @@ -4324,7 +4322,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - memcg_destroy_kmem(memcg); + memcg_free_kmem(memcg); #ifdef CONFIG_INET if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_dec(&memcg_sockets_enabled_key); diff --git a/mm/slab_common.c b/mm/slab_common.c index e016178..8c262e6 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -503,10 +503,10 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, mutex_lock(&slab_mutex); /* - * The memory cgroup could have been deactivated while the cache + * The memory cgroup could have been offlined while the cache * creation 
work was pending. */ - if (!memcg_kmem_is_active(memcg)) + if (!memcg_kmem_online(memcg)) goto out_unlock; idx = memcg_cache_id(memcg); diff --git a/mm/vmscan.c b/mm/vmscan.c index 5ac8695..05dd182 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -411,7 +411,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct shrinker *shrinker; unsigned long freed = 0; - if (memcg && !memcg_kmem_is_active(memcg)) + if (memcg && !memcg_kmem_online(memcg)) return 0; if (nr_scanned == 0) -- cgit v0.10.2 From 8e0a891213fbddcec231c9d1d7577c320c77a25a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:26 -0800 Subject: mm: memcontrol: group kmem init and exit functions together Put all the related code to setup and teardown the kmem accounting state into the same location. No functional change intended. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Acked-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 24b6bde..3dd9fe3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2945,12 +2945,88 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) mutex_unlock(&memcg_limit_mutex); return ret; } + +static int memcg_init_kmem(struct mem_cgroup *memcg) +{ + int ret; + + ret = memcg_propagate_kmem(memcg); + if (ret) + return ret; + + return tcp_init_cgroup(memcg); +} + +static void memcg_offline_kmem(struct mem_cgroup *memcg) +{ + struct cgroup_subsys_state *css; + struct mem_cgroup *parent, *child; + int kmemcg_id; + + if (memcg->kmem_state != KMEM_ONLINE) + return; + /* + * Clear the online state before clearing memcg_caches array + * entries. The slab_mutex in memcg_deactivate_kmem_caches() + * guarantees that no cache will be created for this cgroup + * after we are done (see memcg_create_kmem_cache()). 
+ */ + memcg->kmem_state = KMEM_ALLOCATED; + + memcg_deactivate_kmem_caches(memcg); + + kmemcg_id = memcg->kmemcg_id; + BUG_ON(kmemcg_id < 0); + + parent = parent_mem_cgroup(memcg); + if (!parent) + parent = root_mem_cgroup; + + /* + * Change kmemcg_id of this cgroup and all its descendants to the + * parent's id, and then move all entries from this cgroup's list_lrus + * to ones of the parent. After we have finished, all list_lrus + * corresponding to this cgroup are guaranteed to remain empty. The + * ordering is imposed by list_lru_node->lock taken by + * memcg_drain_all_list_lrus(). + */ + css_for_each_descendant_pre(css, &memcg->css) { + child = mem_cgroup_from_css(css); + BUG_ON(child->kmemcg_id != kmemcg_id); + child->kmemcg_id = parent->kmemcg_id; + if (!memcg->use_hierarchy) + break; + } + memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); + + memcg_free_cache_id(kmemcg_id); +} + +static void memcg_free_kmem(struct mem_cgroup *memcg) +{ + if (memcg->kmem_state == KMEM_ALLOCATED) { + memcg_destroy_kmem_caches(memcg); + static_branch_dec(&memcg_kmem_enabled_key); + WARN_ON(page_counter_read(&memcg->kmem)); + } + tcp_destroy_cgroup(memcg); +} #else static int memcg_update_kmem_limit(struct mem_cgroup *memcg, unsigned long limit) { return -EINVAL; } +static int memcg_init_kmem(struct mem_cgroup *memcg) +{ + return 0; +} +static void memcg_offline_kmem(struct mem_cgroup *memcg) +{ +} +static void memcg_free_kmem(struct mem_cgroup *memcg) +{ +} #endif /* CONFIG_MEMCG_KMEM */ /* @@ -3577,87 +3653,6 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, return 0; } -#ifdef CONFIG_MEMCG_KMEM -static int memcg_init_kmem(struct mem_cgroup *memcg) -{ - int ret; - - ret = memcg_propagate_kmem(memcg); - if (ret) - return ret; - - return tcp_init_cgroup(memcg); -} - -static void memcg_offline_kmem(struct mem_cgroup *memcg) -{ - struct cgroup_subsys_state *css; - struct mem_cgroup *parent, *child; - int kmemcg_id; - - if (memcg->kmem_state != 
KMEM_ONLINE) - return; - /* - * Clear the online state before clearing memcg_caches array - * entries. The slab_mutex in memcg_deactivate_kmem_caches() - * guarantees that no cache will be created for this cgroup - * after we are done (see memcg_create_kmem_cache()). - */ - memcg->kmem_state = KMEM_ALLOCATED; - - memcg_deactivate_kmem_caches(memcg); - - kmemcg_id = memcg->kmemcg_id; - BUG_ON(kmemcg_id < 0); - - parent = parent_mem_cgroup(memcg); - if (!parent) - parent = root_mem_cgroup; - - /* - * Change kmemcg_id of this cgroup and all its descendants to the - * parent's id, and then move all entries from this cgroup's list_lrus - * to ones of the parent. After we have finished, all list_lrus - * corresponding to this cgroup are guaranteed to remain empty. The - * ordering is imposed by list_lru_node->lock taken by - * memcg_drain_all_list_lrus(). - */ - css_for_each_descendant_pre(css, &memcg->css) { - child = mem_cgroup_from_css(css); - BUG_ON(child->kmemcg_id != kmemcg_id); - child->kmemcg_id = parent->kmemcg_id; - if (!memcg->use_hierarchy) - break; - } - memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); - - memcg_free_cache_id(kmemcg_id); -} - -static void memcg_free_kmem(struct mem_cgroup *memcg) -{ - if (memcg->kmem_state == KMEM_ALLOCATED) { - memcg_destroy_kmem_caches(memcg); - static_branch_dec(&memcg_kmem_enabled_key); - WARN_ON(page_counter_read(&memcg->kmem)); - } - tcp_destroy_cgroup(memcg); -} -#else -static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) -{ - return 0; -} - -static void memcg_offline_kmem(struct mem_cgroup *memcg) -{ -} - -static void memcg_free_kmem(struct mem_cgroup *memcg) -{ -} -#endif - #ifdef CONFIG_CGROUP_WRITEBACK struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg) -- cgit v0.10.2 From 3893e302f6a377c4ef0f077f190bf760bf84e0be Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:29 -0800 Subject: mm: memcontrol: separate kmem code from legacy tcp accounting 
code The cgroup2 memory controller will include important in-kernel memory consumers per default, including socket memory, but it will no longer carry the historic tcp control interface. Separate the kmem state init from the tcp control interface init in preparation for that. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Acked-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3dd9fe3..7f8219b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2946,17 +2946,6 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) return ret; } -static int memcg_init_kmem(struct mem_cgroup *memcg) -{ - int ret; - - ret = memcg_propagate_kmem(memcg); - if (ret) - return ret; - - return tcp_init_cgroup(memcg); -} - static void memcg_offline_kmem(struct mem_cgroup *memcg) { struct cgroup_subsys_state *css; @@ -3009,7 +2998,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) static_branch_dec(&memcg_kmem_enabled_key); WARN_ON(page_counter_read(&memcg->kmem)); } - tcp_destroy_cgroup(memcg); } #else static int memcg_update_kmem_limit(struct mem_cgroup *memcg, @@ -3017,16 +3005,9 @@ static int memcg_update_kmem_limit(struct mem_cgroup *memcg, { return -EINVAL; } -static int memcg_init_kmem(struct mem_cgroup *memcg) -{ - return 0; -} static void memcg_offline_kmem(struct mem_cgroup *memcg) { } -static void memcg_free_kmem(struct mem_cgroup *memcg) -{ -} #endif /* CONFIG_MEMCG_KMEM */ /* @@ -4263,9 +4244,14 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) } mutex_unlock(&memcg_create_mutex); - ret = memcg_init_kmem(memcg); +#ifdef CONFIG_MEMCG_KMEM + ret = memcg_propagate_kmem(memcg); if (ret) return ret; + ret = tcp_init_cgroup(memcg); + if (ret) + return ret; +#endif #ifdef CONFIG_INET if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) @@ -4317,11 +4303,16 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) { struct 
mem_cgroup *memcg = mem_cgroup_from_css(css); - memcg_free_kmem(memcg); #ifdef CONFIG_INET if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_dec(&memcg_sockets_enabled_key); #endif + +#ifdef CONFIG_MEMCG_KMEM + memcg_free_kmem(memcg); + tcp_destroy_cgroup(memcg); +#endif + __mem_cgroup_free(memcg); } -- cgit v0.10.2 From 127424c86bb6cb87f0b563d9fdcfbbaf3c86ecec Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:32 -0800 Subject: mm: memcontrol: move kmem accounting code to CONFIG_MEMCG The cgroup2 memory controller will account important in-kernel memory consumers per default. Move all necessary components to CONFIG_MEMCG. Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Michal Hocko Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 2a6b994..cb0ba9f 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -40,7 +40,7 @@ struct list_lru_node { spinlock_t lock; /* global list, used for the root cgroup in cgroup aware lrus */ struct list_lru_one lru; -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) /* for cgroup aware lrus points to per cgroup lists, otherwise NULL */ struct list_lru_memcg *memcg_lrus; #endif @@ -48,7 +48,7 @@ struct list_lru_node { struct list_lru { struct list_lru_node *node; -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) struct list_head list; #endif }; diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 54dab4d..a87704e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -236,7 +236,7 @@ struct mem_cgroup { #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) struct cg_proto tcp_mem; #endif -#if defined(CONFIG_MEMCG_KMEM) +#ifndef CONFIG_SLOB /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; enum memcg_kmem_state kmem_state; @@ 
-735,7 +735,7 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) } #endif -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) extern struct static_key_false memcg_kmem_enabled_key; extern int memcg_nr_cache_ids; @@ -891,5 +891,6 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) static inline void memcg_kmem_put_cache(struct kmem_cache *cachep) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ + #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 02dabf2..f1e81e1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1476,10 +1476,10 @@ struct task_struct { unsigned in_iowait:1; #ifdef CONFIG_MEMCG unsigned memcg_may_oom:1; -#endif -#ifdef CONFIG_MEMCG_KMEM +#ifndef CONFIG_SLOB unsigned memcg_kmem_skip_account:1; #endif +#endif #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif diff --git a/include/linux/slab.h b/include/linux/slab.h index 3ffee74..3627d5c 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -86,7 +86,7 @@ #else # define SLAB_FAILSLAB 0x00000000UL #endif -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) # define SLAB_ACCOUNT 0x04000000UL /* Account to memcg */ #else # define SLAB_ACCOUNT 0x00000000UL diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 33d0490..cf139d3 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -69,7 +69,8 @@ struct kmem_cache { */ int obj_offset; #endif /* CONFIG_DEBUG_SLAB */ -#ifdef CONFIG_MEMCG_KMEM + +#ifdef CONFIG_MEMCG struct memcg_cache_params memcg_params; #endif diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 3388511..b7e57927 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -84,7 +84,7 @@ struct kmem_cache { #ifdef CONFIG_SYSFS struct kobject kobj; /* For sysfs */ #endif -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG struct 
memcg_cache_params memcg_params; int max_attr_size; /* for propagation, maximum size of a stored attr */ #ifdef CONFIG_SYSFS diff --git a/mm/list_lru.c b/mm/list_lru.c index afc71ea..1d05cb9 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -12,7 +12,7 @@ #include #include -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) static LIST_HEAD(list_lrus); static DEFINE_MUTEX(list_lrus_mutex); @@ -37,9 +37,9 @@ static void list_lru_register(struct list_lru *lru) static void list_lru_unregister(struct list_lru *lru) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) static inline bool list_lru_memcg_aware(struct list_lru *lru) { /* @@ -104,7 +104,7 @@ list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) { return &nlru->lru; } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ bool list_lru_add(struct list_lru *lru, struct list_head *item) { @@ -292,7 +292,7 @@ static void init_one_lru(struct list_lru_one *l) l->nr_items = 0; } -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus, int begin, int end) { @@ -529,7 +529,7 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) static void memcg_destroy_list_lru(struct list_lru *lru) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct lock_class_key *key) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7f8219b..fe51d5e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -297,7 +297,7 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return mem_cgroup_from_css(css); } -#ifdef CONFIG_MEMCG_KMEM +#ifndef CONFIG_SLOB /* * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. 
* The main reason for not using cgroup id for this: @@ -349,7 +349,7 @@ void memcg_put_cache_ids(void) DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); EXPORT_SYMBOL(memcg_kmem_enabled_key); -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* !CONFIG_SLOB */ static struct mem_cgroup_per_zone * mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) @@ -2203,7 +2203,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, unlock_page_lru(page, isolated); } -#ifdef CONFIG_MEMCG_KMEM +#ifndef CONFIG_SLOB static int memcg_alloc_cache_id(void) { int id, size; @@ -2424,7 +2424,7 @@ void __memcg_kmem_uncharge(struct page *page, int order) page->mem_cgroup = NULL; css_put_many(&memcg->css, nr_pages); } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* !CONFIG_SLOB */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -2860,7 +2860,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, } } -#ifdef CONFIG_MEMCG_KMEM +#ifndef CONFIG_SLOB static int memcg_online_kmem(struct mem_cgroup *memcg) { int err = 0; @@ -2908,24 +2908,6 @@ out: return err; } -static int memcg_update_kmem_limit(struct mem_cgroup *memcg, - unsigned long limit) -{ - int ret; - - mutex_lock(&memcg_limit_mutex); - /* Top-level cgroup doesn't propagate from root */ - if (!memcg_kmem_online(memcg)) { - ret = memcg_online_kmem(memcg); - if (ret) - goto out; - } - ret = page_counter_limit(&memcg->kmem, limit); -out: - mutex_unlock(&memcg_limit_mutex); - return ret; -} - static int memcg_propagate_kmem(struct mem_cgroup *memcg) { int ret = 0; @@ -3000,16 +2982,45 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) } } #else +static int memcg_propagate_kmem(struct mem_cgroup *memcg) +{ + return 0; +} +static void memcg_offline_kmem(struct mem_cgroup *memcg) +{ +} +static void memcg_free_kmem(struct mem_cgroup *memcg) +{ +} +#endif /* !CONFIG_SLOB */ + +#ifdef CONFIG_MEMCG_KMEM static int memcg_update_kmem_limit(struct mem_cgroup *memcg, unsigned long limit) { - return -EINVAL; + int ret; + + 
mutex_lock(&memcg_limit_mutex); + /* Top-level cgroup doesn't propagate from root */ + if (!memcg_kmem_online(memcg)) { + ret = memcg_online_kmem(memcg); + if (ret) + goto out; + } + ret = page_counter_limit(&memcg->kmem, limit); +out: + mutex_unlock(&memcg_limit_mutex); + return ret; } -static void memcg_offline_kmem(struct mem_cgroup *memcg) +#else +static int memcg_update_kmem_limit(struct mem_cgroup *memcg, + unsigned long limit) { + return -EINVAL; } #endif /* CONFIG_MEMCG_KMEM */ + /* * The user of this function is... * RES_LIMIT. @@ -4182,7 +4193,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) vmpressure_init(&memcg->vmpressure); INIT_LIST_HEAD(&memcg->event_list); spin_lock_init(&memcg->event_list_lock); -#ifdef CONFIG_MEMCG_KMEM +#ifndef CONFIG_SLOB memcg->kmemcg_id = -1; #endif #ifdef CONFIG_CGROUP_WRITEBACK @@ -4244,10 +4255,11 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) } mutex_unlock(&memcg_create_mutex); -#ifdef CONFIG_MEMCG_KMEM ret = memcg_propagate_kmem(memcg); if (ret) return ret; + +#ifdef CONFIG_MEMCG_KMEM ret = tcp_init_cgroup(memcg); if (ret) return ret; @@ -4308,8 +4320,9 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) static_branch_dec(&memcg_sockets_enabled_key); #endif -#ifdef CONFIG_MEMCG_KMEM memcg_free_kmem(memcg); + +#ifdef CONFIG_MEMCG_KMEM tcp_destroy_cgroup(memcg); #endif diff --git a/mm/slab.h b/mm/slab.h index c63b869..834ad24 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -173,7 +173,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) /* * Iterate over all memcg caches of the given root cache. The caller must hold * slab_mutex. 
@@ -251,7 +251,7 @@ static __always_inline int memcg_charge_slab(struct page *page, extern void slab_init_memcg_params(struct kmem_cache *); -#else /* !CONFIG_MEMCG_KMEM */ +#else /* CONFIG_MEMCG && !CONFIG_SLOB */ #define for_each_memcg_cache(iter, root) \ for ((void)(iter), (void)(root); 0; ) @@ -292,7 +292,7 @@ static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, static inline void slab_init_memcg_params(struct kmem_cache *s) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) { diff --git a/mm/slab_common.c b/mm/slab_common.c index 8c262e6..b50aef0 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -128,7 +128,7 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, return i; } -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) void slab_init_memcg_params(struct kmem_cache *s) { s->memcg_params.is_root_cache = true; @@ -221,7 +221,7 @@ static inline int init_memcg_params(struct kmem_cache *s, static inline void destroy_memcg_params(struct kmem_cache *s) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ /* * Find a mergeable slab cache @@ -477,7 +477,7 @@ static void release_caches(struct list_head *release, bool need_rcu_barrier) } } -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) /* * memcg_create_kmem_cache - Create a cache for a memory cgroup. * @memcg: The memory cgroup the new cache is for. 
@@ -689,7 +689,7 @@ static inline int shutdown_memcg_caches(struct kmem_cache *s, { return 0; } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ void slab_kmem_cache_release(struct kmem_cache *s) { @@ -1123,7 +1123,7 @@ static int slab_show(struct seq_file *m, void *p) return 0; } -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) int memcg_slab_show(struct seq_file *m, void *p) { struct kmem_cache *s = list_entry(p, struct kmem_cache, list); diff --git a/mm/slub.c b/mm/slub.c index b21fd24..2e1355a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5207,7 +5207,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, return -EIO; err = attribute->store(s, buf, len); -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { struct kmem_cache *c; @@ -5242,7 +5242,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, static void memcg_propagate_slab_attrs(struct kmem_cache *s) { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG int i; char *buffer = NULL; struct kmem_cache *root_cache; @@ -5328,7 +5328,7 @@ static struct kset *slab_kset; static inline struct kset *cache_kset(struct kmem_cache *s) { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (!is_root_cache(s)) return s->memcg_params.root_cache->memcg_kset; #endif @@ -5405,7 +5405,7 @@ static int sysfs_slab_add(struct kmem_cache *s) if (err) goto out_del_kobj; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (is_root_cache(s)) { s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj); if (!s->memcg_kset) { @@ -5438,7 +5438,7 @@ void sysfs_slab_remove(struct kmem_cache *s) */ return; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG kset_unregister(s->memcg_kset); #endif kobject_uevent(&s->kobj, KOBJ_REMOVE); -- cgit v0.10.2 From 52c29b04823cb1bab2805336b80866325fe2bc3f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:35 -0800 Subject: mm: memcontrol: account "kmem" consumers in cgroup2 
memory controller The original cgroup memory controller has an extension to account slab memory (and other "kernel memory" consumers) in a separate "kmem" counter, once the user set an explicit limit on that "kmem" pool. However, this includes various consumers whose sizes are directly linked to userspace activity. Accounting them as an optional "kmem" extension is problematic for several reasons: 1. It leaves the main memory interface with incomplete semantics. A user who puts their workload into a cgroup and configures a memory limit does not expect us to leave holes in the containment as big as the dentry and inode cache, or the kernel stack pages. 2. If the limit set on this random historical subgroup of consumers is reached, subsequent allocations will fail even when the main memory pool available to the cgroup is not yet exhausted and/or has reclaimable memory in it. 3. Calling it 'kernel memory' is misleading. The dentry and inode caches are no more 'kernel' (or no less 'user') memory than the page cache itself. Treating these consumers as different classes is a historical implementation detail that should not leak to users. So, in addition to page cache, anonymous memory, and network socket memory, account the following memory consumers per default in the cgroup2 memory controller: - threadinfo - task_struct - task_delay_info - pid - cred - mm_struct - vm_area_struct and vm_region (nommu) - anon_vma and anon_vma_chain - signal_struct - sighand_struct - fs_struct - files_struct - fdtable and fdtable->full_fds_bits - dentry and external_name - inode for all filesystems. This should give us reasonable memory isolation for most common workloads out of the box. 
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Acked-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fe51d5e..9e7a4e5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2381,13 +2381,14 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, if (!memcg_kmem_online(memcg)) return 0; - if (!page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) - return -ENOMEM; - ret = try_charge(memcg, gfp, nr_pages); - if (ret) { - page_counter_uncharge(&memcg->kmem, nr_pages); + if (ret) return ret; + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && + !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { + cancel_charge(memcg, nr_pages); + return -ENOMEM; } page->mem_cgroup = memcg; @@ -2416,7 +2417,9 @@ void __memcg_kmem_uncharge(struct page *page, int order) VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); - page_counter_uncharge(&memcg->kmem, nr_pages); + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + page_counter_uncharge(&memcg->kmem, nr_pages); + page_counter_uncharge(&memcg->memory, nr_pages); if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); @@ -2922,7 +2925,8 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) * onlined after this point, because it has at least one child * already. */ - if (memcg_kmem_online(parent)) + if (cgroup_subsys_on_dfl(memory_cgrp_subsys) || + memcg_kmem_online(parent)) ret = memcg_online_kmem(memcg); mutex_unlock(&memcg_limit_mutex); return ret; -- cgit v0.10.2 From 04823c833b3eaef7816e28e3727124394f6bb3c3 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 20 Jan 2016 15:02:38 -0800 Subject: mm: memcontrol: allow to disable kmem accounting for cgroup2 Kmem accounting might incur overhead that some users can't put up with. Besides, the implementation is still considered unstable. So let's provide a way to disable it for those users who aren't happy with it. 
To disable kmem accounting for cgroup2, pass cgroup.memory=nokmem at boot time. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 3ea869d..cfb2c0f 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -611,6 +611,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. cgroup.memory= [KNL] Pass options to the cgroup memory controller. Format: nosocket -- Disable socket memory accounting. + nokmem -- Disable kernel memory accounting. checkreqprot [SELINUX] Set initial checkreqprot flag value. Format: { "0" | "1" } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9e7a4e5..2239e6d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -83,6 +83,9 @@ struct mem_cgroup *root_mem_cgroup __read_mostly; /* Socket memory accounting disabled? */ static bool cgroup_memory_nosocket; +/* Kernel memory accounting disabled? */ +static bool cgroup_memory_nokmem; + /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP int do_swap_account __read_mostly; @@ -2925,8 +2928,8 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) * onlined after this point, because it has at least one child * already. 
*/ - if (cgroup_subsys_on_dfl(memory_cgrp_subsys) || - memcg_kmem_online(parent)) + if (memcg_kmem_online(parent) || + (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nokmem)) ret = memcg_online_kmem(memcg); mutex_unlock(&memcg_limit_mutex); return ret; @@ -5638,6 +5641,8 @@ static int __init cgroup_memory(char *s) continue; if (!strcmp(token, "nosocket")) cgroup_memory_nosocket = true; + if (!strcmp(token, "nokmem")) + cgroup_memory_nokmem = true; } return 0; } -- cgit v0.10.2 From 489c2a20a414351fe0813a727c34600c0f7292ae Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:41 -0800 Subject: mm: memcontrol: introduce CONFIG_MEMCG_LEGACY_KMEM Let the user know that CONFIG_MEMCG_KMEM does not apply to the cgroup2 interface. This also makes legacy-only code sections stand out better. [arnd@arndb.de: mm: memcontrol: only manage socket pressure for CONFIG_INET] Signed-off-by: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Acked-by: Vladimir Davydov Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a87704e..2bb14d02 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -233,7 +233,7 @@ struct mem_cgroup { */ struct mem_cgroup_stat_cpu __percpu *stat; -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) +#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) struct cg_proto tcp_mem; #endif #ifndef CONFIG_SLOB @@ -717,7 +717,7 @@ extern struct static_key_false memcg_sockets_enabled_key; #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key) static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG_LEGACY_KMEM if (memcg->tcp_mem.memory_pressure) return true; #endif diff --git a/init/Kconfig b/init/Kconfig index 5b86082..a0a15ce 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -964,10 +964,13 @@ 
config MEMCG_SWAP_ENABLED For those who want to have the feature enabled by default should select this option (if, for some reason, they need to disable it then swapaccount=0 does the trick). +config MEMCG_LEGACY_KMEM + bool config MEMCG_KMEM - bool "Memory Resource Controller Kernel Memory accounting" + bool "Legacy Memory Resource Controller Kernel Memory accounting" depends on MEMCG depends on SLUB || SLAB + select MEMCG_LEGACY_KMEM help The Kernel Memory extension for Memory Resource Controller can limit the amount of memory used by kernel objects in the system. Those are @@ -1071,6 +1074,11 @@ config CGROUP_FREEZER Provides a way to freeze and unfreeze all tasks in a cgroup. + This option affects the ORIGINAL cgroup interface. The cgroup2 memory + controller includes important in-kernel memory consumers per default. + + If you're using cgroup2, say N. + config CGROUP_HUGETLB bool "HugeTLB controller" depends on HUGETLB_PAGE diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2239e6d..92e8ab6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3001,7 +3001,7 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) } #endif /* !CONFIG_SLOB */ -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG_LEGACY_KMEM static int memcg_update_kmem_limit(struct mem_cgroup *memcg, unsigned long limit) { @@ -3025,7 +3025,7 @@ static int memcg_update_kmem_limit(struct mem_cgroup *memcg, { return -EINVAL; } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG_LEGACY_KMEM */ /* @@ -4039,7 +4039,7 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = memcg_numa_stat_show, }, #endif -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG_LEGACY_KMEM { .name = "kmem.limit_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), @@ -4266,13 +4266,13 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) if (ret) return ret; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_INET +#ifdef CONFIG_MEMCG_LEGACY_KMEM ret = tcp_init_cgroup(memcg); if (ret) return ret; #endif -#ifdef CONFIG_INET 
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_inc(&memcg_sockets_enabled_key); #endif @@ -4329,7 +4329,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) memcg_free_kmem(memcg); -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) tcp_destroy_cgroup(memcg); #endif @@ -5558,7 +5558,7 @@ void sock_update_memcg(struct sock *sk) memcg = mem_cgroup_from_task(current); if (memcg == root_mem_cgroup) goto out; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG_LEGACY_KMEM if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcp_mem.active) goto out; #endif @@ -5587,7 +5587,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) { gfp_t gfp_mask = GFP_KERNEL; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG_LEGACY_KMEM if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { struct page_counter *counter; @@ -5619,7 +5619,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) */ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG_LEGACY_KMEM if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { page_counter_uncharge(&memcg->tcp_mem.memory_allocated, nr_pages); diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 9a6c070..89b1d44 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -275,6 +275,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, level = vmpressure_calc_level(scanned, reclaimed); +#ifdef CONFIG_INET if (level > VMPRESSURE_LOW) { /* * Let the socket buffer allocator know that @@ -286,6 +287,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, */ memcg->socket_pressure = jiffies + HZ; } +#endif } } diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index c29809f..bee5055 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -56,7 +56,7 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_TCP_CONG_LP) += 
tcp_lp.o obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o -obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o +obj-$(CONFIG_MEMCG_LEGACY_KMEM) += tcp_memcontrol.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ -- cgit v0.10.2 From d55f90bfab40e3b5db323711d28186ff09461692 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 20 Jan 2016 15:02:44 -0800 Subject: net: drop tcp_memcontrol.c tcp_memcontrol.c only contains legacy memory.kmem.tcp.* file definitions and mem_cgroup->tcp_mem init/destroy stuff. This doesn't belong to network subsys. Let's move it to memcontrol.c. This also allows us to reuse generic code for handling legacy memcg files. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: "David S. Miller" Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/net/tcp_memcontrol.h b/include/net/tcp_memcontrol.h deleted file mode 100644 index 020c2de..0000000 --- a/include/net/tcp_memcontrol.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef _TCP_MEMCG_H -#define _TCP_MEMCG_H - -struct cgroup_subsys; -struct mem_cgroup; - -int tcp_init_cgroup(struct mem_cgroup *memcg); -void tcp_destroy_cgroup(struct mem_cgroup *memcg); - -#endif /* _TCP_MEMCG_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 92e8ab6..1589670 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -66,7 +66,6 @@ #include "internal.h" #include #include -#include #include "slab.h" #include @@ -242,6 +241,7 @@ enum res_type { _MEMSWAP, _OOM_TYPE, _KMEM, + _TCP, }; #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) @@ -2842,6 +2842,11 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, case _KMEM: counter = &memcg->kmem; break; +#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) + case _TCP: + counter = &memcg->tcp_mem.memory_allocated; + break; +#endif default: BUG(); } @@ -3028,6 +3033,48 @@ static int
memcg_update_kmem_limit(struct mem_cgroup *memcg, #endif /* CONFIG_MEMCG_LEGACY_KMEM */ +#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) +static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) +{ + int ret; + + mutex_lock(&memcg_limit_mutex); + + ret = page_counter_limit(&memcg->tcp_mem.memory_allocated, limit); + if (ret) + goto out; + + if (!memcg->tcp_mem.active) { + /* + * The active flag needs to be written after the static_key + * update. This is what guarantees that the socket activation + * function is the last one to run. See sock_update_memcg() for + * details, and note that we don't mark any socket as belonging + * to this memcg until that flag is up. + * + * We need to do this, because static_keys will span multiple + * sites, but we can't control their order. If we mark a socket + * as accounted, but the accounting functions are not patched in + * yet, we'll lose accounting. + * + * We never race with the readers in sock_update_memcg(), + * because when this value change, the code to process it is not + * patched in yet. + */ + static_branch_inc(&memcg_sockets_enabled_key); + memcg->tcp_mem.active = true; + } +out: + mutex_unlock(&memcg_limit_mutex); + return ret; +} +#else +static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) +{ + return -EINVAL; +} +#endif /* CONFIG_MEMCG_LEGACY_KMEM && CONFIG_INET */ + /* * The user of this function is... * RES_LIMIT. 
@@ -3060,6 +3107,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, case _KMEM: ret = memcg_update_kmem_limit(memcg, nr_pages); break; + case _TCP: + ret = memcg_update_tcp_limit(memcg, nr_pages); + break; } break; case RES_SOFT_LIMIT: @@ -3086,6 +3136,11 @@ static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, case _KMEM: counter = &memcg->kmem; break; +#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) + case _TCP: + counter = &memcg->tcp_mem.memory_allocated; + break; +#endif default: BUG(); } @@ -4072,6 +4127,31 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = memcg_slab_show, }, #endif +#ifdef CONFIG_INET + { + .name = "kmem.tcp.limit_in_bytes", + .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), + .write = mem_cgroup_write, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "kmem.tcp.usage_in_bytes", + .private = MEMFILE_PRIVATE(_TCP, RES_USAGE), + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "kmem.tcp.failcnt", + .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT), + .write = mem_cgroup_reset, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "kmem.tcp.max_usage_in_bytes", + .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE), + .write = mem_cgroup_reset, + .read_u64 = mem_cgroup_read_u64, + }, +#endif #endif { }, /* terminate */ }; @@ -4241,6 +4321,10 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, &parent->memsw); page_counter_init(&memcg->kmem, &parent->kmem); +#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) + page_counter_init(&memcg->tcp_mem.memory_allocated, + &parent->tcp_mem.memory_allocated); +#endif /* * No need to take a reference to the parent because cgroup @@ -4252,6 +4336,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); +#if defined(CONFIG_MEMCG_LEGACY_KMEM) && 
defined(CONFIG_INET) + page_counter_init(&memcg->tcp_mem.memory_allocated, NULL); +#endif /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@ -4267,12 +4354,6 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) return ret; #ifdef CONFIG_INET -#ifdef CONFIG_MEMCG_LEGACY_KMEM - ret = tcp_init_cgroup(memcg); - if (ret) - return ret; -#endif - if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_inc(&memcg_sockets_enabled_key); #endif @@ -4330,7 +4411,8 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) memcg_free_kmem(memcg); #if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) - tcp_destroy_cgroup(memcg); + if (memcg->tcp_mem.active) + static_branch_dec(&memcg_sockets_enabled_key); #endif __mem_cgroup_free(memcg); diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index bee5055..62c049b 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -56,7 +56,6 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o -obj-$(CONFIG_MEMCG_LEGACY_KMEM) += tcp_memcontrol.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 46ce410..4d367b4 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -24,7 +24,6 @@ #include #include #include -#include static int zero; static int one = 1; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c7d1fb5..5ced3e4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -73,7 +73,6 @@ #include #include #include -#include #include #include diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c deleted file mode 100644 index 133eb5e..0000000 --- a/net/ipv4/tcp_memcontrol.c +++ /dev/null @@ -1,200 +0,0 @@ -#include 
-#include -#include -#include -#include -#include -#include - -int tcp_init_cgroup(struct mem_cgroup *memcg) -{ - struct mem_cgroup *parent = parent_mem_cgroup(memcg); - struct page_counter *counter_parent = NULL; - /* - * The root cgroup does not use page_counters, but rather, - * rely on the data already collected by the network - * subsystem - */ - if (memcg == root_mem_cgroup) - return 0; - - memcg->tcp_mem.memory_pressure = 0; - - if (parent) - counter_parent = &parent->tcp_mem.memory_allocated; - - page_counter_init(&memcg->tcp_mem.memory_allocated, counter_parent); - - return 0; -} - -void tcp_destroy_cgroup(struct mem_cgroup *memcg) -{ - if (memcg == root_mem_cgroup) - return; - - if (memcg->tcp_mem.active) - static_branch_dec(&memcg_sockets_enabled_key); -} - -static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) -{ - int ret; - - if (memcg == root_mem_cgroup) - return -EINVAL; - - ret = page_counter_limit(&memcg->tcp_mem.memory_allocated, nr_pages); - if (ret) - return ret; - - if (!memcg->tcp_mem.active) { - /* - * The active flag needs to be written after the static_key - * update. This is what guarantees that the socket activation - * function is the last one to run. See sock_update_memcg() for - * details, and note that we don't mark any socket as belonging - * to this memcg until that flag is up. - * - * We need to do this, because static_keys will span multiple - * sites, but we can't control their order. If we mark a socket - * as accounted, but the accounting functions are not patched in - * yet, we'll lose accounting. - * - * We never race with the readers in sock_update_memcg(), - * because when this value change, the code to process it is not - * patched in yet. 
- */ - static_branch_inc(&memcg_sockets_enabled_key); - memcg->tcp_mem.active = true; - } - - return 0; -} - -enum { - RES_USAGE, - RES_LIMIT, - RES_MAX_USAGE, - RES_FAILCNT, -}; - -static DEFINE_MUTEX(tcp_limit_mutex); - -static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - unsigned long nr_pages; - int ret = 0; - - buf = strstrip(buf); - - switch (of_cft(of)->private) { - case RES_LIMIT: - /* see memcontrol.c */ - ret = page_counter_memparse(buf, "-1", &nr_pages); - if (ret) - break; - mutex_lock(&tcp_limit_mutex); - ret = tcp_update_limit(memcg, nr_pages); - mutex_unlock(&tcp_limit_mutex); - break; - default: - ret = -EINVAL; - break; - } - return ret ?: nbytes; -} - -static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) -{ - struct mem_cgroup *memcg = mem_cgroup_from_css(css); - u64 val; - - switch (cft->private) { - case RES_LIMIT: - if (memcg == root_mem_cgroup) - val = PAGE_COUNTER_MAX; - else - val = memcg->tcp_mem.memory_allocated.limit; - val *= PAGE_SIZE; - break; - case RES_USAGE: - if (memcg == root_mem_cgroup) - val = atomic_long_read(&tcp_memory_allocated); - else - val = page_counter_read(&memcg->tcp_mem.memory_allocated); - val *= PAGE_SIZE; - break; - case RES_FAILCNT: - if (memcg == root_mem_cgroup) - return 0; - val = memcg->tcp_mem.memory_allocated.failcnt; - break; - case RES_MAX_USAGE: - if (memcg == root_mem_cgroup) - return 0; - val = memcg->tcp_mem.memory_allocated.watermark; - val *= PAGE_SIZE; - break; - default: - BUG(); - } - return val; -} - -static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct mem_cgroup *memcg; - - memcg = mem_cgroup_from_css(of_css(of)); - if (memcg == root_mem_cgroup) - return nbytes; - - switch (of_cft(of)->private) { - case RES_MAX_USAGE: - page_counter_reset_watermark(&memcg->tcp_mem.memory_allocated); - break; - 
case RES_FAILCNT: - memcg->tcp_mem.memory_allocated.failcnt = 0; - break; - } - - return nbytes; -} - -static struct cftype tcp_files[] = { - { - .name = "kmem.tcp.limit_in_bytes", - .write = tcp_cgroup_write, - .read_u64 = tcp_cgroup_read, - .private = RES_LIMIT, - }, - { - .name = "kmem.tcp.usage_in_bytes", - .read_u64 = tcp_cgroup_read, - .private = RES_USAGE, - }, - { - .name = "kmem.tcp.failcnt", - .private = RES_FAILCNT, - .write = tcp_cgroup_reset, - .read_u64 = tcp_cgroup_read, - }, - { - .name = "kmem.tcp.max_usage_in_bytes", - .private = RES_MAX_USAGE, - .write = tcp_cgroup_reset, - .read_u64 = tcp_cgroup_read, - }, - { } /* terminate */ -}; - -static int __init tcp_memcontrol_init(void) -{ - WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, tcp_files)); - return 0; -} -__initcall(tcp_memcontrol_init); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4ad8edb..006396e 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -61,7 +61,6 @@ #include #include #include -#include #include #include -- cgit v0.10.2 From d886f4e483ce63a3304adc9eda87031b93341c28 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:47 -0800 Subject: mm: memcontrol: rein in the CONFIG space madness What CONFIG_INET and CONFIG_MEMCG_LEGACY_KMEM guard inside the memory controller code is insignificant; having these conditionals is not worth the complication and fragility that comes with them.
[akpm@linux-foundation.org: rework mem_cgroup_css_free() statement ordering] Signed-off-by: Johannes Weiner Cc: Michal Hocko Acked-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2bb14d02..47995b4 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -233,9 +233,11 @@ struct mem_cgroup { */ struct mem_cgroup_stat_cpu __percpu *stat; -#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) + unsigned long socket_pressure; + + /* Legacy tcp memory accounting */ struct cg_proto tcp_mem; -#endif + #ifndef CONFIG_SLOB /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; @@ -254,10 +256,6 @@ struct mem_cgroup { struct wb_domain cgwb_domain; #endif -#ifdef CONFIG_INET - unsigned long socket_pressure; -#endif - /* List of events which userspace want to receive */ struct list_head event_list; spinlock_t event_list_lock; @@ -712,15 +710,13 @@ void sock_update_memcg(struct sock *sk); void sock_release_memcg(struct sock *sk); bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); -#if defined(CONFIG_MEMCG) && defined(CONFIG_INET) +#ifdef CONFIG_MEMCG extern struct static_key_false memcg_sockets_enabled_key; #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key) static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { -#ifdef CONFIG_MEMCG_LEGACY_KMEM if (memcg->tcp_mem.memory_pressure) return true; -#endif do { if (time_before(jiffies, memcg->socket_pressure)) return true; diff --git a/init/Kconfig b/init/Kconfig index a0a15ce..2232080 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -964,20 +964,6 @@ config MEMCG_SWAP_ENABLED For those who want to have the feature enabled by default should select this option (if, for some reason, they need to disable it then 
swapaccount=0 does the trick). -config MEMCG_LEGACY_KMEM - bool -config MEMCG_KMEM - bool "Legacy Memory Resource Controller Kernel Memory accounting" - depends on MEMCG - depends on SLUB || SLAB - select MEMCG_LEGACY_KMEM - help - The Kernel Memory extension for Memory Resource Controller can limit - the amount of memory used by kernel objects in the system. Those are - fundamentally different from the entities handled by the standard - Memory Controller, which are page-based, and can be swapped. Users of - the kmem extension can use it to guarantee that no group of processes - will ever exhaust kernel resources alone. config BLK_CGROUP bool "IO controller" @@ -1190,10 +1176,9 @@ config USER_NS to provide different user info for different servers. When user namespaces are enabled in the kernel it is - recommended that the MEMCG and MEMCG_KMEM options also be - enabled and that user-space use the memory control groups to - limit the amount of memory a memory unprivileged users can - use. + recommended that the MEMCG option also be enabled and that + user-space use the memory control groups to limit the amount + of memory a memory unprivileged users can use. If unsure, say N. 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1589670..379f991 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2842,11 +2842,9 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, case _KMEM: counter = &memcg->kmem; break; -#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) case _TCP: counter = &memcg->tcp_mem.memory_allocated; break; -#endif default: BUG(); } @@ -3006,7 +3004,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) } #endif /* !CONFIG_SLOB */ -#ifdef CONFIG_MEMCG_LEGACY_KMEM static int memcg_update_kmem_limit(struct mem_cgroup *memcg, unsigned long limit) { @@ -3024,16 +3021,7 @@ out: mutex_unlock(&memcg_limit_mutex); return ret; } -#else -static int memcg_update_kmem_limit(struct mem_cgroup *memcg, - unsigned long limit) -{ - return -EINVAL; -} -#endif /* CONFIG_MEMCG_LEGACY_KMEM */ - -#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) { int ret; @@ -3068,12 +3056,6 @@ out: mutex_unlock(&memcg_limit_mutex); return ret; } -#else -static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) -{ - return -EINVAL; -} -#endif /* CONFIG_MEMCG_LEGACY_KMEM && CONFIG_INET */ /* * The user of this function is... 
@@ -3136,11 +3118,9 @@ static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, case _KMEM: counter = &memcg->kmem; break; -#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) case _TCP: counter = &memcg->tcp_mem.memory_allocated; break; -#endif default: BUG(); } @@ -4094,7 +4074,6 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = memcg_numa_stat_show, }, #endif -#ifdef CONFIG_MEMCG_LEGACY_KMEM { .name = "kmem.limit_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), @@ -4127,7 +4106,6 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = memcg_slab_show, }, #endif -#ifdef CONFIG_INET { .name = "kmem.tcp.limit_in_bytes", .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), @@ -4151,8 +4129,6 @@ static struct cftype mem_cgroup_legacy_files[] = { .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, -#endif -#endif { }, /* terminate */ }; @@ -4280,15 +4256,13 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) vmpressure_init(&memcg->vmpressure); INIT_LIST_HEAD(&memcg->event_list); spin_lock_init(&memcg->event_list_lock); + memcg->socket_pressure = jiffies; #ifndef CONFIG_SLOB memcg->kmemcg_id = -1; #endif #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); #endif -#ifdef CONFIG_INET - memcg->socket_pressure = jiffies; -#endif return &memcg->css; free_out: @@ -4321,10 +4295,8 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, &parent->memsw); page_counter_init(&memcg->kmem, &parent->kmem); -#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) page_counter_init(&memcg->tcp_mem.memory_allocated, &parent->tcp_mem.memory_allocated); -#endif /* * No need to take a reference to the parent because cgroup @@ -4336,9 +4308,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); -#if 
defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) page_counter_init(&memcg->tcp_mem.memory_allocated, NULL); -#endif /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@ -4353,10 +4323,8 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) if (ret) return ret; -#ifdef CONFIG_INET if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_inc(&memcg_sockets_enabled_key); -#endif /* * Make sure the memcg is initialized: mem_cgroup_iter() @@ -4403,18 +4371,13 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); -#ifdef CONFIG_INET if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_dec(&memcg_sockets_enabled_key); -#endif - - memcg_free_kmem(memcg); -#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET) if (memcg->tcp_mem.active) static_branch_dec(&memcg_sockets_enabled_key); -#endif + memcg_free_kmem(memcg); __mem_cgroup_free(memcg); } @@ -5613,8 +5576,6 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) commit_charge(newpage, memcg, true); } -#ifdef CONFIG_INET - DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); EXPORT_SYMBOL(memcg_sockets_enabled_key); @@ -5640,10 +5601,8 @@ void sock_update_memcg(struct sock *sk) memcg = mem_cgroup_from_task(current); if (memcg == root_mem_cgroup) goto out; -#ifdef CONFIG_MEMCG_LEGACY_KMEM if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcp_mem.active) goto out; -#endif if (css_tryget_online(&memcg->css)) sk->sk_memcg = memcg; out: @@ -5669,7 +5628,6 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) { gfp_t gfp_mask = GFP_KERNEL; -#ifdef CONFIG_MEMCG_LEGACY_KMEM if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { struct page_counter *counter; @@ -5682,7 +5640,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) 
memcg->tcp_mem.memory_pressure = 1; return false; } -#endif + /* Don't block in the packet receive path */ if (in_softirq()) gfp_mask = GFP_NOWAIT; @@ -5701,19 +5659,16 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) */ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) { -#ifdef CONFIG_MEMCG_LEGACY_KMEM if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { page_counter_uncharge(&memcg->tcp_mem.memory_allocated, nr_pages); return; } -#endif + page_counter_uncharge(&memcg->memory, nr_pages); css_put_many(&memcg->css, nr_pages); } -#endif /* CONFIG_INET */ - static int __init cgroup_memory(char *s) { char *token; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 89b1d44..9a6c070 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -275,7 +275,6 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, level = vmpressure_calc_level(scanned, reclaimed); -#ifdef CONFIG_INET if (level > VMPRESSURE_LOW) { /* * Let the socket buffer allocator know that @@ -287,7 +286,6 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, */ memcg->socket_pressure = jiffies + HZ; } -#endif } } -- cgit v0.10.2 From 0db1529817b7b16226421f01470c5ba982c5f302 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:50 -0800 Subject: mm: memcontrol: flatten struct cg_proto There are no more external users of struct cg_proto, flatten the structure into struct mem_cgroup. Since using those struct members doesn't stand out as much anymore, add cgroup2 static branches to make it clearer which code is legacy. 
Suggested-by: Vladimir Davydov Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 47995b4..a3869bf 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -85,12 +85,6 @@ enum mem_cgroup_events_target { MEM_CGROUP_NTARGETS, }; -struct cg_proto { - struct page_counter memory_allocated; /* Current allocated memory. */ - int memory_pressure; - bool active; -}; - #ifdef CONFIG_MEMCG struct mem_cgroup_stat_cpu { long count[MEM_CGROUP_STAT_NSTATS]; @@ -169,8 +163,11 @@ struct mem_cgroup { /* Accounted resources */ struct page_counter memory; + + /* Legacy consumer-oriented counters */ struct page_counter memsw; struct page_counter kmem; + struct page_counter tcpmem; /* Normal memory consumption range */ unsigned long low; @@ -236,7 +233,8 @@ struct mem_cgroup { unsigned long socket_pressure; /* Legacy tcp memory accounting */ - struct cg_proto tcp_mem; + bool tcpmem_active; + int tcpmem_pressure; #ifndef CONFIG_SLOB /* Index in the kmem_cache->memcg_params.memcg_caches array */ @@ -715,7 +713,7 @@ extern struct static_key_false memcg_sockets_enabled_key; #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key) static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { - if (memcg->tcp_mem.memory_pressure) + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure) return true; do { if (time_before(jiffies, memcg->socket_pressure)) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 379f991..6937f16 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2843,7 +2843,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, counter = &memcg->kmem; break; case _TCP: - counter = &memcg->tcp_mem.memory_allocated; + counter = &memcg->tcpmem; break; default: BUG(); @@ -3028,11 +3028,11 @@ static int memcg_update_tcp_limit(struct mem_cgroup 
*memcg, unsigned long limit) mutex_lock(&memcg_limit_mutex); - ret = page_counter_limit(&memcg->tcp_mem.memory_allocated, limit); + ret = page_counter_limit(&memcg->tcpmem, limit); if (ret) goto out; - if (!memcg->tcp_mem.active) { + if (!memcg->tcpmem_active) { /* * The active flag needs to be written after the static_key * update. This is what guarantees that the socket activation @@ -3050,7 +3050,7 @@ static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) * patched in yet. */ static_branch_inc(&memcg_sockets_enabled_key); - memcg->tcp_mem.active = true; + memcg->tcpmem_active = true; } out: mutex_unlock(&memcg_limit_mutex); @@ -3119,7 +3119,7 @@ static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, counter = &memcg->kmem; break; case _TCP: - counter = &memcg->tcp_mem.memory_allocated; + counter = &memcg->tcpmem; break; default: BUG(); @@ -4295,8 +4295,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, &parent->memsw); page_counter_init(&memcg->kmem, &parent->kmem); - page_counter_init(&memcg->tcp_mem.memory_allocated, - &parent->tcp_mem.memory_allocated); + page_counter_init(&memcg->tcpmem, &parent->tcpmem); /* * No need to take a reference to the parent because cgroup @@ -4308,7 +4307,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); - page_counter_init(&memcg->tcp_mem.memory_allocated, NULL); + page_counter_init(&memcg->tcpmem, NULL); /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@ -4374,7 +4373,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_dec(&memcg_sockets_enabled_key); - if (memcg->tcp_mem.active) + if 
(!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active) static_branch_dec(&memcg_sockets_enabled_key); memcg_free_kmem(memcg); @@ -5601,7 +5600,7 @@ void sock_update_memcg(struct sock *sk) memcg = mem_cgroup_from_task(current); if (memcg == root_mem_cgroup) goto out; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcp_mem.active) + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active) goto out; if (css_tryget_online(&memcg->css)) sk->sk_memcg = memcg; @@ -5629,15 +5628,14 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) gfp_t gfp_mask = GFP_KERNEL; if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { - struct page_counter *counter; + struct page_counter *fail; - if (page_counter_try_charge(&memcg->tcp_mem.memory_allocated, - nr_pages, &counter)) { - memcg->tcp_mem.memory_pressure = 0; + if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) { + memcg->tcpmem_pressure = 0; return true; } - page_counter_charge(&memcg->tcp_mem.memory_allocated, nr_pages); - memcg->tcp_mem.memory_pressure = 1; + page_counter_charge(&memcg->tcpmem, nr_pages); + memcg->tcpmem_pressure = 1; return false; } @@ -5660,8 +5658,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) { if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { - page_counter_uncharge(&memcg->tcp_mem.memory_allocated, - nr_pages); + page_counter_uncharge(&memcg->tcpmem, nr_pages); return; } -- cgit v0.10.2 From 0b8f73e104285a4badf9d768d1c39b06d77d1f97 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:02:53 -0800 Subject: mm: memcontrol: clean up alloc, online, offline, free functions The creation and teardown of struct mem_cgroup is fairly messy and that has attracted mistakes and subtle bugs before. The main cause for this is that there is no clear model about what needs to happen when, and that attracts more chaos. 
So create one: 1. mem_cgroup_alloc() should allocate struct mem_cgroup and its auxiliary members and initialize work items, locks etc. so that the object it returns is fully initialized and in a neutral state. 2. mem_cgroup_css_alloc() will use mem_cgroup_alloc() to obtain a new memcg object and configure it and the system according to the role of the new memory-controlled cgroup in the hierarchy. 3. mem_cgroup_css_online() is no longer needed to synchronize with iterators, but it verifies css->id which isn't available earlier. 4. mem_cgroup_css_offline() implements stuff that needs to happen upon the user-visible destruction of a cgroup, which includes stopping all user interfacing as well as releasing certain structures when continued memory consumption would be unexpected at that point. 5. mem_cgroup_css_free() prepares the system and the memcg object for the object's disappearance, neutralizes its state, and then gives it back to mem_cgroup_free(). 6. mem_cgroup_free() releases struct mem_cgroup and auxiliary memory. [arnd@arndb.de: fix SLOB build regression] Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Michal Hocko Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a3869bf..27123e5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -181,9 +181,6 @@ struct mem_cgroup { /* vmpressure notifications */ struct vmpressure vmpressure; - /* css_online() has been completed */ - int initialized; - /* * Should the accounting and control be hierarchical, per subtree? */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6937f16..f6bc78f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -250,13 +250,6 @@ enum res_type { /* Used for OOM nofiier */ #define OOM_CONTROL (0) -/* - * The memcg_create_mutex will be held whenever a new cgroup is created. 
- * As a consequence, any change that needs to protect against new child cgroups - * appearing has to hold it as well. - */ -static DEFINE_MUTEX(memcg_create_mutex); - /* Some nice accessors for the vmpressure. */ struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) { @@ -899,17 +892,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (css == &root->css) break; - if (css_tryget(css)) { - /* - * Make sure the memcg is initialized: - * mem_cgroup_css_online() orders the the - * initialization against setting the flag. - */ - if (smp_load_acquire(&memcg->initialized)) - break; - - css_put(css); - } + if (css_tryget(css)) + break; memcg = NULL; } @@ -2690,14 +2674,6 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg) { bool ret; - /* - * The lock does not prevent addition or deletion of children, but - * it prevents a new child from being initialized based on this - * parent in css_online(), so it's enough to decide whether - * hierarchically inherited attributes can still be changed or not. 
- */ - lockdep_assert_held(&memcg_create_mutex); - rcu_read_lock(); ret = css_next_child(NULL, &memcg->css); rcu_read_unlock(); @@ -2760,10 +2736,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); - mutex_lock(&memcg_create_mutex); - if (memcg->use_hierarchy == val) - goto out; + return 0; /* * If parent's use_hierarchy is set, we can't make any modifications @@ -2782,9 +2756,6 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, } else retval = -EINVAL; -out: - mutex_unlock(&memcg_create_mutex); - return retval; } @@ -2872,37 +2843,14 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, #ifndef CONFIG_SLOB static int memcg_online_kmem(struct mem_cgroup *memcg) { - int err = 0; int memcg_id; BUG_ON(memcg->kmemcg_id >= 0); BUG_ON(memcg->kmem_state); - /* - * For simplicity, we won't allow this to be disabled. It also can't - * be changed if the cgroup has children already, or if tasks had - * already joined. - * - * If tasks join before we set the limit, a person looking at - * kmem.usage_in_bytes will have no way to determine when it took - * place, which makes the value quite meaningless. - * - * After it first became limited, changes in the value of the limit are - * of course permitted. 
- */ - mutex_lock(&memcg_create_mutex); - if (cgroup_is_populated(memcg->css.cgroup) || - (memcg->use_hierarchy && memcg_has_children(memcg))) - err = -EBUSY; - mutex_unlock(&memcg_create_mutex); - if (err) - goto out; - memcg_id = memcg_alloc_cache_id(); - if (memcg_id < 0) { - err = memcg_id; - goto out; - } + if (memcg_id < 0) + return memcg_id; static_branch_inc(&memcg_kmem_enabled_key); /* @@ -2913,17 +2861,14 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) */ memcg->kmemcg_id = memcg_id; memcg->kmem_state = KMEM_ONLINE; -out: - return err; + + return 0; } -static int memcg_propagate_kmem(struct mem_cgroup *memcg) +static int memcg_propagate_kmem(struct mem_cgroup *parent, + struct mem_cgroup *memcg) { int ret = 0; - struct mem_cgroup *parent = parent_mem_cgroup(memcg); - - if (!parent) - return 0; mutex_lock(&memcg_limit_mutex); /* @@ -2985,6 +2930,10 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) static void memcg_free_kmem(struct mem_cgroup *memcg) { + /* css_alloc() failed, offlining didn't happen */ + if (unlikely(memcg->kmem_state == KMEM_ONLINE)) + memcg_offline_kmem(memcg); + if (memcg->kmem_state == KMEM_ALLOCATED) { memcg_destroy_kmem_caches(memcg); static_branch_dec(&memcg_kmem_enabled_key); @@ -2992,7 +2941,11 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) } } #else -static int memcg_propagate_kmem(struct mem_cgroup *memcg) +static int memcg_propagate_kmem(struct mem_cgroup *parent, struct mem_cgroup *memcg) +{ + return 0; +} +static int memcg_online_kmem(struct mem_cgroup *memcg) { return 0; } @@ -3007,11 +2960,16 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) static int memcg_update_kmem_limit(struct mem_cgroup *memcg, unsigned long limit) { - int ret; + int ret = 0; mutex_lock(&memcg_limit_mutex); /* Top-level cgroup doesn't propagate from root */ if (!memcg_kmem_online(memcg)) { + if (cgroup_is_populated(memcg->css.cgroup) || + (memcg->use_hierarchy && memcg_has_children(memcg))) + ret = -EBUSY; + if 
(ret) + goto out; ret = memcg_online_kmem(memcg); if (ret) goto out; @@ -4167,90 +4125,44 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) kfree(memcg->nodeinfo[node]); } -static struct mem_cgroup *mem_cgroup_alloc(void) -{ - struct mem_cgroup *memcg; - size_t size; - - size = sizeof(struct mem_cgroup); - size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); - - memcg = kzalloc(size, GFP_KERNEL); - if (!memcg) - return NULL; - - memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); - if (!memcg->stat) - goto out_free; - - if (memcg_wb_domain_init(memcg, GFP_KERNEL)) - goto out_free_stat; - - return memcg; - -out_free_stat: - free_percpu(memcg->stat); -out_free: - kfree(memcg); - return NULL; -} - -/* - * At destroying mem_cgroup, references from swap_cgroup can remain. - * (scanning all at force_empty is too costly...) - * - * Instead of clearing all references at force_empty, we remember - * the number of reference from swap_cgroup and free mem_cgroup when - * it goes down to 0. - * - * Removal of cgroup itself succeeds regardless of refs from swap. 
- */ - -static void __mem_cgroup_free(struct mem_cgroup *memcg) +static void mem_cgroup_free(struct mem_cgroup *memcg) { int node; - cancel_work_sync(&memcg->high_work); - - mem_cgroup_remove_from_trees(memcg); - + memcg_wb_domain_exit(memcg); for_each_node(node) free_mem_cgroup_per_zone_info(memcg, node); - free_percpu(memcg->stat); - memcg_wb_domain_exit(memcg); kfree(memcg); } -static struct cgroup_subsys_state * __ref -mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) +static struct mem_cgroup *mem_cgroup_alloc(void) { struct mem_cgroup *memcg; - long error = -ENOMEM; + size_t size; int node; - memcg = mem_cgroup_alloc(); + size = sizeof(struct mem_cgroup); + size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); + + memcg = kzalloc(size, GFP_KERNEL); if (!memcg) - return ERR_PTR(error); + return NULL; + + memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); + if (!memcg->stat) + goto fail; for_each_node(node) if (alloc_mem_cgroup_per_zone_info(memcg, node)) - goto free_out; + goto fail; - /* root ? 
*/ - if (parent_css == NULL) { - root_mem_cgroup = memcg; - page_counter_init(&memcg->memory, NULL); - memcg->high = PAGE_COUNTER_MAX; - memcg->soft_limit = PAGE_COUNTER_MAX; - page_counter_init(&memcg->memsw, NULL); - page_counter_init(&memcg->kmem, NULL); - } + if (memcg_wb_domain_init(memcg, GFP_KERNEL)) + goto fail; INIT_WORK(&memcg->high_work, high_work_func); memcg->last_scanned_node = MAX_NUMNODES; INIT_LIST_HEAD(&memcg->oom_notify); - memcg->move_charge_at_immigrate = 0; mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); vmpressure_init(&memcg->vmpressure); @@ -4263,48 +4175,37 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); #endif - return &memcg->css; - -free_out: - __mem_cgroup_free(memcg); - return ERR_PTR(error); + return memcg; +fail: + mem_cgroup_free(memcg); + return NULL; } -static int -mem_cgroup_css_online(struct cgroup_subsys_state *css) +static struct cgroup_subsys_state * __ref +mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); - int ret; - - if (css->id > MEM_CGROUP_ID_MAX) - return -ENOSPC; - - if (!parent) - return 0; - - mutex_lock(&memcg_create_mutex); + struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); + struct mem_cgroup *memcg; + long error = -ENOMEM; - memcg->use_hierarchy = parent->use_hierarchy; - memcg->oom_kill_disable = parent->oom_kill_disable; - memcg->swappiness = mem_cgroup_swappiness(parent); + memcg = mem_cgroup_alloc(); + if (!memcg) + return ERR_PTR(error); - if (parent->use_hierarchy) { + memcg->high = PAGE_COUNTER_MAX; + memcg->soft_limit = PAGE_COUNTER_MAX; + if (parent) { + memcg->swappiness = mem_cgroup_swappiness(parent); + memcg->oom_kill_disable = parent->oom_kill_disable; + } + if (parent && parent->use_hierarchy) { + memcg->use_hierarchy = true; 
page_counter_init(&memcg->memory, &parent->memory); - memcg->high = PAGE_COUNTER_MAX; - memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, &parent->memsw); page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); - - /* - * No need to take a reference to the parent because cgroup - * core guarantees its existence. - */ } else { page_counter_init(&memcg->memory, NULL); - memcg->high = PAGE_COUNTER_MAX; - memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); page_counter_init(&memcg->tcpmem, NULL); @@ -4316,21 +4217,31 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) if (parent != root_mem_cgroup) memory_cgrp_subsys.broken_hierarchy = true; } - mutex_unlock(&memcg_create_mutex); - ret = memcg_propagate_kmem(memcg); - if (ret) - return ret; + /* The following stuff does not apply to the root */ + if (!parent) { + root_mem_cgroup = memcg; + return &memcg->css; + } + + error = memcg_propagate_kmem(parent, memcg); + if (error) + goto fail; if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_inc(&memcg_sockets_enabled_key); - /* - * Make sure the memcg is initialized: mem_cgroup_iter() - * orders reading memcg->initialized against its callers - * reading the memcg members. 
- */ - smp_store_release(&memcg->initialized, 1); + return &memcg->css; +fail: + mem_cgroup_free(memcg); + return NULL; +} + +static int +mem_cgroup_css_online(struct cgroup_subsys_state *css) +{ + if (css->id > MEM_CGROUP_ID_MAX) + return -ENOSPC; return 0; } @@ -4352,10 +4263,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) } spin_unlock(&memcg->event_list_lock); - vmpressure_cleanup(&memcg->vmpressure); - memcg_offline_kmem(memcg); - wb_memcg_offline(memcg); } @@ -4376,8 +4284,11 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active) static_branch_dec(&memcg_sockets_enabled_key); + vmpressure_cleanup(&memcg->vmpressure); + cancel_work_sync(&memcg->high_work); + mem_cgroup_remove_from_trees(memcg); memcg_free_kmem(memcg); - __mem_cgroup_free(memcg); + mem_cgroup_free(memcg); } /** -- cgit v0.10.2 From 37e84351198be087335ad2b2253b35c7cc76a5ad Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 20 Jan 2016 15:02:56 -0800 Subject: mm: memcontrol: charge swap to cgroup2 This patchset introduces swap accounting to cgroup2. This patch (of 7): In the legacy hierarchy we charge memsw, which is dubious, because: - memsw.limit must be >= memory.limit, so it is impossible to limit swap usage less than memory usage. Taking into account the fact that the primary limiting mechanism in the unified hierarchy is memory.high while memory.limit is either left unset or set to a very large value, moving memsw.limit knob to the unified hierarchy would effectively make it impossible to limit swap usage according to the user preference. - memsw.usage != memory.usage + swap.usage, because a page occupying both swap entry and a swap cache page is charged only once to memsw counter. As a result, it is possible to effectively eat up to memory.limit of memory pages *and* memsw.limit of swap entries, which looks unexpected. 
That said, we should provide a different swap limiting mechanism for cgroup2. This patch adds the mem_cgroup->swap counter, which charges the actual number of swap entries used by a cgroup. It is only charged in the unified hierarchy, while the legacy hierarchy memsw logic is left intact. The swap usage can be monitored using the new memory.swap.current file and limited using memory.swap.max. Note that, to charge the swap resource properly in the unified hierarchy, we have to make swap_entry_free uncharge swap only when ->usage reaches zero, not just ->count, i.e. when all references to a swap entry, including the one taken by the swap cache, are gone. This is necessary because otherwise swap-in could result in uncharging swap even if the page is still in the swap cache and hence still occupies a swap entry. At the same time, this shouldn't break the memsw counter logic, where a page is never charged twice for using both memory and swap, because in the case of the legacy hierarchy we uncharge swap on commit (see mem_cgroup_commit_charge).
Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 27123e5..6e01262 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -163,6 +163,7 @@ struct mem_cgroup { /* Accounted resources */ struct page_counter memory; + struct page_counter swap; /* Legacy consumer-oriented counters */ struct page_counter memsw; diff --git a/include/linux/swap.h b/include/linux/swap.h index 414e101..83b95f3 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -368,11 +368,17 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) #endif #ifdef CONFIG_MEMCG_SWAP extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); +extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); extern void mem_cgroup_uncharge_swap(swp_entry_t entry); #else static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry) { } +static inline int mem_cgroup_try_charge_swap(struct page *page, + swp_entry_t entry) +{ + return 0; +} static inline void mem_cgroup_uncharge_swap(swp_entry_t entry) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f6bc78f..1ff552e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1220,7 +1220,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) pr_cont(":"); for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { - if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account()) + if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) continue; pr_cont(" %s:%luKB", mem_cgroup_stat_names[i], K(mem_cgroup_read_stat(iter, i))); @@ -1259,9 +1259,12 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) limit = memcg->memory.limit; if (mem_cgroup_swappiness(memcg)) { unsigned long memsw_limit; + unsigned long swap_limit; memsw_limit = memcg->memsw.limit; - limit = min(limit + total_swap_pages, 
memsw_limit); + swap_limit = memcg->swap.limit; + swap_limit = min(swap_limit, (unsigned long)total_swap_pages); + limit = min(limit + swap_limit, memsw_limit); } return limit; } @@ -4201,11 +4204,13 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (parent && parent->use_hierarchy) { memcg->use_hierarchy = true; page_counter_init(&memcg->memory, &parent->memory); + page_counter_init(&memcg->swap, &parent->swap); page_counter_init(&memcg->memsw, &parent->memsw); page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); } else { page_counter_init(&memcg->memory, NULL); + page_counter_init(&memcg->swap, NULL); page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); page_counter_init(&memcg->tcpmem, NULL); @@ -5224,7 +5229,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, if (page->mem_cgroup) goto out; - if (do_memsw_account()) { + if (do_swap_account) { swp_entry_t ent = { .val = page_private(page), }; unsigned short id = lookup_swap_cgroup_id(ent); @@ -5677,26 +5682,66 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) memcg_check_events(memcg, page); } +/* + * mem_cgroup_try_charge_swap - try charging a swap entry + * @page: page being added to swap + * @entry: swap entry to charge + * + * Try to charge @entry to the memcg that @page belongs to. + * + * Returns 0 on success, -ENOMEM on failure. 
+ */ +int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) +{ + struct mem_cgroup *memcg; + struct page_counter *counter; + unsigned short oldid; + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account) + return 0; + + memcg = page->mem_cgroup; + + /* Readahead page, never charged */ + if (!memcg) + return 0; + + if (!mem_cgroup_is_root(memcg) && + !page_counter_try_charge(&memcg->swap, 1, &counter)) + return -ENOMEM; + + oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); + VM_BUG_ON_PAGE(oldid, page); + mem_cgroup_swap_statistics(memcg, true); + + css_get(&memcg->css); + return 0; +} + /** * mem_cgroup_uncharge_swap - uncharge a swap entry * @entry: swap entry to uncharge * - * Drop the memsw charge associated with @entry. + * Drop the swap charge associated with @entry. */ void mem_cgroup_uncharge_swap(swp_entry_t entry) { struct mem_cgroup *memcg; unsigned short id; - if (!do_memsw_account()) + if (!do_swap_account) return; id = swap_cgroup_record(entry, 0); rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg) { - if (!mem_cgroup_is_root(memcg)) - page_counter_uncharge(&memcg->memsw, 1); + if (!mem_cgroup_is_root(memcg)) { + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + page_counter_uncharge(&memcg->swap, 1); + else + page_counter_uncharge(&memcg->memsw, 1); + } mem_cgroup_swap_statistics(memcg, false); css_put(&memcg->css); } @@ -5720,6 +5765,63 @@ static int __init enable_swap_account(char *s) } __setup("swapaccount=", enable_swap_account); +static u64 swap_current_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; +} + +static int swap_max_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long max = READ_ONCE(memcg->swap.limit); + + if (max == PAGE_COUNTER_MAX) + seq_puts(m, "max\n"); + else + seq_printf(m, "%llu\n", (u64)max * 
PAGE_SIZE); + + return 0; +} + +static ssize_t swap_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long max; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &max); + if (err) + return err; + + mutex_lock(&memcg_limit_mutex); + err = page_counter_limit(&memcg->swap, max); + mutex_unlock(&memcg_limit_mutex); + if (err) + return err; + + return nbytes; +} + +static struct cftype swap_files[] = { + { + .name = "swap.current", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = swap_current_read, + }, + { + .name = "swap.max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = swap_max_show, + .write = swap_max_write, + }, + { } /* terminate */ +}; + static struct cftype memsw_cgroup_files[] = { { .name = "memsw.usage_in_bytes", @@ -5751,6 +5853,8 @@ static int __init mem_cgroup_swap_init(void) { if (!mem_cgroup_disabled() && really_do_swap_account) { do_swap_account = 1; + WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, + swap_files)); WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_cgroup_files)); } diff --git a/mm/shmem.c b/mm/shmem.c index b98e101..fa2ceb2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -912,6 +912,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (!swap.val) goto redirty; + if (mem_cgroup_try_charge_swap(page, swap)) + goto free_swap; + /* * Add inode to shmem_unuse()'s list of swapped-out inodes, * if it's not already there. 
Do it now before the page is @@ -940,6 +943,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) } mutex_unlock(&shmem_swaplist_mutex); +free_swap: swapcache_free(swap); redirty: set_page_dirty(page); diff --git a/mm/swap_state.c b/mm/swap_state.c index 676ff29..69cb246 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -170,6 +170,11 @@ int add_to_swap(struct page *page, struct list_head *list) if (!entry.val) return 0; + if (mem_cgroup_try_charge_swap(page, entry)) { + swapcache_free(entry); + return 0; + } + if (unlikely(PageTransHuge(page))) if (unlikely(split_huge_page_to_list(page, list))) { swapcache_free(entry); diff --git a/mm/swapfile.c b/mm/swapfile.c index 2bb30aa..22a7a1f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -785,14 +785,12 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, count--; } - if (!count) - mem_cgroup_uncharge_swap(entry); - usage = count | has_cache; p->swap_map[offset] = usage; /* free if no reference */ if (!usage) { + mem_cgroup_uncharge_swap(entry); dec_cluster_info_page(p, p->cluster_info, offset); if (offset < p->lowest_bit) p->lowest_bit = offset; -- cgit v0.10.2 From 3337767850b490eec5ca822f871241c981664737 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 20 Jan 2016 15:02:59 -0800 Subject: mm: vmscan: pass memcg to get_scan_count() memcg will come in handy in get_scan_count(). It can already be used for getting swappiness immediately in get_scan_count() instead of passing it around. The following patches will add more memcg-related values, which will be used there. 
Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index 05dd182..014ff89 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1966,10 +1966,11 @@ enum scan_balance { * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ -static void get_scan_count(struct lruvec *lruvec, int swappiness, +static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, struct scan_control *sc, unsigned long *nr, unsigned long *lru_pages) { + int swappiness = mem_cgroup_swappiness(memcg); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; u64 fraction[2]; u64 denominator = 0; /* gcc */ @@ -2193,9 +2194,10 @@ static inline void init_tlb_ubc(void) /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ -static void shrink_lruvec(struct lruvec *lruvec, int swappiness, - struct scan_control *sc, unsigned long *lru_pages) +static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg, + struct scan_control *sc, unsigned long *lru_pages) { + struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; unsigned long nr_to_scan; @@ -2205,7 +2207,7 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness, struct blk_plug plug; bool scan_adjusted; - get_scan_count(lruvec, swappiness, sc, nr, lru_pages); + get_scan_count(lruvec, memcg, sc, nr, lru_pages); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); @@ -2409,8 +2411,6 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, unsigned long lru_pages; unsigned long reclaimed; unsigned long scanned; - struct lruvec *lruvec; - int swappiness; if (mem_cgroup_low(root, memcg)) { if (!sc->may_thrash) @@ -2418,12 +2418,10 @@ static 
bool shrink_zone(struct zone *zone, struct scan_control *sc, mem_cgroup_events(memcg, MEMCG_LOW, 1); } - lruvec = mem_cgroup_zone_lruvec(zone, memcg); - swappiness = mem_cgroup_swappiness(memcg); reclaimed = sc->nr_reclaimed; scanned = sc->nr_scanned; - shrink_lruvec(lruvec, swappiness, sc, &lru_pages); + shrink_zone_memcg(zone, memcg, sc, &lru_pages); zone_lru_pages += lru_pages; if (memcg && is_classzone) @@ -2893,8 +2891,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, .may_unmap = 1, .may_swap = !noswap, }; - struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); - int swappiness = mem_cgroup_swappiness(memcg); unsigned long lru_pages; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | @@ -2911,7 +2907,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ - shrink_lruvec(lruvec, swappiness, &sc, &lru_pages); + shrink_zone_memcg(zone, memcg, &sc, &lru_pages); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); -- cgit v0.10.2 From eb01aaab43084f1c919ce66183fea005033351b9 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 20 Jan 2016 15:03:02 -0800 Subject: mm: memcontrol: replace mem_cgroup_lruvec_online with mem_cgroup_online mem_cgroup_lruvec_online() takes lruvec, but it only needs memcg. Since get_scan_count(), which is the only user of this function, now possesses pointer to memcg, let's pass memcg directly to mem_cgroup_online() instead of picking it out of lruvec and rename the function accordingly. 
Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6e01262..1666617 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -355,6 +355,13 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } +static inline bool mem_cgroup_online(struct mem_cgroup *memcg) +{ + if (mem_cgroup_disabled()) + return true; + return !!(memcg->css.flags & CSS_ONLINE); +} + /* * For memory reclaim. */ @@ -363,20 +370,6 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int nr_pages); -static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec) -{ - struct mem_cgroup_per_zone *mz; - struct mem_cgroup *memcg; - - if (mem_cgroup_disabled()) - return true; - - mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); - memcg = mz->memcg; - - return !!(memcg->css.flags & CSS_ONLINE); -} - static inline unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { @@ -589,13 +582,13 @@ static inline bool mem_cgroup_disabled(void) return true; } -static inline bool -mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) +static inline bool mem_cgroup_online(struct mem_cgroup *memcg) { return true; } -static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec) +static inline bool +mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) { return true; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 014ff89..9a8556e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1997,7 +1997,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, if (current_is_kswapd()) { if (!zone_reclaimable(zone)) force_scan = true; - if (!mem_cgroup_lruvec_online(lruvec)) + if (!mem_cgroup_online(memcg)) force_scan = true; } if (!global_reclaim(sc)) -- cgit 
v0.10.2 From 6f2cb2f17700a39567cf3e9a2e95041def5f3688 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 20 Jan 2016 15:03:05 -0800 Subject: swap.h: move memcg related stuff to the end of the file The following patches will add more functions to the memcg section of include/linux/swap.h. Some of them will need values defined below the current location of the section. So let's move the section to the end of the file. No functional changes intended. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/swap.h b/include/linux/swap.h index 83b95f3..c2bd163 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -350,39 +350,7 @@ extern void check_move_unevictable_pages(struct page **, int nr_pages); extern int kswapd_run(int nid); extern void kswapd_stop(int nid); -#ifdef CONFIG_MEMCG -static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) -{ - /* root ? */ - if (mem_cgroup_disabled() || !memcg->css.parent) - return vm_swappiness; - - return memcg->swappiness; -} -#else -static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) -{ - return vm_swappiness; -} -#endif -#ifdef CONFIG_MEMCG_SWAP -extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); -extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); -extern void mem_cgroup_uncharge_swap(swp_entry_t entry); -#else -static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry) -{ -} -static inline int mem_cgroup_try_charge_swap(struct page *page, - swp_entry_t entry) -{ - return 0; -} -static inline void mem_cgroup_uncharge_swap(swp_entry_t entry) -{ -} -#endif #ifdef CONFIG_SWAP /* linux/mm/page_io.c */ extern int swap_readpage(struct page *); @@ -561,5 +529,43 @@ static inline swp_entry_t get_swap_page(void) } #endif /* CONFIG_SWAP */ + +#ifdef CONFIG_MEMCG +static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) +{ + /* 
root ? */ + if (mem_cgroup_disabled() || !memcg->css.parent) + return vm_swappiness; + + return memcg->swappiness; +} + +#else +static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) +{ + return vm_swappiness; +} +#endif + +#ifdef CONFIG_MEMCG_SWAP +extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); +extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); +extern void mem_cgroup_uncharge_swap(swp_entry_t entry); +#else +static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry) +{ +} + +static inline int mem_cgroup_try_charge_swap(struct page *page, + swp_entry_t entry) +{ + return 0; +} + +static inline void mem_cgroup_uncharge_swap(swp_entry_t entry) +{ +} +#endif + #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ -- cgit v0.10.2 From d8b38438a0bcb362c396f49d8279ef7b505917f4 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 20 Jan 2016 15:03:07 -0800 Subject: mm: vmscan: do not scan anon pages if memcg swap limit is hit We don't scan anonymous memory if we ran out of swap, neither should we do it in case memcg swap limit is hit, because swap out is impossible anyway. 
Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/swap.h b/include/linux/swap.h index c2bd163..a587050 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -551,6 +551,7 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); extern void mem_cgroup_uncharge_swap(swp_entry_t entry); +extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); #else static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry) { @@ -565,6 +566,11 @@ static inline int mem_cgroup_try_charge_swap(struct page *page, static inline void mem_cgroup_uncharge_swap(swp_entry_t entry) { } + +static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) +{ + return get_nr_swap_pages(); +} #endif #endif /* __KERNEL__*/ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1ff552e..bcb3871 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5748,6 +5748,19 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) rcu_read_unlock(); } +long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) +{ + long nr_swap_pages = get_nr_swap_pages(); + + if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return nr_swap_pages; + for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) + nr_swap_pages = min_t(long, nr_swap_pages, + READ_ONCE(memcg->swap.limit) - + page_counter_read(&memcg->swap)); + return nr_swap_pages; +} + /* for remember boot option*/ #ifdef CONFIG_MEMCG_SWAP_ENABLED static int really_do_swap_account __initdata = 1; diff --git a/mm/vmscan.c b/mm/vmscan.c index 9a8556e..3be5f9d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2004,7 +2004,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, force_scan = true; /* If we have no swap 
space, do not bother scanning anon pages. */ - if (!sc->may_swap || (get_nr_swap_pages() <= 0)) { + if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) { scan_balance = SCAN_FILE; goto out; } -- cgit v0.10.2 From 5ccc5abaaf6f9242cc63342c5286990233f392fa Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 20 Jan 2016 15:03:10 -0800 Subject: mm: free swap cache aggressively if memcg swap is full Swap cache pages are freed aggressively if swap is nearly full (>50% currently), because otherwise we are likely to stop scanning anonymous when we near the swap limit even if there is plenty of freeable swap cache pages. We should follow the same trend in case of memory cgroup, which has its own swap limit. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/swap.h b/include/linux/swap.h index a587050..d18b65c 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -552,6 +552,7 @@ extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); extern void mem_cgroup_uncharge_swap(swp_entry_t entry); extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); +extern bool mem_cgroup_swap_full(struct page *page); #else static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry) { @@ -571,6 +572,11 @@ static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) { return get_nr_swap_pages(); } + +static inline bool mem_cgroup_swap_full(struct page *page) +{ + return vm_swap_full(); +} #endif #endif /* __KERNEL__*/ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bcb3871..6a00079 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5761,6 +5761,28 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) return nr_swap_pages; } +bool mem_cgroup_swap_full(struct page *page) +{ + struct mem_cgroup *memcg; + + 
VM_BUG_ON_PAGE(!PageLocked(page), page); + + if (vm_swap_full()) + return true; + if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return false; + + memcg = page->mem_cgroup; + if (!memcg) + return false; + + for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) + if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit) + return true; + + return false; +} + /* for remember boot option*/ #ifdef CONFIG_MEMCG_SWAP_ENABLED static int really_do_swap_account __initdata = 1; diff --git a/mm/memory.c b/mm/memory.c index ff17850..30991f8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2582,7 +2582,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, } swap_free(entry); - if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) + if (mem_cgroup_swap_full(page) || + (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) try_to_free_swap(page); unlock_page(page); if (page != swapcache) { diff --git a/mm/swapfile.c b/mm/swapfile.c index 22a7a1f..c43f654 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1006,7 +1006,7 @@ int free_swap_and_cache(swp_entry_t entry) * Also recheck PageSwapCache now page is locked (above). */ if (PageSwapCache(page) && !PageWriteback(page) && - (!page_mapped(page) || vm_swap_full())) { + (!page_mapped(page) || mem_cgroup_swap_full(page))) { delete_from_swap_cache(page); SetPageDirty(page); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 3be5f9d..bd620b6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1214,7 +1214,7 @@ cull_mlocked: activate_locked: /* Not a candidate for swapping, so reclaim swap space. 
*/ - if (PageSwapCache(page) && vm_swap_full()) + if (PageSwapCache(page) && mem_cgroup_swap_full(page)) try_to_free_swap(page); VM_BUG_ON_PAGE(PageActive(page), page); SetPageActive(page); -- cgit v0.10.2 From 3e24b19dd5ff0587674ac7578cc11ef079708327 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 20 Jan 2016 15:03:13 -0800 Subject: Documentation: cgroup: add memory.swap.{current,max} description The rationale of separate swap counter is given by Johannes Weiner. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 31d1f7b..f441564 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -819,6 +819,22 @@ PAGE_SIZE multiple when read back. the cgroup. This may not exactly match the number of processes killed but should generally be close. + memory.swap.current + + A read-only single value file which exists on non-root + cgroups. + + The total amount of swap currently being used by the cgroup + and its descendants. + + memory.swap.max + + A read-write single value file which exists on non-root + cgroups. The default is "max". + + Swap usage hard limit. If a cgroup's swap usage reaches this + limit, anonymous memory of the cgroup will not be swapped out. + 5-2-2. General Usage @@ -1291,3 +1307,20 @@ allocation from the slack available in other groups or the rest of the system than killing the group. Otherwise, memory.max is there to limit this type of spillover and ultimately contain buggy or even malicious applications. + +The combined memory+swap accounting and limiting is replaced by real +control over swap space. + +The main argument for a combined memory+swap facility in the original +cgroup design was that global or parental pressure would always be +able to swap all anonymous memory of a child group, regardless of the +child's own (possibly untrusted) configuration.
However, untrusted +groups can sabotage swapping by other means - such as referencing their +anonymous memory in a tight loop - and an admin cannot assume full +swappability when overcommitting untrusted jobs. + +For trusted jobs, on the other hand, a combined counter is not an +intuitive userspace interface, and it flies in the face of the idea +that cgroup controllers should account and limit specific physical +resources. Swap space is a resource like all others in the system, +and that's why unified hierarchy allows distributing it separately. -- cgit v0.10.2 From 44b7a8d33d666268062e0f725d5f14813a63a6ea Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:03:16 -0800 Subject: mm: memcontrol: do not uncharge old page in page cache replacement Changing page->mem_cgroup of a live page is tricky and fragile. In particular, the memcg writeback code relies on that mapping being stable and users of mem_cgroup_replace_page() not overlapping with dirtyable inodes. Page cache replacement doesn't have to do that, though. Instead of being clever and transferring the charge from the old page to the new, force-charge the new page and leave the old page alone. A temporary overcharge won't matter in practice, and the old page is going to be freed shortly after this anyway. And this is not performance critical. Signed-off-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6a00079..bf35bff 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -366,13 +366,6 @@ mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) * * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup * is returned.
- * - * XXX: The above description of behavior on the default hierarchy isn't - * strictly true yet as replace_page_cache_page() can modify the - * association before @page is released even on the default hierarchy; - * however, the current and planned usages don't mix the the two functions - * and replace_page_cache_page() will soon be updated to make the invariant - * actually true. */ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) { @@ -5464,7 +5457,8 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) { struct mem_cgroup *memcg; - int isolated; + unsigned int nr_pages; + bool compound; VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); @@ -5484,11 +5478,21 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) if (!memcg) return; - lock_page_lru(oldpage, &isolated); - oldpage->mem_cgroup = NULL; - unlock_page_lru(oldpage, isolated); + /* Force-charge the new page. The old one will be freed soon */ + compound = PageTransHuge(newpage); + nr_pages = compound ? hpage_nr_pages(newpage) : 1; + + page_counter_charge(&memcg->memory, nr_pages); + if (do_memsw_account()) + page_counter_charge(&memcg->memsw, nr_pages); + css_get_many(&memcg->css, nr_pages); commit_charge(newpage, memcg, true); + + local_irq_disable(); + mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages); + memcg_check_events(memcg, newpage); + local_irq_enable(); } DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); -- cgit v0.10.2 From 587d9f726aaec52157e4156e50363dbe6cb82bdb Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:03:19 -0800 Subject: mm: memcontrol: basic memory statistics in cgroup2 memory controller Provide a cgroup2 memory.stat that provides statistics on LRU memory and fault event counters. More consumers and breakdowns will follow. 
Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index f441564..65b3eac 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -819,6 +819,62 @@ PAGE_SIZE multiple when read back. the cgroup. This may not exactly match the number of processes killed but should generally be close. + memory.stat + + A read-only flat-keyed file which exists on non-root cgroups. + + This breaks down the cgroup's memory footprint into different + types of memory, type-specific details, and other information + on the state and past events of the memory management system. + + All memory amounts are in bytes. + + The entries are ordered to be human readable, and new entries + can show up in the middle. Don't rely on items remaining in a + fixed position; use the keys to look up specific values! + + anon + + Amount of memory used in anonymous mappings such as + brk(), sbrk(), and mmap(MAP_ANONYMOUS) + + file + + Amount of memory used to cache filesystem data, + including tmpfs and shared memory. 
+ + file_mapped + + Amount of cached filesystem data mapped with mmap() + + file_dirty + + Amount of cached filesystem data that was modified but + not yet written back to disk + + file_writeback + + Amount of cached filesystem data that was modified and + is currently being written back to disk + + inactive_anon + active_anon + inactive_file + active_file + unevictable + + Amount of memory, swap-backed and filesystem-backed, + on the internal memory management lists used by the + page reclaim algorithm + + pgfault + + Total number of page faults incurred + + pgmajfault + + Number of major page faults incurred + memory.swap.current A read-only single value file which exists on non-root diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bf35bff..98f4109 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2767,6 +2767,18 @@ static unsigned long tree_stat(struct mem_cgroup *memcg, return val; } +static unsigned long tree_events(struct mem_cgroup *memcg, + enum mem_cgroup_events_index idx) +{ + struct mem_cgroup *iter; + unsigned long val = 0; + + for_each_mem_cgroup_tree(iter, memcg) + val += mem_cgroup_read_events(iter, idx); + + return val; +} + static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { unsigned long val; @@ -5096,6 +5108,57 @@ static int memory_events_show(struct seq_file *m, void *v) return 0; } +static int memory_stat_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + int i; + + /* + * Provide statistics on the state of the memory subsystem as + * well as cumulative event counters that show past behavior. 
+ * + * This list is ordered following a combination of these gradients: + * 1) generic big picture -> specifics and details + * 2) reflecting userspace activity -> reflecting kernel heuristics + * + * Current memory state: + */ + + seq_printf(m, "anon %llu\n", + (u64)tree_stat(memcg, MEM_CGROUP_STAT_RSS) * PAGE_SIZE); + seq_printf(m, "file %llu\n", + (u64)tree_stat(memcg, MEM_CGROUP_STAT_CACHE) * PAGE_SIZE); + + seq_printf(m, "file_mapped %llu\n", + (u64)tree_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED) * + PAGE_SIZE); + seq_printf(m, "file_dirty %llu\n", + (u64)tree_stat(memcg, MEM_CGROUP_STAT_DIRTY) * + PAGE_SIZE); + seq_printf(m, "file_writeback %llu\n", + (u64)tree_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) * + PAGE_SIZE); + + for (i = 0; i < NR_LRU_LISTS; i++) { + struct mem_cgroup *mi; + unsigned long val = 0; + + for_each_mem_cgroup_tree(mi, memcg) + val += mem_cgroup_nr_lru_pages(mi, BIT(i)); + seq_printf(m, "%s %llu\n", + mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE); + } + + /* Accumulated memory events */ + + seq_printf(m, "pgfault %lu\n", + tree_events(memcg, MEM_CGROUP_EVENTS_PGFAULT)); + seq_printf(m, "pgmajfault %lu\n", + tree_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT)); + + return 0; +} + static struct cftype memory_files[] = { { .name = "current", @@ -5126,6 +5189,11 @@ static struct cftype memory_files[] = { .file_offset = offsetof(struct mem_cgroup, events_file), .seq_show = memory_events_show, }, + { + .name = "stat", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_stat_show, + }, { } /* terminate */ }; -- cgit v0.10.2 From b2807f07f4f87362925b8a5b8cbb7b624da10f03 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Jan 2016 15:03:22 -0800 Subject: mm: memcontrol: add "sock" to cgroup2 memory.stat Provide statistics on how much of a cgroup's memory footprint is made up of socket buffers from network connections owned by the group. 
Signed-off-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1666617..9ae48d4 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -50,6 +50,9 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */ MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ MEM_CGROUP_STAT_NSTATS, + /* default hierarchy stats */ + MEMCG_SOCK, + MEMCG_NR_STAT, }; struct mem_cgroup_reclaim_cookie { @@ -87,7 +90,7 @@ enum mem_cgroup_events_target { #ifdef CONFIG_MEMCG struct mem_cgroup_stat_cpu { - long count[MEM_CGROUP_STAT_NSTATS]; + long count[MEMCG_NR_STAT]; unsigned long events[MEMCG_NR_EVENTS]; unsigned long nr_page_events; unsigned long targets[MEM_CGROUP_NTARGETS]; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 98f4109..ca052f2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5128,6 +5128,8 @@ static int memory_stat_show(struct seq_file *m, void *v) (u64)tree_stat(memcg, MEM_CGROUP_STAT_RSS) * PAGE_SIZE); seq_printf(m, "file %llu\n", (u64)tree_stat(memcg, MEM_CGROUP_STAT_CACHE) * PAGE_SIZE); + seq_printf(m, "sock %llu\n", + (u64)tree_stat(memcg, MEMCG_SOCK) * PAGE_SIZE); seq_printf(m, "file_mapped %llu\n", (u64)tree_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED) * @@ -5631,6 +5633,8 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) if (in_softirq()) gfp_mask = GFP_NOWAIT; + this_cpu_add(memcg->stat->count[MEMCG_SOCK], nr_pages); + if (try_charge(memcg, gfp_mask, nr_pages) == 0) return true; @@ -5650,6 +5654,8 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) return; } + this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages); + page_counter_uncharge(&memcg->memory, nr_pages); css_put_many(&memcg->css, nr_pages); } -- cgit v0.10.2 From 9f273c24ec5f4a6f785bb83e931b3808a07b459e Mon Sep 17 00:00:00 2001 From: 
Fengguang Wu Date: Wed, 20 Jan 2016 15:03:25 -0800 Subject: MAINTAINERS: add/fix git URLs for various subsystems Add/fix git URLs for various subsystems Add git URL for at91 Add git URL for Rockchip Add git URL for ARM64 Update git URL for ath6kl Add git URL for backlight Add git URL for chrome Add git URL for cris Add git URL for cryptodev Update git URL for DLM Add git URL for eCryptfs Add git URL for ext4 Add git URL for hwspinlock Add git URL for integrity Add git URL for IPVS Add git URL for nfsd Add git URL for KVM/s390 Add git URL for kgdb Add git URL for nvdimm Add git URL for metag Add git URL for wireless drivers Add git URL for devicetree Update git URL for PCMCIA Update git URL for pstore Update git URL for ath10k Add git URL for hexagon Add git URL for reset Add git URL for s390 Fix tree format for SAMSUNG thermal Add git URL for md Add git URL for squashfs Add git URL for swiotlb Add git URL for xtensa Fix tree format for TPM Add git URL for UML Add git URL for VFIO Add git URL for vhost Update git URL for XFS Fix MIC maintainers entry Signed-off-by: Fengguang Wu Acked-by: Alexandre Belloni Acked-by: Nicolas Ferre Acked-by: Catalin Marinas Acked-by: Kalle Valo Acked-by: Cornelia Huck Acked-by: James Hogan Acked-by: Dominik Brodowski Signed-off-by: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index 66662b8..c161bdc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -781,6 +781,7 @@ F: sound/aoa/ APM DRIVER M: Jiri Kosina S: Odd fixes +T: git git://git.kernel.org/pub/scm/linux/kernel/git/jikos/apm.git F: arch/x86/kernel/apm_32.c F: include/linux/apm_bios.h F: include/uapi/linux/apm_bios.h @@ -946,6 +947,7 @@ M: Alexandre Belloni M: Jean-Christophe Plagniol-Villard L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.linux4sam.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/nferre/linux-at91.git S: Supported F: arch/arm/mach-at91/ F: include/soc/at91/ 
@@ -1454,6 +1456,7 @@ ARM/Rockchip SoC support M: Heiko Stuebner L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: linux-rockchip@lists.infradead.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/mmind/linux-rockchip.git S: Maintained F: arch/arm/boot/dts/rk3* F: arch/arm/mach-rockchip/ @@ -1778,6 +1781,7 @@ ARM64 PORT (AARCH64 ARCHITECTURE) M: Catalin Marinas M: Will Deacon L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) +T: git git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git S: Maintained F: arch/arm64/ F: Documentation/arm64/ @@ -1863,7 +1867,7 @@ ATHEROS ATH6KL WIRELESS DRIVER M: Kalle Valo L: linux-wireless@vger.kernel.org W: http://wireless.kernel.org/en/users/Drivers/ath6kl -T: git git://github.com/kvalo/ath.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/ath.git S: Supported F: drivers/net/wireless/ath/ath6kl/ @@ -2115,6 +2119,7 @@ F: drivers/net/wireless/broadcom/b43legacy/ BACKLIGHT CLASS/SUBSYSTEM M: Jingoo Han M: Lee Jones +T: git git://git.kernel.org/pub/scm/linux/kernel/git/lee/backlight.git S: Maintained F: drivers/video/backlight/ F: include/linux/backlight.h @@ -2796,6 +2801,7 @@ F: drivers/input/touchscreen/chipone_icn8318.c CHROME HARDWARE PLATFORM SUPPORT M: Olof Johansson S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/olof/chrome-platform.git F: drivers/platform/chrome/ CISCO VIC ETHERNET NIC DRIVER @@ -3094,6 +3100,7 @@ M: Mikael Starvik M: Jesper Nilsson L: linux-cris-kernel@axis.com W: http://developer.axis.com +T: git git://git.kernel.org/pub/scm/linux/kernel/git/jesper/cris.git S: Maintained F: arch/cris/ F: drivers/tty/serial/crisv10.* @@ -3102,6 +3109,7 @@ CRYPTO API M: Herbert Xu M: "David S. 
Miller" L: linux-crypto@vger.kernel.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6.git S: Maintained F: Documentation/crypto/ @@ -3551,7 +3559,7 @@ M: Christine Caulfield M: David Teigland L: cluster-devel@redhat.com W: http://sources.redhat.com/cluster/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/teigland/dlm.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/teigland/linux-dlm.git S: Supported F: fs/dlm/ @@ -3965,6 +3973,7 @@ M: Tyler Hicks L: ecryptfs@vger.kernel.org W: http://ecryptfs.org W: https://launchpad.net/ecryptfs +T: git git://git.kernel.org/pub/scm/linux/kernel/git/tyhicks/ecryptfs.git S: Supported F: Documentation/filesystems/ecryptfs.txt F: fs/ecryptfs/ @@ -4243,6 +4252,7 @@ M: Andreas Dilger L: linux-ext4@vger.kernel.org W: http://ext4.wiki.kernel.org Q: http://patchwork.ozlabs.org/project/linux-ext4/list/ +T: git git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git S: Maintained F: Documentation/filesystems/ext4.txt F: fs/ext4/ @@ -4925,6 +4935,7 @@ F: include/linux/hw_random.h HARDWARE SPINLOCK CORE M: Ohad Ben-Cohen S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/ohad/hwspinlock.git F: Documentation/hwspinlock.txt F: drivers/hwspinlock/hwspinlock_* F: include/linux/hwspinlock.h @@ -5463,6 +5474,7 @@ M: Dmitry Kasatkin L: linux-ima-devel@lists.sourceforge.net L: linux-ima-user@lists.sourceforge.net L: linux-security-module@vger.kernel.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/zohar/linux-integrity.git S: Supported F: security/integrity/ima/ @@ -5718,11 +5730,11 @@ F: include/linux/mic_bus.h F: include/linux/scif.h F: include/uapi/linux/mic_common.h F: include/uapi/linux/mic_ioctl.h -F include/uapi/linux/scif_ioctl.h +F: include/uapi/linux/scif_ioctl.h F: drivers/misc/mic/ F: drivers/dma/mic_x100_dma.c F: drivers/dma/mic_x100_dma.h -F Documentation/mic/ +F: Documentation/mic/ 
INTEL PMC/P-Unit IPC DRIVER M: Zha Qipeng @@ -5803,6 +5815,8 @@ M: Julian Anastasov L: netdev@vger.kernel.org L: lvs-devel@vger.kernel.org S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs-next.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs.git F: Documentation/networking/ipvs-sysctl.txt F: include/net/ip_vs.h F: include/uapi/linux/ip_vs.h @@ -6086,6 +6100,7 @@ M: "J. Bruce Fields" M: Jeff Layton L: linux-nfs@vger.kernel.org W: http://nfs.sourceforge.net/ +T: git git://linux-nfs.org/~bfields/linux.git S: Supported F: fs/nfsd/ F: include/uapi/linux/nfsd/ @@ -6142,6 +6157,7 @@ M: Christian Borntraeger M: Cornelia Huck L: linux-s390@vger.kernel.org W: http://www.ibm.com/developerworks/linux/linux390/ +T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git S: Supported F: Documentation/s390/kvm.txt F: arch/s390/include/asm/kvm* @@ -6215,6 +6231,7 @@ KGDB / KDB /debug_core M: Jason Wessel W: http://kgdb.wiki.kernel.org/ L: kgdb-bugreport@lists.sourceforge.net +T: git git://git.kernel.org/pub/scm/linux/kernel/git/jwessel/kgdb.git S: Maintained F: Documentation/DocBook/kgdb.tmpl F: drivers/misc/kgdbts.c @@ -6386,6 +6403,7 @@ LIBNVDIMM: NON-VOLATILE MEMORY DEVICE SUBSYSTEM M: Dan Williams L: linux-nvdimm@lists.01.org Q: https://patchwork.kernel.org/project/linux-nvdimm/list/ +T: git git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm.git S: Supported F: drivers/nvdimm/* F: include/linux/nd.h @@ -7055,6 +7073,7 @@ F: Documentation/hwmon/menf21bmc METAG ARCHITECTURE M: James Hogan L: linux-metag@vger.kernel.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/jhogan/metag.git S: Odd Fixes F: arch/metag/ F: Documentation/metag/ @@ -7536,7 +7555,8 @@ NETWORKING DRIVERS (WIRELESS) M: Kalle Valo L: linux-wireless@vger.kernel.org Q: http://patchwork.kernel.org/project/linux-wireless/list/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/wireless-drivers.git/ +T: git 
git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/wireless-drivers.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/wireless-drivers-next.git S: Maintained F: drivers/net/wireless/ @@ -7942,6 +7962,7 @@ M: Mark Rutland M: Ian Campbell M: Kumar Gala L: devicetree@vger.kernel.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/robh/linux.git S: Maintained F: Documentation/devicetree/ F: arch/*/boot/dts/ @@ -8318,7 +8339,7 @@ PCMCIA SUBSYSTEM P: Linux PCMCIA Team L: linux-pcmcia@lists.infradead.org W: http://lists.infradead.org/mailman/listinfo/linux-pcmcia -T: git git://git.kernel.org/pub/scm/linux/kernel/git/brodo/pcmcia-2.6.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/brodo/pcmcia.git S: Maintained F: Documentation/pcmcia/ F: drivers/pcmcia/ @@ -8640,7 +8661,7 @@ M: Colin Cross M: Kees Cook M: Tony Luck S: Maintained -T: git git://git.infradead.org/users/cbou/linux-pstore.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/aegl/linux.git F: fs/pstore/ F: include/linux/pstore* F: drivers/firmware/efi/efi-pstore.c @@ -8849,13 +8870,14 @@ QUALCOMM ATHEROS ATH10K WIRELESS DRIVER M: Kalle Valo L: ath10k@lists.infradead.org W: http://wireless.kernel.org/en/users/Drivers/ath10k -T: git git://github.com/kvalo/ath.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/ath.git S: Supported F: drivers/net/wireless/ath/ath10k/ QUALCOMM HEXAGON ARCHITECTURE M: Richard Kuo L: linux-hexagon@vger.kernel.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/rkuo/linux-hexagon-kernel.git S: Supported F: arch/hexagon/ @@ -9054,6 +9076,7 @@ F: drivers/phy/phy-rcar-gen3-usb2.c RESET CONTROLLER FRAMEWORK M: Philipp Zabel +T: git git://git.pengutronix.de/git/pza/linux S: Maintained F: drivers/reset/ F: Documentation/devicetree/bindings/reset/ @@ -9201,6 +9224,7 @@ M: Martin Schwidefsky M: Heiko Carstens L: linux-s390@vger.kernel.org W: http://www.ibm.com/developerworks/linux/linux390/ +T: git 
git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git S: Supported F: arch/s390/ F: drivers/s390/ @@ -9393,7 +9417,7 @@ M: Lukasz Majewski L: linux-pm@vger.kernel.org L: linux-samsung-soc@vger.kernel.org S: Supported -T: https://github.com/lmajewski/linux-samsung-thermal.git +T: git https://github.com/lmajewski/linux-samsung-thermal.git F: drivers/thermal/samsung/ SAMSUNG USB2 PHY DRIVER @@ -10046,6 +10070,7 @@ F: drivers/media/pci/solo6x10/ SOFTWARE RAID (Multiple Disks) SUPPORT L: linux-raid@vger.kernel.org +T: git git://neil.brown.name/md S: Supported F: drivers/md/ F: include/linux/raid/ @@ -10217,6 +10242,7 @@ SQUASHFS FILE SYSTEM M: Phillip Lougher L: squashfs-devel@lists.sourceforge.net (subscribers-only) W: http://squashfs.org.uk +T: git git://git.kernel.org/pub/scm/linux/kernel/git/pkl/squashfs-next.git S: Maintained F: Documentation/filesystems/squashfs.txt F: fs/squashfs/ @@ -10413,6 +10439,7 @@ F: arch/x86/boot/video* SWIOTLB SUBSYSTEM M: Konrad Rzeszutek Wilk L: linux-kernel@vger.kernel.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/konrad/swiotlb.git S: Supported F: lib/swiotlb.c F: arch/*/kernel/pci-swiotlb.c @@ -10676,6 +10703,7 @@ TENSILICA XTENSA PORT (xtensa) M: Chris Zankel M: Max Filippov L: linux-xtensa@linux-xtensa.org +T: git git://github.com/czankel/xtensa-linux.git S: Maintained F: arch/xtensa/ F: drivers/irqchip/irq-xtensa-* @@ -10958,7 +10986,7 @@ R: Jason Gunthorpe W: http://tpmdd.sourceforge.net L: tpmdd-devel@lists.sourceforge.net (moderated for non-subscribers) Q: git git://github.com/PeterHuewe/linux-tpmdd.git -T: https://github.com/PeterHuewe/linux-tpmdd +T: git https://github.com/PeterHuewe/linux-tpmdd S: Maintained F: drivers/char/tpm/ @@ -11415,6 +11443,7 @@ M: Richard Weinberger L: user-mode-linux-devel@lists.sourceforge.net L: user-mode-linux-user@lists.sourceforge.net W: http://user-mode-linux.sourceforge.net +T: git git://git.kernel.org/pub/scm/linux/kernel/git/rw/uml.git S: Maintained F: 
Documentation/virtual/uml/ F: arch/um/ @@ -11461,6 +11490,7 @@ F: fs/fat/ VFIO DRIVER M: Alex Williamson L: kvm@vger.kernel.org +T: git git://github.com/awilliam/linux-vfio.git S: Maintained F: Documentation/vfio.txt F: drivers/vfio/ @@ -11530,6 +11560,7 @@ M: "Michael S. Tsirkin" L: kvm@vger.kernel.org L: virtualization@lists.linux-foundation.org L: netdev@vger.kernel.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git S: Maintained F: drivers/vhost/ F: include/uapi/linux/vhost.h @@ -11946,7 +11977,7 @@ M: Dave Chinner M: xfs@oss.sgi.com L: xfs@oss.sgi.com W: http://oss.sgi.com/projects/xfs -T: git git://oss.sgi.com/xfs/xfs.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs.git S: Supported F: Documentation/filesystems/xfs.txt F: fs/xfs/ -- cgit v0.10.2