From 71a8986d7e4845b6fca1298fe6e3233ce6fde0b7 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 18 Jul 2013 16:50:59 -0400 Subject: ARM: suspend: use hash of cpu_logical_map value to index into save array Currently we hash the MPIDR of the CPU being suspended to determine which entry in the sleep_save_sp array to use. In some situations, such as when we want to resume on another physical CPU, the MPIDR of another CPU should be used instead. So let's use the value of cpu_logical_map(smp_processor_id()) in place of the MPIDR in the suspend path. This will result in the same index being used as with the previous code unless the caller has modified cpu_logical_map() beforehand with the MPIDR of the physical CPU the suspending logical CPU will resume on. Consequently, if doing a physical CPU migration, cpu_logical_map() must be updated appropriately somewhere between cpu_pm_enter() and cpu_suspend(). The register allocation in __cpu_suspend is reworked in order to better accommodate the additional argument. Signed-off-by: Nicolas Pitre Acked-by: Lorenzo Pieralisi Reviewed-by: Dave Martin diff --git a/arch/arm/kernel/sleep.S b/arch/arm/kernel/sleep.S index db1536b..6224602 100644 --- a/arch/arm/kernel/sleep.S +++ b/arch/arm/kernel/sleep.S @@ -55,6 +55,7 @@ * specific registers and some other data for resume. * r0 = suspend function arg0 * r1 = suspend function + * r2 = MPIDR value the resuming CPU will use */ ENTRY(__cpu_suspend) stmfd sp!, {r4 - r11, lr} @@ -67,23 +68,18 @@ ENTRY(__cpu_suspend) mov r5, sp @ current virtual SP add r4, r4, #12 @ Space for pgd, virt sp, phys resume fn sub sp, sp, r4 @ allocate CPU state on stack - stmfd sp!, {r0, r1} @ save suspend func arg and pointer - add r0, sp, #8 @ save pointer to save block - mov r1, r4 @ size of save block - mov r2, r5 @ virtual SP ldr r3, =sleep_save_sp + stmfd sp!, {r0, r1} @ save suspend func arg and pointer ldr r3, [r3, #SLEEP_SAVE_SP_VIRT] - ALT_SMP(mrc p15, 0, r9, c0, c0, 5) - ALT_UP_B(1f) - ldr r8, =mpidr_hash - /* - * This ldmia relies on the memory layout of the mpidr_hash - * struct mpidr_hash. - */ - ldmia r8, {r4-r7} @ r4 = mpidr mask (r5,r6,r7) = l[0,1,2] shifts - compute_mpidr_hash lr, r5, r6, r7, r9, r4 - add r3, r3, lr, lsl #2 -1: + ALT_SMP(ldr r0, =mpidr_hash) + ALT_UP_B(1f) + /* This ldmia relies on the memory layout of the mpidr_hash struct */ + ldmia r0, {r1, r6-r8} @ r1 = mpidr mask (r6,r7,r8) = l[0,1,2] shifts + compute_mpidr_hash r0, r6, r7, r8, r2, r1 + add r3, r3, r0, lsl #2 +1: mov r2, r5 @ virtual SP + mov r1, r4 @ size of save block + add r0, sp, #8 @ pointer to save block bl __cpu_suspend_save adr lr, BSYM(cpu_suspend_abort) ldmfd sp!, {r0, pc} @ call suspend fn diff --git a/arch/arm/kernel/suspend.c b/arch/arm/kernel/suspend.c index 41cf3cb..2835d35 100644 --- a/arch/arm/kernel/suspend.c +++ b/arch/arm/kernel/suspend.c @@ -10,7 +10,7 @@ #include #include -extern int __cpu_suspend(unsigned long, int (*)(unsigned long)); +extern int __cpu_suspend(unsigned long, int (*)(unsigned long), u32 cpuid); extern void cpu_resume_mmu(void); #ifdef CONFIG_MMU @@ -21,6 +21,7 @@ extern void cpu_resume_mmu(void); int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) { struct mm_struct *mm = current->active_mm; + u32 __mpidr = cpu_logical_map(smp_processor_id()); int ret; if (!idmap_pgd) @@ -32,7 +33,7 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) * resume (indicated by a zero return code), we need to switch * back to the correct page tables. */ - ret = __cpu_suspend(arg, fn); + ret = __cpu_suspend(arg, fn, __mpidr); if (ret == 0) { cpu_switch_mm(mm->pgd, mm); local_flush_bp_all(); @@ -44,7 +45,8 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) #else int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) { - return __cpu_suspend(arg, fn); + u32 __mpidr = cpu_logical_map(smp_processor_id()); + return __cpu_suspend(arg, fn, __mpidr); } #define idmap_pgd NULL #endif -- cgit v0.10.2 From 1a6b69b6548cd0dd82549393f30dd982ceeb79d2 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 12 Apr 2012 01:40:31 -0400 Subject: ARM: gic: add CPU migration support This is required by the big.LITTLE switcher code. The gic_migrate_target() changes the CPU interface mapping for the current CPU to redirect SGIs to the specified interface, and it also updates the target CPU for each interrupts to that CPU interface if they were targeting the current interface. Finally, pending SGIs for the current CPU are forwarded to the new interface. Because Linux does not use it, the SGI source information for the forwarded SGIs is not preserved. Neither is the source information for the SGIs sent by the current CPU to other CPUs adjusted to match the new CPU interface mapping. The required registers are banked so only the target CPU could do it. Signed-off-by: Nicolas Pitre diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index ee7c503..268874a 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -253,10 +253,9 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val, if (cpu >= NR_GIC_CPU_IF || cpu >= nr_cpu_ids) return -EINVAL; + raw_spin_lock(&irq_controller_lock); mask = 0xff << shift; bit = gic_cpu_map[cpu] << shift; - - raw_spin_lock(&irq_controller_lock); val = readl_relaxed(reg) & ~mask; writel_relaxed(val | bit, reg); raw_spin_unlock(&irq_controller_lock); @@ -646,7 +645,9 @@ static void __init gic_pm_init(struct gic_chip_data *gic) void gic_raise_softirq(const struct cpumask *mask, unsigned int irq) { int cpu; - unsigned long map = 0; + unsigned long flags, map = 0; + + raw_spin_lock_irqsave(&irq_controller_lock, flags); /* Convert our logical CPU mask into a physical one. */ for_each_cpu(cpu, mask) @@ -660,6 +661,86 @@ void gic_raise_softirq(const struct cpumask *mask, unsigned int irq) /* this always happens on GIC0 */ writel_relaxed(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT); + + raw_spin_unlock_irqrestore(&irq_controller_lock, flags); +} +#endif + +#ifdef CONFIG_BL_SWITCHER +/* + * gic_migrate_target - migrate IRQs to another CPU interface + * + * @new_cpu_id: the CPU target ID to migrate IRQs to + * + * Migrate all peripheral interrupts with a target matching the current CPU + * to the interface corresponding to @new_cpu_id. The CPU interface mapping + * is also updated. Targets to other CPU interfaces are unchanged. + * This must be called with IRQs locally disabled. + */ +void gic_migrate_target(unsigned int new_cpu_id) +{ + unsigned int cur_cpu_id, gic_irqs, gic_nr = 0; + void __iomem *dist_base; + int i, ror_val, cpu = smp_processor_id(); + u32 val, cur_target_mask, active_mask; + + if (gic_nr >= MAX_GIC_NR) + BUG(); + + dist_base = gic_data_dist_base(&gic_data[gic_nr]); + if (!dist_base) + return; + gic_irqs = gic_data[gic_nr].gic_irqs; + + cur_cpu_id = __ffs(gic_cpu_map[cpu]); + cur_target_mask = 0x01010101 << cur_cpu_id; + ror_val = (cur_cpu_id - new_cpu_id) & 31; + + raw_spin_lock(&irq_controller_lock); + + /* Update the target interface for this logical CPU */ + gic_cpu_map[cpu] = 1 << new_cpu_id; + + /* + * Find all the peripheral interrupts targetting the current + * CPU interface and migrate them to the new CPU interface. + * We skip DIST_TARGET 0 to 7 as they are read-only. + */ + for (i = 8; i < DIV_ROUND_UP(gic_irqs, 4); i++) { + val = readl_relaxed(dist_base + GIC_DIST_TARGET + i * 4); + active_mask = val & cur_target_mask; + if (active_mask) { + val &= ~active_mask; + val |= ror32(active_mask, ror_val); + writel_relaxed(val, dist_base + GIC_DIST_TARGET + i*4); + } + } + + raw_spin_unlock(&irq_controller_lock); + + /* + * Now let's migrate and clear any potential SGIs that might be + * pending for us (cur_cpu_id). Since GIC_DIST_SGI_PENDING_SET + * is a banked register, we can only forward the SGI using + * GIC_DIST_SOFTINT. The original SGI source is lost but Linux + * doesn't use that information anyway. + * + * For the same reason we do not adjust SGI source information + * for previously sent SGIs by us to other CPUs either. + */ + for (i = 0; i < 16; i += 4) { + int j; + val = readl_relaxed(dist_base + GIC_DIST_SGI_PENDING_SET + i); + if (!val) + continue; + writel_relaxed(val, dist_base + GIC_DIST_SGI_PENDING_CLEAR + i); + for (j = i; j < i + 4; j++) { + if (val & 0xff) + writel_relaxed((1 << (new_cpu_id + 16)) | j, + dist_base + GIC_DIST_SOFTINT); + val >>= 8; + } + } } #endif diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index 3e203eb..40bfcac 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -31,6 +31,8 @@ #define GIC_DIST_TARGET 0x800 #define GIC_DIST_CONFIG 0xc00 #define GIC_DIST_SOFTINT 0xf00 +#define GIC_DIST_SGI_PENDING_CLEAR 0xf10 +#define GIC_DIST_SGI_PENDING_SET 0xf20 #define GICH_HCR 0x0 #define GICH_VTR 0x4 @@ -73,6 +75,8 @@ static inline void gic_init(unsigned int nr, int start, gic_init_bases(nr, start, dist, cpu, 0, NULL); } +void gic_migrate_target(unsigned int new_cpu_id); + #endif /* __ASSEMBLY */ #endif -- cgit v0.10.2 From 1c33be57496d927ce05b2513ff0c108f69db4345 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 12 Apr 2012 02:56:10 -0400 Subject: ARM: b.L: core switcher code This is the core code implementing big.LITTLE switcher functionality. Rationale for this code is available here: http://lwn.net/Articles/481055/ The main entry point for a switch request is: void bL_switch_request(unsigned int cpu, unsigned int new_cluster_id) If the calling CPU is not the wanted one, this wrapper takes care of sending the request to the appropriate CPU with schedule_work_on(). At the moment the core switch operation is handled by bL_switch_to() which must be called on the CPU for which a switch is requested. What this code does: * Return early if the current cluster is the wanted one. * Close the gate in the kernel entry vector for both the inbound and outbound CPUs. * Wake up the inbound CPU so it can perform its reset sequence in parallel up to the kernel entry vector gate. * Migrate all interrupts in the GIC targeting the outbound CPU interface to the inbound CPU interface, including SGIs. This is performed by gic_migrate_target() in drivers/irqchip/irq-gic.c. * Call cpu_pm_enter() which takes care of flushing the VFP state to RAM and save the CPU interface config from the GIC to RAM. * Modify the cpu_logical_map to refer to the inbound physical CPU. * Call cpu_suspend() which saves the CPU state (general purpose registers, page table address) onto the stack and store the resulting stack pointer in an array indexed by the updated cpu_logical_map, then call the provided shutdown function. This happens in arch/arm/kernel/sleep.S. At this point, the provided shutdown function executed by the outbound CPU ungates the inbound CPU. Therefore the inbound CPU: * Picks up the saved stack pointer in the array indexed by its MPIDR in arch/arm/kernel/sleep.S. * The MMU and caches are re-enabled using the saved state on the provided stack, just like if this was a resume operation from a suspended state. * Then cpu_suspend() returns, although this is on the inbound CPU rather than the outbound CPU which called it initially. * The function cpu_pm_exit() is called which effect is to restore the CPU interface state in the GIC using the state previously saved by the outbound CPU. * Exit of bL_switch_to() to resume normal kernel execution on the new CPU. However, the outbound CPU is potentially still running in parallel while the inbound CPU is resuming normal kernel execution, hence we need per CPU stack isolation to execute bL_do_switch(). After the outbound CPU has ungated the inbound CPU, it calls mcpm_cpu_power_down() to: * Clean its L1 cache. * If it is the last CPU still alive in its cluster (last man standing), it also cleans its L2 cache and disables cache snooping from the other cluster. * Power down the CPU (or whole cluster). Code called from bL_do_switch() might end up referencing 'current' for some reasons. However, 'current' is derived from the stack pointer. With any arbitrary stack, the returned value for 'current' and any dereferenced values through it are just random garbage which may lead to segmentation faults. The active page table during the execution of bL_do_switch() is also a problem. There is no guarantee that the inbound CPU won't destroy the corresponding task which would free the attached page table while the outbound CPU is still running and relying on it. To solve both issues, we borrow some of the task space belonging to the init/idle task which, by its nature, is lightly used and therefore is unlikely to clash with our usage. The init task is also never going away. Right now the logical CPU number is assumed to be equivalent to the physical CPU number within each cluster. The kernel should also be booted with only one cluster active. These limitations will be lifted eventually. Signed-off-by: Nicolas Pitre diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 37c0f4e..ade7001 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1538,6 +1538,24 @@ config MCPM for (multi-)cluster based systems, such as big.LITTLE based systems. +config BIG_LITTLE + bool "big.LITTLE support (Experimental)" + depends on CPU_V7 && SMP + select MCPM + help + This option enables support selections for the big.LITTLE + system architecture. + +config BL_SWITCHER + bool "big.LITTLE switcher support" + depends on BIG_LITTLE && MCPM && HOTPLUG_CPU + select CPU_PM + select ARM_CPU_SUSPEND + help + The big.LITTLE "switcher" provides the core functionality to + transparently handle transition between a cluster of A15's + and a cluster of A7's in a big.LITTLE system. + choice prompt "Memory split" default VMSPLIT_3G diff --git a/arch/arm/common/Makefile b/arch/arm/common/Makefile index 8c60f47..2586240 100644 --- a/arch/arm/common/Makefile +++ b/arch/arm/common/Makefile @@ -17,3 +17,4 @@ obj-$(CONFIG_MCPM) += mcpm_head.o mcpm_entry.o mcpm_platsmp.o vlock.o AFLAGS_mcpm_head.o := -march=armv7-a AFLAGS_vlock.o := -march=armv7-a obj-$(CONFIG_TI_PRIV_EDMA) += edma.o +obj-$(CONFIG_BL_SWITCHER) += bL_switcher.o diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c new file mode 100644 index 0000000..f8f2e96 --- /dev/null +++ b/arch/arm/common/bL_switcher.c @@ -0,0 +1,241 @@ +/* + * arch/arm/common/bL_switcher.c -- big.LITTLE cluster switcher core driver + * + * Created by: Nicolas Pitre, March 2012 + * Copyright: (C) 2012-2013 Linaro Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + + +/* + * Use our own MPIDR accessors as the generic ones in asm/cputype.h have + * __attribute_const__ and we don't want the compiler to assume any + * constness here as the value _does_ change along some code paths. + */ + +static int read_mpidr(void) +{ + unsigned int id; + asm volatile ("mrc p15, 0, %0, c0, c0, 5" : "=r" (id)); + return id & MPIDR_HWID_BITMASK; +} + +/* + * bL switcher core code. + */ + +static void bL_do_switch(void *_unused) +{ + unsigned mpidr, cpuid, clusterid, ob_cluster, ib_cluster; + + /* + * We now have a piece of stack borrowed from the init task's. + * Let's also switch to init_mm right away to match it. + */ + cpu_switch_mm(init_mm.pgd, &init_mm); + + pr_debug("%s\n", __func__); + + mpidr = read_mpidr(); + cpuid = MPIDR_AFFINITY_LEVEL(mpidr, 0); + clusterid = MPIDR_AFFINITY_LEVEL(mpidr, 1); + ob_cluster = clusterid; + ib_cluster = clusterid ^ 1; + + /* + * Our state has been saved at this point. Let's release our + * inbound CPU. + */ + mcpm_set_entry_vector(cpuid, ib_cluster, cpu_resume); + sev(); + + /* + * From this point, we must assume that our counterpart CPU might + * have taken over in its parallel world already, as if execution + * just returned from cpu_suspend(). It is therefore important to + * be very careful not to make any change the other guy is not + * expecting. This is why we need stack isolation. + * + * Fancy under cover tasks could be performed here. For now + * we have none. + */ + + /* Let's put ourself down. */ + mcpm_cpu_power_down(); + + /* should never get here */ + BUG(); +} + +/* + * Stack isolation. To ensure 'current' remains valid, we just borrow + * a slice of the init/idle task which should be fairly lightly used. + * The borrowed area starts just above the thread_info structure located + * at the very bottom of the stack, aligned to a cache line. + */ +#define STACK_SIZE 256 +extern void call_with_stack(void (*fn)(void *), void *arg, void *sp); +static int bL_switchpoint(unsigned long _arg) +{ + unsigned int mpidr = read_mpidr(); + unsigned int cpuid = MPIDR_AFFINITY_LEVEL(mpidr, 0); + unsigned int clusterid = MPIDR_AFFINITY_LEVEL(mpidr, 1); + unsigned int cpu_index = cpuid + clusterid * MAX_CPUS_PER_CLUSTER; + void *stack = &init_thread_info + 1; + stack = PTR_ALIGN(stack, L1_CACHE_BYTES); + stack += cpu_index * STACK_SIZE + STACK_SIZE; + call_with_stack(bL_do_switch, (void *)_arg, stack); + BUG(); +} + +/* + * Generic switcher interface + */ + +/* + * bL_switch_to - Switch to a specific cluster for the current CPU + * @new_cluster_id: the ID of the cluster to switch to. + * + * This function must be called on the CPU to be switched. + * Returns 0 on success, else a negative status code. + */ +static int bL_switch_to(unsigned int new_cluster_id) +{ + unsigned int mpidr, cpuid, clusterid, ob_cluster, ib_cluster, this_cpu; + int ret; + + mpidr = read_mpidr(); + cpuid = MPIDR_AFFINITY_LEVEL(mpidr, 0); + clusterid = MPIDR_AFFINITY_LEVEL(mpidr, 1); + ob_cluster = clusterid; + ib_cluster = clusterid ^ 1; + + if (new_cluster_id == clusterid) + return 0; + + pr_debug("before switch: CPU %d in cluster %d\n", cpuid, clusterid); + + /* Close the gate for our entry vectors */ + mcpm_set_entry_vector(cpuid, ob_cluster, NULL); + mcpm_set_entry_vector(cpuid, ib_cluster, NULL); + + /* + * Let's wake up the inbound CPU now in case it requires some delay + * to come online, but leave it gated in our entry vector code. + */ + ret = mcpm_cpu_power_up(cpuid, ib_cluster); + if (ret) { + pr_err("%s: mcpm_cpu_power_up() returned %d\n", __func__, ret); + return ret; + } + + /* + * From this point we are entering the switch critical zone + * and can't take any interrupts anymore. + */ + local_irq_disable(); + local_fiq_disable(); + + this_cpu = smp_processor_id(); + + /* redirect GIC's SGIs to our counterpart */ + gic_migrate_target(cpuid + ib_cluster*4); + + /* + * Raise a SGI on the inbound CPU to make sure it doesn't stall + * in a possible WFI, such as in mcpm_power_down(). + */ + arch_send_wakeup_ipi_mask(cpumask_of(this_cpu)); + + ret = cpu_pm_enter(); + + /* we can not tolerate errors at this point */ + if (ret) + panic("%s: cpu_pm_enter() returned %d\n", __func__, ret); + + /* Flip the cluster in the CPU logical map for this CPU. */ + cpu_logical_map(this_cpu) ^= (1 << 8); + + /* Let's do the actual CPU switch. */ + ret = cpu_suspend(0, bL_switchpoint); + if (ret > 0) + panic("%s: cpu_suspend() returned %d\n", __func__, ret); + + /* We are executing on the inbound CPU at this point */ + mpidr = read_mpidr(); + cpuid = MPIDR_AFFINITY_LEVEL(mpidr, 0); + clusterid = MPIDR_AFFINITY_LEVEL(mpidr, 1); + pr_debug("after switch: CPU %d in cluster %d\n", cpuid, clusterid); + BUG_ON(clusterid != ib_cluster); + + mcpm_cpu_powered_up(); + + ret = cpu_pm_exit(); + + local_fiq_enable(); + local_irq_enable(); + + if (ret) + pr_err("%s exiting with error %d\n", __func__, ret); + return ret; +} + +struct switch_args { + unsigned int cluster; + struct work_struct work; +}; + +static void __bL_switch_to(struct work_struct *work) +{ + struct switch_args *args = container_of(work, struct switch_args, work); + bL_switch_to(args->cluster); +} + +/* + * bL_switch_request - Switch to a specific cluster for the given CPU + * + * @cpu: the CPU to switch + * @new_cluster_id: the ID of the cluster to switch to. + * + * This function causes a cluster switch on the given CPU. If the given + * CPU is the same as the calling CPU then the switch happens right away. + * Otherwise the request is put on a work queue to be scheduled on the + * remote CPU. + */ +void bL_switch_request(unsigned int cpu, unsigned int new_cluster_id) +{ + unsigned int this_cpu = get_cpu(); + struct switch_args args; + + if (cpu == this_cpu) { + bL_switch_to(new_cluster_id); + put_cpu(); + return; + } + put_cpu(); + + args.cluster = new_cluster_id; + INIT_WORK_ONSTACK(&args.work, __bL_switch_to); + schedule_work_on(cpu, &args.work); + flush_work(&args.work); +} +EXPORT_SYMBOL_GPL(bL_switch_request); diff --git a/arch/arm/include/asm/bL_switcher.h b/arch/arm/include/asm/bL_switcher.h new file mode 100644 index 0000000..72efe3f --- /dev/null +++ b/arch/arm/include/asm/bL_switcher.h @@ -0,0 +1,17 @@ +/* + * arch/arm/include/asm/bL_switcher.h + * + * Created by: Nicolas Pitre, April 2012 + * Copyright: (C) 2012-2013 Linaro Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef ASM_BL_SWITCHER_H +#define ASM_BL_SWITCHER_H + +void bL_switch_request(unsigned int cpu, unsigned int new_cluster_id); + +#endif -- cgit v0.10.2 From 3f09d4799ecc076cccc11ab2333a36ec849d24f5 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Wed, 16 May 2012 15:55:54 +0100 Subject: ARM: bL_switcher: add clockevent save/restore support Per-CPU timers that are shutdown when a CPU is switched over must be disabled upon switching and reprogrammed on the inbound CPU by relying on the clock events management API. save/restore sequence is executed with irqs disabled as mandated by the clock events API. The next_event is an absolute time, hence, when the inbound CPU resumes, if the timer has expired the min delta is forced into the tick device to fire after few cycles. This patch adds switching support for clock events that are per-CPU and have to be migrated when a switch takes place; the cpumask of the clock event device is checked against the cpumask of the current cpu, and if they match, the clockevent device mode is saved and it is put in shutdown mode. Resume code reprogrammes the tick device accordingly. Tested on A15/A7 fast models and architected timers. Signed-off-by: Lorenzo Pieralisi Signed-off-by: Nicolas Pitre diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index f8f2e96..ca04b53 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -15,7 +15,11 @@ #include #include #include +#include #include +#include +#include +#include #include #include #include @@ -121,6 +125,8 @@ static int bL_switchpoint(unsigned long _arg) static int bL_switch_to(unsigned int new_cluster_id) { unsigned int mpidr, cpuid, clusterid, ob_cluster, ib_cluster, this_cpu; + struct tick_device *tdev; + enum clock_event_mode tdev_mode; int ret; mpidr = read_mpidr(); @@ -166,6 +172,14 @@ static int bL_switch_to(unsigned int new_cluster_id) */ arch_send_wakeup_ipi_mask(cpumask_of(this_cpu)); + tdev = tick_get_device(this_cpu); + if (tdev && !cpumask_equal(tdev->evtdev->cpumask, cpumask_of(this_cpu))) + tdev = NULL; + if (tdev) { + tdev_mode = tdev->evtdev->mode; + clockevents_set_mode(tdev->evtdev, CLOCK_EVT_MODE_SHUTDOWN); + } + ret = cpu_pm_enter(); /* we can not tolerate errors at this point */ @@ -191,6 +205,12 @@ static int bL_switch_to(unsigned int new_cluster_id) ret = cpu_pm_exit(); + if (tdev) { + clockevents_set_mode(tdev->evtdev, tdev_mode); + clockevents_program_event(tdev->evtdev, + tdev->evtdev->next_event, 1); + } + local_fiq_enable(); local_irq_enable(); -- cgit v0.10.2 From 71ce1deeff8f9341ae3b21983e9bdde28e8c96fe Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Fri, 26 Oct 2012 02:36:17 -0400 Subject: ARM: bL_switcher: move to dedicated threads rather than workqueues The workqueues are problematic as they may be contended. They can't be scheduled with top priority either. Also the optimization in bL_switch_request() to skip the workqueue entirely when the target CPU and the calling CPU were the same didn't allow for bL_switch_request() to be called from atomic context, as might be the case for some cpufreq drivers. Let's move to dedicated kthreads instead. Signed-off-by: Nicolas Pitre diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index ca04b53..407c4cc 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -15,8 +15,10 @@ #include #include #include +#include #include -#include +#include +#include #include #include #include @@ -219,15 +221,48 @@ static int bL_switch_to(unsigned int new_cluster_id) return ret; } -struct switch_args { - unsigned int cluster; - struct work_struct work; +struct bL_thread { + struct task_struct *task; + wait_queue_head_t wq; + int wanted_cluster; }; -static void __bL_switch_to(struct work_struct *work) +static struct bL_thread bL_threads[NR_CPUS]; + +static int bL_switcher_thread(void *arg) +{ + struct bL_thread *t = arg; + struct sched_param param = { .sched_priority = 1 }; + int cluster; + + sched_setscheduler_nocheck(current, SCHED_FIFO, ¶m); + + do { + if (signal_pending(current)) + flush_signals(current); + wait_event_interruptible(t->wq, + t->wanted_cluster != -1 || + kthread_should_stop()); + cluster = xchg(&t->wanted_cluster, -1); + if (cluster != -1) + bL_switch_to(cluster); + } while (!kthread_should_stop()); + + return 0; +} + +static struct task_struct * __init bL_switcher_thread_create(int cpu, void *arg) { - struct switch_args *args = container_of(work, struct switch_args, work); - bL_switch_to(args->cluster); + struct task_struct *task; + + task = kthread_create_on_node(bL_switcher_thread, arg, + cpu_to_node(cpu), "kswitcher_%d", cpu); + if (!IS_ERR(task)) { + kthread_bind(task, cpu); + wake_up_process(task); + } else + pr_err("%s failed for CPU %d\n", __func__, cpu); + return task; } /* @@ -236,26 +271,46 @@ static void __bL_switch_to(struct work_struct *work) * @cpu: the CPU to switch * @new_cluster_id: the ID of the cluster to switch to. * - * This function causes a cluster switch on the given CPU. If the given - * CPU is the same as the calling CPU then the switch happens right away. - * Otherwise the request is put on a work queue to be scheduled on the - * remote CPU. + * This function causes a cluster switch on the given CPU by waking up + * the appropriate switcher thread. This function may or may not return + * before the switch has occurred. */ -void bL_switch_request(unsigned int cpu, unsigned int new_cluster_id) +int bL_switch_request(unsigned int cpu, unsigned int new_cluster_id) { - unsigned int this_cpu = get_cpu(); - struct switch_args args; + struct bL_thread *t; - if (cpu == this_cpu) { - bL_switch_to(new_cluster_id); - put_cpu(); - return; + if (cpu >= ARRAY_SIZE(bL_threads)) { + pr_err("%s: cpu %d out of bounds\n", __func__, cpu); + return -EINVAL; } - put_cpu(); - args.cluster = new_cluster_id; - INIT_WORK_ONSTACK(&args.work, __bL_switch_to); - schedule_work_on(cpu, &args.work); - flush_work(&args.work); + t = &bL_threads[cpu]; + if (IS_ERR(t->task)) + return PTR_ERR(t->task); + if (!t->task) + return -ESRCH; + + t->wanted_cluster = new_cluster_id; + wake_up(&t->wq); + return 0; } EXPORT_SYMBOL_GPL(bL_switch_request); + +static int __init bL_switcher_init(void) +{ + int cpu; + + pr_info("big.LITTLE switcher initializing\n"); + + for_each_online_cpu(cpu) { + struct bL_thread *t = &bL_threads[cpu]; + init_waitqueue_head(&t->wq); + t->wanted_cluster = -1; + t->task = bL_switcher_thread_create(cpu, t); + } + + pr_info("big.LITTLE switcher initialized\n"); + return 0; +} + +late_initcall(bL_switcher_init); diff --git a/arch/arm/include/asm/bL_switcher.h b/arch/arm/include/asm/bL_switcher.h index 72efe3f..e0c0bba 100644 --- a/arch/arm/include/asm/bL_switcher.h +++ b/arch/arm/include/asm/bL_switcher.h @@ -12,6 +12,6 @@ #ifndef ASM_BL_SWITCHER_H #define ASM_BL_SWITCHER_H -void bL_switch_request(unsigned int cpu, unsigned int new_cluster_id); +int bL_switch_request(unsigned int cpu, unsigned int new_cluster_id); #endif -- cgit v0.10.2 From c052de2693498bee6ff2e1b5bcd32309c4f8780b Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 27 Nov 2012 15:55:33 -0500 Subject: ARM: bL_switcher: simplify stack isolation We now have a dedicated thread for each logical CPU. That's plenty of stack space for our needs. Signed-off-by: Nicolas Pitre diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index 407c4cc..4caca71 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -53,12 +53,6 @@ static void bL_do_switch(void *_unused) { unsigned mpidr, cpuid, clusterid, ob_cluster, ib_cluster; - /* - * We now have a piece of stack borrowed from the init task's. - * Let's also switch to init_mm right away to match it. - */ - cpu_switch_mm(init_mm.pgd, &init_mm); - pr_debug("%s\n", __func__); mpidr = read_mpidr(); @@ -93,22 +87,21 @@ static void bL_do_switch(void *_unused) } /* - * Stack isolation. To ensure 'current' remains valid, we just borrow - * a slice of the init/idle task which should be fairly lightly used. - * The borrowed area starts just above the thread_info structure located - * at the very bottom of the stack, aligned to a cache line. + * Stack isolation. To ensure 'current' remains valid, we just use another + * piece of our thread's stack space which should be fairly lightly used. + * The selected area starts just above the thread_info structure located + * at the very bottom of the stack, aligned to a cache line, and indexed + * with the cluster number. */ -#define STACK_SIZE 256 +#define STACK_SIZE 512 extern void call_with_stack(void (*fn)(void *), void *arg, void *sp); static int bL_switchpoint(unsigned long _arg) { unsigned int mpidr = read_mpidr(); - unsigned int cpuid = MPIDR_AFFINITY_LEVEL(mpidr, 0); unsigned int clusterid = MPIDR_AFFINITY_LEVEL(mpidr, 1); - unsigned int cpu_index = cpuid + clusterid * MAX_CPUS_PER_CLUSTER; - void *stack = &init_thread_info + 1; + void *stack = current_thread_info() + 1; stack = PTR_ALIGN(stack, L1_CACHE_BYTES); - stack += cpu_index * STACK_SIZE + STACK_SIZE; + stack += clusterid * STACK_SIZE + STACK_SIZE; call_with_stack(bL_do_switch, (void *)_arg, stack); BUG(); } -- cgit v0.10.2 From 9797a0e95ead7bfe52260c369ee9fe6ba445afaf Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 21 Nov 2012 11:53:27 -0500 Subject: ARM: bL_switcher: hot-unplug half of the available CPUs In a regular kernel configuration, all the CPUs are initially available. But the switcher execution model uses half of them at any time. Instead of hacking the DTB to remove half of the CPUs, let's remove them at run time and make sure we still have a working switcher configuration. This way, the same DTB can be used whether or not the switcher is used. Signed-off-by: Nicolas Pitre diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index 4caca71..50e95d89 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -289,18 +289,94 @@ int bL_switch_request(unsigned int cpu, unsigned int new_cluster_id) } EXPORT_SYMBOL_GPL(bL_switch_request); +/* + * Activation and configuration code. + */ + +static cpumask_t bL_switcher_removed_logical_cpus; + +static void __init bL_switcher_restore_cpus(void) +{ + int i; + + for_each_cpu(i, &bL_switcher_removed_logical_cpus) + cpu_up(i); +} + +static int __init bL_switcher_halve_cpus(void) +{ + int cpu, cluster, i, ret; + cpumask_t cluster_mask[2], common_mask; + + cpumask_clear(&bL_switcher_removed_logical_cpus); + cpumask_clear(&cluster_mask[0]); + cpumask_clear(&cluster_mask[1]); + + for_each_online_cpu(i) { + cpu = cpu_logical_map(i) & 0xff; + cluster = (cpu_logical_map(i) >> 8) & 0xff; + if (cluster >= 2) { + pr_err("%s: only dual cluster systems are supported\n", __func__); + return -EINVAL; + } + cpumask_set_cpu(cpu, &cluster_mask[cluster]); + } + + if (!cpumask_and(&common_mask, &cluster_mask[0], &cluster_mask[1])) { + pr_err("%s: no common set of CPUs\n", __func__); + return -EINVAL; + } + + for_each_online_cpu(i) { + cpu = cpu_logical_map(i) & 0xff; + cluster = (cpu_logical_map(i) >> 8) & 0xff; + + if (cpumask_test_cpu(cpu, &common_mask)) { + /* + * We keep only those logical CPUs which number + * is equal to their physical CPU number. This is + * not perfect but good enough for now. + */ + if (cpu == i) + continue; + } + + ret = cpu_down(i); + if (ret) { + bL_switcher_restore_cpus(); + return ret; + } + cpumask_set_cpu(i, &bL_switcher_removed_logical_cpus); + } + + return 0; +} + static int __init bL_switcher_init(void) { - int cpu; + int cpu, ret; pr_info("big.LITTLE switcher initializing\n"); + if (MAX_NR_CLUSTERS != 2) { + pr_err("%s: only dual cluster systems are supported\n", __func__); + return -EINVAL; + } + + cpu_hotplug_driver_lock(); + ret = bL_switcher_halve_cpus(); + if (ret) { + cpu_hotplug_driver_unlock(); + return ret; + } + for_each_online_cpu(cpu) { struct bL_thread *t = &bL_threads[cpu]; init_waitqueue_head(&t->wq); t->wanted_cluster = -1; t->task = bL_switcher_thread_create(cpu, t); } + cpu_hotplug_driver_unlock(); pr_info("big.LITTLE switcher initialized\n"); return 0; -- cgit v0.10.2 From ed96762e3241f57aa812977cf1920d3ee0363f4d Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 5 Jul 2012 21:33:26 -0400 Subject: ARM: bL_switcher: do not hardcode GIC IDs in the code Currently, GIC IDs are hardcoded making the code dependent on the 4+4 b.L configuration. Let's allow for GIC IDs to be discovered upon switcher initialization to support other b.L configurations such as the 1+1 one, or 2+3 as on the VExpress TC2. Signed-off-by: Nicolas Pitre diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index 50e95d89..1c2e5bc 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -110,6 +110,8 @@ static int bL_switchpoint(unsigned long _arg) * Generic switcher interface */ +static unsigned int bL_gic_id[MAX_CPUS_PER_CLUSTER][MAX_NR_CLUSTERS]; + /* * bL_switch_to - Switch to a specific cluster for the current CPU * @new_cluster_id: the ID of the cluster to switch to. @@ -159,7 +161,7 @@ static int bL_switch_to(unsigned int new_cluster_id) this_cpu = smp_processor_id(); /* redirect GIC's SGIs to our counterpart */ - gic_migrate_target(cpuid + ib_cluster*4); + gic_migrate_target(bL_gic_id[cpuid][ib_cluster]); /* * Raise a SGI on the inbound CPU to make sure it doesn't stall @@ -332,6 +334,16 @@ static int __init bL_switcher_halve_cpus(void) cluster = (cpu_logical_map(i) >> 8) & 0xff; if (cpumask_test_cpu(cpu, &common_mask)) { + /* Let's take note of the GIC ID for this CPU */ + int gic_id = gic_get_cpu_id(i); + if (gic_id < 0) { + pr_err("%s: bad GIC ID for CPU %d\n", __func__, i); + return -EINVAL; + } + bL_gic_id[cpu][cluster] = gic_id; + pr_info("GIC ID for CPU %u cluster %u is %u\n", + cpu, cluster, gic_id); + /* * We keep only those logical CPUs which number * is equal to their physical CPU number. This is diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 268874a..3862cb5 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -668,6 +668,27 @@ void gic_raise_softirq(const struct cpumask *mask, unsigned int irq) #ifdef CONFIG_BL_SWITCHER /* + * gic_get_cpu_id - get the CPU interface ID for the specified CPU + * + * @cpu: the logical CPU number to get the GIC ID for. + * + * Return the CPU interface ID for the given logical CPU number, + * or -1 if the CPU number is too large or the interface ID is + * unknown (more than one bit set). + */ +int gic_get_cpu_id(unsigned int cpu) +{ + unsigned int cpu_bit; + + if (cpu >= NR_GIC_CPU_IF) + return -1; + cpu_bit = gic_cpu_map[cpu]; + if (cpu_bit & (cpu_bit - 1)) + return -1; + return __ffs(cpu_bit); +} + +/* * gic_migrate_target - migrate IRQs to another CPU interface * * @new_cpu_id: the CPU target ID to migrate IRQs to diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index 40bfcac..2d7d47e 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -75,6 +75,7 @@ static inline void gic_init(unsigned int nr, int start, gic_init_bases(nr, start, dist, cpu, 0, NULL); } +int gic_get_cpu_id(unsigned int cpu); void gic_migrate_target(unsigned int new_cpu_id); #endif /* __ASSEMBLY */ -- cgit v0.10.2 From 6b7437aed1568076cefa4d42747b1515dcb848db Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 22 Nov 2012 00:05:07 -0500 Subject: ARM: bL_switcher: ability to enable and disable the switcher via sysfs The /sys/kernel/bL_switcher/enable file allows to enable or disable the switcher by writing 1 or 0 to it respectively. It is still enabled by default on boot. Signed-off-by: Nicolas Pitre diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index 1c2e5bc..395f60f 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -220,6 +221,7 @@ struct bL_thread { struct task_struct *task; wait_queue_head_t wq; int wanted_cluster; + struct completion started; }; static struct bL_thread bL_threads[NR_CPUS]; @@ -231,6 +233,7 @@ static int bL_switcher_thread(void *arg) int cluster; sched_setscheduler_nocheck(current, SCHED_FIFO, ¶m); + complete(&t->started); do { if (signal_pending(current)) @@ -246,7 +249,7 @@ static int bL_switcher_thread(void *arg) return 0; } -static struct task_struct * __init bL_switcher_thread_create(int cpu, void *arg) +static struct task_struct *bL_switcher_thread_create(int cpu, void *arg) { struct task_struct *task; @@ -295,9 +298,11 @@ EXPORT_SYMBOL_GPL(bL_switch_request); * Activation and configuration code. */ +static unsigned int bL_switcher_active; +static unsigned int bL_switcher_cpu_original_cluster[MAX_CPUS_PER_CLUSTER]; static cpumask_t bL_switcher_removed_logical_cpus; -static void __init bL_switcher_restore_cpus(void) +static void bL_switcher_restore_cpus(void) { int i; @@ -305,7 +310,7 @@ static void __init bL_switcher_restore_cpus(void) cpu_up(i); } -static int __init bL_switcher_halve_cpus(void) +static int bL_switcher_halve_cpus(void) { int cpu, cluster, i, ret; cpumask_t cluster_mask[2], common_mask; @@ -349,8 +354,10 @@ static int __init bL_switcher_halve_cpus(void) * is equal to their physical CPU number. This is * not perfect but good enough for now. */ - if (cpu == i) + if (cpu == i) { + bL_switcher_cpu_original_cluster[cpu] = cluster; continue; + } } ret = cpu_down(i); @@ -364,18 +371,18 @@ static int __init bL_switcher_halve_cpus(void) return 0; } -static int __init bL_switcher_init(void) +static int bL_switcher_enable(void) { int cpu, ret; - pr_info("big.LITTLE switcher initializing\n"); - - if (MAX_NR_CLUSTERS != 2) { - pr_err("%s: only dual cluster systems are supported\n", __func__); - return -EINVAL; + cpu_hotplug_driver_lock(); + if (bL_switcher_active) { + cpu_hotplug_driver_unlock(); + return 0; } - cpu_hotplug_driver_lock(); + pr_info("big.LITTLE switcher initializing\n"); + ret = bL_switcher_halve_cpus(); if (ret) { cpu_hotplug_driver_unlock(); @@ -385,13 +392,155 @@ static int __init bL_switcher_init(void) for_each_online_cpu(cpu) { struct bL_thread *t = &bL_threads[cpu]; init_waitqueue_head(&t->wq); + init_completion(&t->started); t->wanted_cluster = -1; t->task = bL_switcher_thread_create(cpu, t); } + + bL_switcher_active = 1; cpu_hotplug_driver_unlock(); pr_info("big.LITTLE switcher initialized\n"); return 0; } +#ifdef CONFIG_SYSFS + +static void bL_switcher_disable(void) +{ + unsigned int cpu, cluster, i; + struct bL_thread *t; + struct task_struct *task; + + cpu_hotplug_driver_lock(); + if (!bL_switcher_active) { + cpu_hotplug_driver_unlock(); + return; + } + bL_switcher_active = 0; + + /* + * To deactivate the switcher, we must shut down the switcher + * threads to prevent any other requests from being accepted. + * Then, if the final cluster for given logical CPU is not the + * same as the original one, we'll recreate a switcher thread + * just for the purpose of switching the CPU back without any + * possibility for interference from external requests. + */ + for_each_online_cpu(cpu) { + BUG_ON(cpu != (cpu_logical_map(cpu) & 0xff)); + t = &bL_threads[cpu]; + task = t->task; + t->task = NULL; + if (!task || IS_ERR(task)) + continue; + kthread_stop(task); + /* no more switch may happen on this CPU at this point */ + cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(cpu), 1); + if (cluster == bL_switcher_cpu_original_cluster[cpu]) + continue; + init_completion(&t->started); + t->wanted_cluster = bL_switcher_cpu_original_cluster[cpu]; + task = bL_switcher_thread_create(cpu, t); + if (!IS_ERR(task)) { + wait_for_completion(&t->started); + kthread_stop(task); + cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(cpu), 1); + if (cluster == bL_switcher_cpu_original_cluster[cpu]) + continue; + } + /* If execution gets here, we're in trouble. */ + pr_crit("%s: unable to restore original cluster for CPU %d\n", + __func__, cpu); + for_each_cpu(i, &bL_switcher_removed_logical_cpus) { + if ((cpu_logical_map(i) & 0xff) != cpu) + continue; + pr_crit("%s: CPU %d can't be restored\n", + __func__, i); + cpumask_clear_cpu(i, &bL_switcher_removed_logical_cpus); + break; + } + } + + bL_switcher_restore_cpus(); + cpu_hotplug_driver_unlock(); +} + +static ssize_t bL_switcher_active_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", bL_switcher_active); +} + +static ssize_t bL_switcher_active_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int ret; + + switch (buf[0]) { + case '0': + bL_switcher_disable(); + ret = 0; + break; + case '1': + ret = bL_switcher_enable(); + break; + default: + ret = -EINVAL; + } + + return (ret >= 0) ? count : ret; +} + +static struct kobj_attribute bL_switcher_active_attr = + __ATTR(active, 0644, bL_switcher_active_show, bL_switcher_active_store); + +static struct attribute *bL_switcher_attrs[] = { + &bL_switcher_active_attr.attr, + NULL, +}; + +static struct attribute_group bL_switcher_attr_group = { + .attrs = bL_switcher_attrs, +}; + +static struct kobject *bL_switcher_kobj; + +static int __init bL_switcher_sysfs_init(void) +{ + int ret; + + bL_switcher_kobj = kobject_create_and_add("bL_switcher", kernel_kobj); + if (!bL_switcher_kobj) + return -ENOMEM; + ret = sysfs_create_group(bL_switcher_kobj, &bL_switcher_attr_group); + if (ret) + kobject_put(bL_switcher_kobj); + return ret; +} + +#endif /* CONFIG_SYSFS */ + +static int __init bL_switcher_init(void) +{ + int ret; + + if (MAX_NR_CLUSTERS != 2) { + pr_err("%s: only dual cluster systems are supported\n", __func__); + return -EINVAL; + } + + ret = bL_switcher_enable(); + if (ret) + return ret; + +#ifdef CONFIG_SYSFS + ret = bL_switcher_sysfs_init(); + if (ret) + pr_err("%s: unable to create sysfs entry\n", __func__); +#endif + + return 0; +} + late_initcall(bL_switcher_init); -- cgit v0.10.2 From c4821c0575a3b1bf26f100230dc2938297d7043b Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 22 Nov 2012 13:33:35 -0500 Subject: ARM: bL_switcher: add kernel cmdline param to disable the switcher on boot By adding no_bL_switcher to the kernel cmdline string, the switcher won't be activated automatically at boot time. It is still possible to activate it later with: echo 1 > /sys/kernel/bL_switcher/active Signed-off-by: Nicolas Pitre diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index 395f60f..cec825e 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -521,6 +522,9 @@ static int __init bL_switcher_sysfs_init(void) #endif /* CONFIG_SYSFS */ +static bool no_bL_switcher; +core_param(no_bL_switcher, no_bL_switcher, bool, 0644); + static int __init bL_switcher_init(void) { int ret; @@ -530,9 +534,11 @@ static int __init bL_switcher_init(void) return -EINVAL; } - ret = bL_switcher_enable(); - if (ret) - return ret; + if (!no_bL_switcher) { + ret = bL_switcher_enable(); + if (ret) + return ret; + } #ifdef CONFIG_SYSFS ret = bL_switcher_sysfs_init(); -- cgit v0.10.2 From 38c35d4f2e408c369e3030f0717d35ad443d9223 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 13 Jun 2013 23:42:46 -0400 Subject: ARM: bL_switcher: remove assumptions between logical and physical CPUs Up to now, the logical CPU was somehow tied to the physical CPU number within a cluster. This causes problems when forcing the boot CPU to be different from the first enumerated CPU in the device tree creating a discrepancy between logical and physical CPU numbers. Let's make the pairing completely independent from physical CPU numbers. Let's keep only those logical CPUs with same initial CPU cluster to create a uniform scheduler profile without having to modify any of the probed topology and compute capacity data. This has the potential to create a non contiguous CPU numbering space when the switcher is active with potential impact on buggy user space tools. It is however better to fix those tools rather than making the switcher code more intrusive. Signed-off-by: Nicolas Pitre Reviewed-by: Lorenzo Pieralisi diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index cec825e..0e4fb3e 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -53,21 +53,19 @@ static int read_mpidr(void) static void bL_do_switch(void *_unused) { - unsigned mpidr, cpuid, clusterid, ob_cluster, ib_cluster; + unsigned ib_mpidr, ib_cpu, ib_cluster; pr_debug("%s\n", __func__); - mpidr = read_mpidr(); - cpuid = MPIDR_AFFINITY_LEVEL(mpidr, 0); - clusterid = MPIDR_AFFINITY_LEVEL(mpidr, 1); - ob_cluster = clusterid; - ib_cluster = clusterid ^ 1; + ib_mpidr = cpu_logical_map(smp_processor_id()); + ib_cpu = MPIDR_AFFINITY_LEVEL(ib_mpidr, 0); + ib_cluster = MPIDR_AFFINITY_LEVEL(ib_mpidr, 1); /* * Our state has been saved at this point. Let's release our * inbound CPU. */ - mcpm_set_entry_vector(cpuid, ib_cluster, cpu_resume); + mcpm_set_entry_vector(ib_cpu, ib_cluster, cpu_resume); sev(); /* @@ -113,6 +111,7 @@ static int bL_switchpoint(unsigned long _arg) */ static unsigned int bL_gic_id[MAX_CPUS_PER_CLUSTER][MAX_NR_CLUSTERS]; +static int bL_switcher_cpu_pairing[NR_CPUS]; /* * bL_switch_to - Switch to a specific cluster for the current CPU @@ -123,31 +122,38 @@ static unsigned int bL_gic_id[MAX_CPUS_PER_CLUSTER][MAX_NR_CLUSTERS]; */ static int bL_switch_to(unsigned int new_cluster_id) { - unsigned int mpidr, cpuid, clusterid, ob_cluster, ib_cluster, this_cpu; + unsigned int mpidr, this_cpu, that_cpu; + unsigned int ob_mpidr, ob_cpu, ob_cluster, ib_mpidr, ib_cpu, ib_cluster; struct tick_device *tdev; enum clock_event_mode tdev_mode; int ret; - mpidr = read_mpidr(); - cpuid = MPIDR_AFFINITY_LEVEL(mpidr, 0); - clusterid = MPIDR_AFFINITY_LEVEL(mpidr, 1); - ob_cluster = clusterid; - ib_cluster = clusterid ^ 1; + this_cpu = smp_processor_id(); + ob_mpidr = read_mpidr(); + ob_cpu = MPIDR_AFFINITY_LEVEL(ob_mpidr, 0); + ob_cluster = MPIDR_AFFINITY_LEVEL(ob_mpidr, 1); + BUG_ON(cpu_logical_map(this_cpu) != ob_mpidr); - if (new_cluster_id == clusterid) + if (new_cluster_id == ob_cluster) return 0; - pr_debug("before switch: CPU %d in cluster %d\n", cpuid, clusterid); + that_cpu = bL_switcher_cpu_pairing[this_cpu]; + ib_mpidr = cpu_logical_map(that_cpu); + ib_cpu = MPIDR_AFFINITY_LEVEL(ib_mpidr, 0); + ib_cluster = MPIDR_AFFINITY_LEVEL(ib_mpidr, 1); + + pr_debug("before switch: CPU %d MPIDR %#x -> %#x\n", + this_cpu, ob_mpidr, ib_mpidr); /* Close the gate for our entry vectors */ - mcpm_set_entry_vector(cpuid, ob_cluster, NULL); - mcpm_set_entry_vector(cpuid, ib_cluster, NULL); + mcpm_set_entry_vector(ob_cpu, ob_cluster, NULL); + mcpm_set_entry_vector(ib_cpu, ib_cluster, NULL); /* * Let's wake up the inbound CPU now in case it requires some delay * to come online, but leave it gated in our entry vector code. */ - ret = mcpm_cpu_power_up(cpuid, ib_cluster); + ret = mcpm_cpu_power_up(ib_cpu, ib_cluster); if (ret) { pr_err("%s: mcpm_cpu_power_up() returned %d\n", __func__, ret); return ret; @@ -160,10 +166,8 @@ static int bL_switch_to(unsigned int new_cluster_id) local_irq_disable(); local_fiq_disable(); - this_cpu = smp_processor_id(); - /* redirect GIC's SGIs to our counterpart */ - gic_migrate_target(bL_gic_id[cpuid][ib_cluster]); + gic_migrate_target(bL_gic_id[ib_cpu][ib_cluster]); /* * Raise a SGI on the inbound CPU to make sure it doesn't stall @@ -185,8 +189,9 @@ static int bL_switch_to(unsigned int new_cluster_id) if (ret) panic("%s: cpu_pm_enter() returned %d\n", __func__, ret); - /* Flip the cluster in the CPU logical map for this CPU. */ - cpu_logical_map(this_cpu) ^= (1 << 8); + /* Swap the physical CPUs in the logical map for this logical CPU. */ + cpu_logical_map(this_cpu) = ib_mpidr; + cpu_logical_map(that_cpu) = ob_mpidr; /* Let's do the actual CPU switch. */ ret = cpu_suspend(0, bL_switchpoint); @@ -195,10 +200,8 @@ static int bL_switch_to(unsigned int new_cluster_id) /* We are executing on the inbound CPU at this point */ mpidr = read_mpidr(); - cpuid = MPIDR_AFFINITY_LEVEL(mpidr, 0); - clusterid = MPIDR_AFFINITY_LEVEL(mpidr, 1); - pr_debug("after switch: CPU %d in cluster %d\n", cpuid, clusterid); - BUG_ON(clusterid != ib_cluster); + pr_debug("after switch: CPU %d MPIDR %#x\n", this_cpu, mpidr); + BUG_ON(mpidr != ib_mpidr); mcpm_cpu_powered_up(); @@ -300,7 +303,7 @@ EXPORT_SYMBOL_GPL(bL_switch_request); */ static unsigned int bL_switcher_active; -static unsigned int bL_switcher_cpu_original_cluster[MAX_CPUS_PER_CLUSTER]; +static unsigned int bL_switcher_cpu_original_cluster[NR_CPUS]; static cpumask_t bL_switcher_removed_logical_cpus; static void bL_switcher_restore_cpus(void) @@ -313,52 +316,86 @@ static void bL_switcher_restore_cpus(void) static int bL_switcher_halve_cpus(void) { - int cpu, cluster, i, ret; - cpumask_t cluster_mask[2], common_mask; - - cpumask_clear(&bL_switcher_removed_logical_cpus); - cpumask_clear(&cluster_mask[0]); - cpumask_clear(&cluster_mask[1]); + int i, j, cluster_0, gic_id, ret; + unsigned int cpu, cluster, mask; + cpumask_t available_cpus; + /* First pass to validate what we have */ + mask = 0; for_each_online_cpu(i) { - cpu = cpu_logical_map(i) & 0xff; - cluster = (cpu_logical_map(i) >> 8) & 0xff; + cpu = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 0); + cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1); if (cluster >= 2) { pr_err("%s: only dual cluster systems are supported\n", __func__); return -EINVAL; } - cpumask_set_cpu(cpu, &cluster_mask[cluster]); + if (WARN_ON(cpu >= MAX_CPUS_PER_CLUSTER)) + return -EINVAL; + mask |= (1 << cluster); } - - if (!cpumask_and(&common_mask, &cluster_mask[0], &cluster_mask[1])) { - pr_err("%s: no common set of CPUs\n", __func__); + if (mask != 3) { + pr_err("%s: no CPU pairing possible\n", __func__); return -EINVAL; } - for_each_online_cpu(i) { - cpu = cpu_logical_map(i) & 0xff; - cluster = (cpu_logical_map(i) >> 8) & 0xff; - - if (cpumask_test_cpu(cpu, &common_mask)) { - /* Let's take note of the GIC ID for this CPU */ - int gic_id = gic_get_cpu_id(i); - if (gic_id < 0) { - pr_err("%s: bad GIC ID for CPU %d\n", __func__, i); - return -EINVAL; - } - bL_gic_id[cpu][cluster] = gic_id; - pr_info("GIC ID for CPU %u cluster %u is %u\n", - cpu, cluster, gic_id); - + /* + * Now let's do the pairing. We match each CPU with another CPU + * from a different cluster. To get a uniform scheduling behavior + * without fiddling with CPU topology and compute capacity data, + * we'll use logical CPUs initially belonging to the same cluster. + */ + memset(bL_switcher_cpu_pairing, -1, sizeof(bL_switcher_cpu_pairing)); + cpumask_copy(&available_cpus, cpu_online_mask); + cluster_0 = -1; + for_each_cpu(i, &available_cpus) { + int match = -1; + cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1); + if (cluster_0 == -1) + cluster_0 = cluster; + if (cluster != cluster_0) + continue; + cpumask_clear_cpu(i, &available_cpus); + for_each_cpu(j, &available_cpus) { + cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(j), 1); /* - * We keep only those logical CPUs which number - * is equal to their physical CPU number. This is - * not perfect but good enough for now. + * Let's remember the last match to create "odd" + * pairings on purpose in order for other code not + * to assume any relation between physical and + * logical CPU numbers. */ - if (cpu == i) { - bL_switcher_cpu_original_cluster[cpu] = cluster; - continue; - } + if (cluster != cluster_0) + match = j; + } + if (match != -1) { + bL_switcher_cpu_pairing[i] = match; + cpumask_clear_cpu(match, &available_cpus); + pr_info("CPU%d paired with CPU%d\n", i, match); + } + } + + /* + * Now we disable the unwanted CPUs i.e. everything that has no + * pairing information (that includes the pairing counterparts). + */ + cpumask_clear(&bL_switcher_removed_logical_cpus); + for_each_online_cpu(i) { + cpu = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 0); + cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1); + + /* Let's take note of the GIC ID for this CPU */ + gic_id = gic_get_cpu_id(i); + if (gic_id < 0) { + pr_err("%s: bad GIC ID for CPU %d\n", __func__, i); + bL_switcher_restore_cpus(); + return -EINVAL; + } + bL_gic_id[cpu][cluster] = gic_id; + pr_info("GIC ID for CPU %u cluster %u is %u\n", + cpu, cluster, gic_id); + + if (bL_switcher_cpu_pairing[i] != -1) { + bL_switcher_cpu_original_cluster[i] = cluster; + continue; } ret = cpu_down(i); @@ -409,7 +446,7 @@ static int bL_switcher_enable(void) static void bL_switcher_disable(void) { - unsigned int cpu, cluster, i; + unsigned int cpu, cluster; struct bL_thread *t; struct task_struct *task; @@ -429,7 +466,6 @@ static void bL_switcher_disable(void) * possibility for interference from external requests. */ for_each_online_cpu(cpu) { - BUG_ON(cpu != (cpu_logical_map(cpu) & 0xff)); t = &bL_threads[cpu]; task = t->task; t->task = NULL; @@ -453,14 +489,10 @@ static void bL_switcher_disable(void) /* If execution gets here, we're in trouble. */ pr_crit("%s: unable to restore original cluster for CPU %d\n", __func__, cpu); - for_each_cpu(i, &bL_switcher_removed_logical_cpus) { - if ((cpu_logical_map(i) & 0xff) != cpu) - continue; - pr_crit("%s: CPU %d can't be restored\n", - __func__, i); - cpumask_clear_cpu(i, &bL_switcher_removed_logical_cpus); - break; - } + pr_crit("%s: CPU %d can't be restored\n", + __func__, bL_switcher_cpu_pairing[cpu]); + cpumask_clear_cpu(bL_switcher_cpu_pairing[cpu], + &bL_switcher_removed_logical_cpus); } bL_switcher_restore_cpus(); -- cgit v0.10.2 From 272614351423ce8c37ff730efc130e5b73fe64f5 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 26 Nov 2012 22:48:55 -0500 Subject: ARM: bL_switcher: filter CPU hotplug requests when the switcher is active Trying to support both the switcher and CPU hotplug at the same time is tricky due to ambiguous semantics. So let's at least prevent users from messing around with those logical CPUs the switcher has removed and those which were not active when the switcher was activated. Signed-off-by: Nicolas Pitre diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index 0e4fb3e..335ff76 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -554,6 +554,26 @@ static int __init bL_switcher_sysfs_init(void) #endif /* CONFIG_SYSFS */ +/* + * Veto any CPU hotplug operation on those CPUs we've removed + * while the switcher is active. + * We're just not ready to deal with that given the trickery involved. + */ +static int bL_switcher_hotplug_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + if (bL_switcher_active) { + int pairing = bL_switcher_cpu_pairing[(unsigned long)hcpu]; + switch (action & 0xf) { + case CPU_UP_PREPARE: + case CPU_DOWN_PREPARE: + if (pairing == -1) + return NOTIFY_BAD; + } + } + return NOTIFY_DONE; +} + static bool no_bL_switcher; core_param(no_bL_switcher, no_bL_switcher, bool, 0644); @@ -566,6 +586,8 @@ static int __init bL_switcher_init(void) return -EINVAL; } + cpu_notifier(bL_switcher_hotplug_callback, 0); + if (!no_bL_switcher) { ret = bL_switcher_enable(); if (ret) -- cgit v0.10.2 From b22537c682671de97c932d5addb6b7d087352aa1 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 12 Apr 2012 03:04:28 -0400 Subject: ARM: bL_switcher: add a simple /dev user interface for debugging purposes Only the basic call to aid debugging. *** NOT FOR PRODUCTION *** Usage: echo , > /dev/b.L_switcher where is the logical CPU number, and is 0 for the first cluster and 1 for the second cluster. Signed-off-by: nicolas Pitre diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index ade7001..cfcb0d8 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1556,6 +1556,14 @@ config BL_SWITCHER transparently handle transition between a cluster of A15's and a cluster of A7's in a big.LITTLE system. +config BL_SWITCHER_DUMMY_IF + tristate "Simple big.LITTLE switcher user interface" + depends on BL_SWITCHER && DEBUG_KERNEL + help + This is a simple and dummy char dev interface to control + the big.LITTLE switcher core code. It is meant for + debugging purposes only. + choice prompt "Memory split" default VMSPLIT_3G diff --git a/arch/arm/common/Makefile b/arch/arm/common/Makefile index 2586240..5c8584c 100644 --- a/arch/arm/common/Makefile +++ b/arch/arm/common/Makefile @@ -18,3 +18,4 @@ AFLAGS_mcpm_head.o := -march=armv7-a AFLAGS_vlock.o := -march=armv7-a obj-$(CONFIG_TI_PRIV_EDMA) += edma.o obj-$(CONFIG_BL_SWITCHER) += bL_switcher.o +obj-$(CONFIG_BL_SWITCHER_DUMMY_IF) += bL_switcher_dummy_if.o diff --git a/arch/arm/common/bL_switcher_dummy_if.c b/arch/arm/common/bL_switcher_dummy_if.c new file mode 100644 index 0000000..3f47f12 --- /dev/null +++ b/arch/arm/common/bL_switcher_dummy_if.c @@ -0,0 +1,71 @@ +/* + * arch/arm/common/bL_switcher_dummy_if.c -- b.L switcher dummy interface + * + * Created by: Nicolas Pitre, November 2012 + * Copyright: (C) 2012-2013 Linaro Limited + * + * Dummy interface to user space for debugging purpose only. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + +static ssize_t bL_switcher_write(struct file *file, const char __user *buf, + size_t len, loff_t *pos) +{ + unsigned char val[3]; + unsigned int cpu, cluster; + int ret; + + pr_debug("%s\n", __func__); + + if (len < 3) + return -EINVAL; + + if (copy_from_user(val, buf, 3)) + return -EFAULT; + + /* format: , */ + if (val[0] < '0' || val[0] > '9' || + val[1] != ',' || + val[2] < '0' || val[2] > '1') + return -EINVAL; + + cpu = val[0] - '0'; + cluster = val[2] - '0'; + ret = bL_switch_request(cpu, cluster); + + return ret ? : len; +} + +static const struct file_operations bL_switcher_fops = { + .write = bL_switcher_write, + .owner = THIS_MODULE, +}; + +static struct miscdevice bL_switcher_device = { + MISC_DYNAMIC_MINOR, + "b.L_switcher", + &bL_switcher_fops +}; + +static int __init bL_switcher_dummy_if_init(void) +{ + return misc_register(&bL_switcher_device); +} + +static void __exit bL_switcher_dummy_if_exit(void) +{ + misc_deregister(&bL_switcher_device); +} + +module_init(bL_switcher_dummy_if_init); +module_exit(bL_switcher_dummy_if_exit); -- cgit v0.10.2 From edaf6d3d264ae9323a53a49b2a1cf42cc5df1be2 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 4 Sep 2013 08:59:51 +0100 Subject: ARM: 7831/1: mmc: mmci: Adapt to new pinctrl handling There is no need for every driver to fetch a pinctrl handle and to select the default state. Instead this is handled by the device driver core, thus we can remove this piece of code from mmci. Signed-off-by: Ulf Hansson Reviewed-by: Linus Walleij Signed-off-by: Russell King diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c index c3785ed..5dffbbd 100644 --- a/drivers/mmc/host/mmci.c +++ b/drivers/mmc/host/mmci.c @@ -1510,23 +1510,6 @@ static int mmci_probe(struct amba_device *dev, mmc->f_max = min(host->mclk, fmax); dev_dbg(mmc_dev(mmc), "clocking block at %u Hz\n", mmc->f_max); - host->pinctrl = devm_pinctrl_get(&dev->dev); - if (IS_ERR(host->pinctrl)) { - ret = PTR_ERR(host->pinctrl); - goto clk_disable; - } - - host->pins_default = pinctrl_lookup_state(host->pinctrl, - PINCTRL_STATE_DEFAULT); - - /* enable pins to be muxed in and configured */ - if (!IS_ERR(host->pins_default)) { - ret = pinctrl_select_state(host->pinctrl, host->pins_default); - if (ret) - dev_warn(&dev->dev, "could not set default pins\n"); - } else - dev_warn(&dev->dev, "could not get default pinstate\n"); - /* Get regulators and the supported OCR mask */ mmc_regulator_get_supply(mmc); if (!mmc->ocr_avail) diff --git a/drivers/mmc/host/mmci.h b/drivers/mmc/host/mmci.h index 69080fa..168bc72 100644 --- a/drivers/mmc/host/mmci.h +++ b/drivers/mmc/host/mmci.h @@ -200,10 +200,6 @@ struct mmci_host { struct sg_mapping_iter sg_miter; unsigned int size; - /* pinctrl handles */ - struct pinctrl *pinctrl; - struct pinctrl_state *pins_default; - #ifdef CONFIG_DMA_ENGINE /* DMA stuff */ struct dma_chan *dma_current; -- cgit v0.10.2 From e36bd9c6c99f47902d6fd6804730aa587ae275a0 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 4 Sep 2013 09:00:37 +0100 Subject: ARM: 7832/1: mmc: mmci: Use optional sleep pinctrl state By optionally putting the pins into sleep state in the .runtime_suspend callback we can accomplish two things. One is to minimize current leakage from pins and thus save power, second we can prevent the IP from driving pins output in an uncontrolled manner, which may happen if the power domain drops the domain regulator. When returning from idle, entering .runtime_resume callback, the pins are restored to default state. Signed-off-by: Ulf Hansson Acked-by: Rickard Andersson Reviewed-by: Linus Walleij Signed-off-by: Russell King diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c index 5dffbbd..c550b3e 100644 --- a/drivers/mmc/host/mmci.c +++ b/drivers/mmc/host/mmci.c @@ -1750,6 +1750,7 @@ static int mmci_runtime_suspend(struct device *dev) if (mmc) { struct mmci_host *host = mmc_priv(mmc); + pinctrl_pm_select_sleep_state(dev); clk_disable_unprepare(host->clk); } @@ -1764,6 +1765,7 @@ static int mmci_runtime_resume(struct device *dev) if (mmc) { struct mmci_host *host = mmc_priv(mmc); clk_prepare_enable(host->clk); + pinctrl_pm_select_default_state(dev); } return 0; -- cgit v0.10.2 From f829c04204de83aa0d13307d2a2dc07c0d9a94e3 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 4 Sep 2013 09:01:15 +0100 Subject: ARM: 7833/1: mmc: mmci: Adapt to register write restrictions After a write to the MMCICLOCK register data cannot be written to this register for three feedback clock cycles. Writes to the MMCIPOWER register must be separated by three MCLK cycles. Previously no issues has been observered, but using higher ARM clock frequencies on STE- platforms has triggered this problem. The MMCICLOCK register is written to in .set_ios and for some data transmissions for SDIO. We do not need a delay at the data transmission path, because sending and receiving data will require more than three clock cycles. Then we use a simple logic to only delay in .set_ios and thus we don't affect throughput performance. Signed-off-by: Johan Rudholm Signed-off-by: Ulf Hansson Acked-by: Rickard Andersson Reviewed-by: Daniel Lezcano Signed-off-by: Russell King diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c index c550b3e..604e41d 100644 --- a/drivers/mmc/host/mmci.c +++ b/drivers/mmc/host/mmci.c @@ -189,6 +189,21 @@ static int mmci_validate_data(struct mmci_host *host, return 0; } +static void mmci_reg_delay(struct mmci_host *host) +{ + /* + * According to the spec, at least three feedback clock cycles + * of max 52 MHz must pass between two writes to the MMCICLOCK reg. + * Three MCLK clock cycles must pass between two MMCIPOWER reg writes. + * Worst delay time during card init is at 100 kHz => 30 us. + * Worst delay time when up and running is at 25 MHz => 120 ns. + */ + if (host->cclk < 25000000) + udelay(30); + else + ndelay(120); +} + /* * This must be called with host->lock held */ @@ -1264,6 +1279,7 @@ static void mmci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) mmci_set_clkreg(host, ios->clock); mmci_write_pwrreg(host, pwr); + mmci_reg_delay(host); spin_unlock_irqrestore(&host->lock, flags); -- cgit v0.10.2 From 1ff44433c661c30afde6e6d2c47a29039a293da4 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 4 Sep 2013 09:05:17 +0100 Subject: ARM: 7834/1: mmc: mmci: Save and restore register context If a corresponding power domain exists for the device and it manages to cut the domain regulator while the device is runtime suspended, the IP loses it's registers context. We restore the context in the .runtime_resume callback from the existing register caches to adapt to this situation. We also want to make sure the registers are in a known state while restoring context in the case when the power domain did not drop the power, since there are restrictions for the order of writing to these registers. To handle this, we clear the registers in the .runtime_suspend callback. Signed-off-by: Ulf Hansson Acked-by: Rickard Andersson Reviewed-by: Daniel Lezcano Signed-off-by: Russell King diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c index 604e41d..d135c76 100644 --- a/drivers/mmc/host/mmci.c +++ b/drivers/mmc/host/mmci.c @@ -62,6 +62,7 @@ static unsigned int fmax = 515633; * @signal_direction: input/out direction of bus signals can be indicated * @pwrreg_clkgate: MMCIPOWER register must be used to gate the clock * @busy_detect: true if busy detection on dat0 is supported + * @pwrreg_nopower: bits in MMCIPOWER don't controls ext. power supply */ struct variant_data { unsigned int clkreg; @@ -76,6 +77,7 @@ struct variant_data { bool signal_direction; bool pwrreg_clkgate; bool busy_detect; + bool pwrreg_nopower; }; static struct variant_data variant_arm = { @@ -109,6 +111,7 @@ static struct variant_data variant_u300 = { .pwrreg_powerup = MCI_PWR_ON, .signal_direction = true, .pwrreg_clkgate = true, + .pwrreg_nopower = true, }; static struct variant_data variant_nomadik = { @@ -121,6 +124,7 @@ static struct variant_data variant_nomadik = { .pwrreg_powerup = MCI_PWR_ON, .signal_direction = true, .pwrreg_clkgate = true, + .pwrreg_nopower = true, }; static struct variant_data variant_ux500 = { @@ -135,6 +139,7 @@ static struct variant_data variant_ux500 = { .signal_direction = true, .pwrreg_clkgate = true, .busy_detect = true, + .pwrreg_nopower = true, }; static struct variant_data variant_ux500v2 = { @@ -150,6 +155,7 @@ static struct variant_data variant_ux500v2 = { .signal_direction = true, .pwrreg_clkgate = true, .busy_detect = true, + .pwrreg_nopower = true, }; static int mmci_card_busy(struct mmc_host *mmc) @@ -1759,6 +1765,41 @@ static int mmci_resume(struct device *dev) #endif #ifdef CONFIG_PM_RUNTIME +static void mmci_save(struct mmci_host *host) +{ + unsigned long flags; + + if (host->variant->pwrreg_nopower) { + spin_lock_irqsave(&host->lock, flags); + + writel(0, host->base + MMCIMASK0); + writel(0, host->base + MMCIDATACTRL); + writel(0, host->base + MMCIPOWER); + writel(0, host->base + MMCICLOCK); + mmci_reg_delay(host); + + spin_unlock_irqrestore(&host->lock, flags); + } + +} + +static void mmci_restore(struct mmci_host *host) +{ + unsigned long flags; + + if (host->variant->pwrreg_nopower) { + spin_lock_irqsave(&host->lock, flags); + + writel(host->clk_reg, host->base + MMCICLOCK); + writel(host->datactrl_reg, host->base + MMCIDATACTRL); + writel(host->pwr_reg, host->base + MMCIPOWER); + writel(MCI_IRQENABLE, host->base + MMCIMASK0); + mmci_reg_delay(host); + + spin_unlock_irqrestore(&host->lock, flags); + } +} + static int mmci_runtime_suspend(struct device *dev) { struct amba_device *adev = to_amba_device(dev); @@ -1767,6 +1808,7 @@ static int mmci_runtime_suspend(struct device *dev) if (mmc) { struct mmci_host *host = mmc_priv(mmc); pinctrl_pm_select_sleep_state(dev); + mmci_save(host); clk_disable_unprepare(host->clk); } @@ -1781,6 +1823,7 @@ static int mmci_runtime_resume(struct device *dev) if (mmc) { struct mmci_host *host = mmc_priv(mmc); clk_prepare_enable(host->clk); + mmci_restore(host); pinctrl_pm_select_default_state(dev); } -- cgit v0.10.2 From c0f4375146a738bae23e48fa8b5383abf02177cb Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Mon, 10 Dec 2012 17:19:57 +0000 Subject: ARM: bL_switcher: Add synchronous enable/disable interface Some subsystems will need to know for sure whether the switcher is enabled or disabled during certain critical regions. This patch provides a simple mutex-based mechanism to discover whether the switcher is enabled and temporarily lock out further enable/disable: * bL_switcher_get_enabled() returns true iff the switcher is enabled and temporarily inhibits enable/disable. * bL_switcher_put_enabled() permits enable/disable of the switcher again after a previous call to bL_switcher_get_enabled(). Signed-off-by: Dave Martin Signed-off-by: Nicolas Pitre diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index 335ff76..7d98629 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -302,6 +303,7 @@ EXPORT_SYMBOL_GPL(bL_switch_request); * Activation and configuration code. */ +static DEFINE_MUTEX(bL_switcher_activation_lock); static unsigned int bL_switcher_active; static unsigned int bL_switcher_cpu_original_cluster[NR_CPUS]; static cpumask_t bL_switcher_removed_logical_cpus; @@ -413,9 +415,11 @@ static int bL_switcher_enable(void) { int cpu, ret; + mutex_lock(&bL_switcher_activation_lock); cpu_hotplug_driver_lock(); if (bL_switcher_active) { cpu_hotplug_driver_unlock(); + mutex_unlock(&bL_switcher_activation_lock); return 0; } @@ -424,6 +428,7 @@ static int bL_switcher_enable(void) ret = bL_switcher_halve_cpus(); if (ret) { cpu_hotplug_driver_unlock(); + mutex_unlock(&bL_switcher_activation_lock); return ret; } @@ -436,9 +441,10 @@ static int bL_switcher_enable(void) } bL_switcher_active = 1; - cpu_hotplug_driver_unlock(); - pr_info("big.LITTLE switcher initialized\n"); + + cpu_hotplug_driver_unlock(); + mutex_unlock(&bL_switcher_activation_lock); return 0; } @@ -450,9 +456,11 @@ static void bL_switcher_disable(void) struct bL_thread *t; struct task_struct *task; + mutex_lock(&bL_switcher_activation_lock); cpu_hotplug_driver_lock(); if (!bL_switcher_active) { cpu_hotplug_driver_unlock(); + mutex_unlock(&bL_switcher_activation_lock); return; } bL_switcher_active = 0; @@ -497,6 +505,7 @@ static void bL_switcher_disable(void) bL_switcher_restore_cpus(); cpu_hotplug_driver_unlock(); + mutex_unlock(&bL_switcher_activation_lock); } static ssize_t bL_switcher_active_show(struct kobject *kobj, @@ -554,6 +563,20 @@ static int __init bL_switcher_sysfs_init(void) #endif /* CONFIG_SYSFS */ +bool bL_switcher_get_enabled(void) +{ + mutex_lock(&bL_switcher_activation_lock); + + return bL_switcher_active; +} +EXPORT_SYMBOL_GPL(bL_switcher_get_enabled); + +void bL_switcher_put_enabled(void) +{ + mutex_unlock(&bL_switcher_activation_lock); +} +EXPORT_SYMBOL_GPL(bL_switcher_put_enabled); + /* * Veto any CPU hotplug operation on those CPUs we've removed * while the switcher is active. diff --git a/arch/arm/include/asm/bL_switcher.h b/arch/arm/include/asm/bL_switcher.h index e0c0bba..05d7c4c 100644 --- a/arch/arm/include/asm/bL_switcher.h +++ b/arch/arm/include/asm/bL_switcher.h @@ -14,4 +14,7 @@ int bL_switch_request(unsigned int cpu, unsigned int new_cluster_id); +bool bL_switcher_get_enabled(void); +void bL_switcher_put_enabled(void); + #endif -- cgit v0.10.2 From 491990e29f5d285a1b75e74785e3160716b79040 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Mon, 10 Dec 2012 17:19:58 +0000 Subject: ARM: bL_switcher: Add runtime control notifier Some subsystems will need to respond synchronously to runtime enabling and disabling of the switcher. This patch adds a dedicated notifier interface to support such subsystems. Pre- and post- enable/disable notifications are sent to registered callbacks, allowing safe transition of non-b.L- transparent subsystems across these control transitions. Notifier callbacks may veto switcher (de)activation on pre notifications only. Post notifications won't revert the action. If enabling or disabling of the switcher fails after the pre-change notification has been sent, subsystems which have registered notifiers can be left in an inappropriate state. This patch sends a suitable post-change notification on failure, indicating that the old state has been reestablished. For example, a failed initialisation will result in the following sequence: BL_NOTIFY_PRE_ENABLE /* switcher initialisation fails */ BL_NOTIFY_POST_DISABLE It is the responsibility of notified subsystems to respond in an appropriate way. Signed-off-by: Dave Martin Signed-off-by: Nicolas Pitre diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index 7d98629..0164887 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -304,10 +305,34 @@ EXPORT_SYMBOL_GPL(bL_switch_request); */ static DEFINE_MUTEX(bL_switcher_activation_lock); +static BLOCKING_NOTIFIER_HEAD(bL_activation_notifier); static unsigned int bL_switcher_active; static unsigned int bL_switcher_cpu_original_cluster[NR_CPUS]; static cpumask_t bL_switcher_removed_logical_cpus; +int bL_switcher_register_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&bL_activation_notifier, nb); +} +EXPORT_SYMBOL_GPL(bL_switcher_register_notifier); + +int bL_switcher_unregister_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&bL_activation_notifier, nb); +} +EXPORT_SYMBOL_GPL(bL_switcher_unregister_notifier); + +static int bL_activation_notify(unsigned long val) +{ + int ret; + + ret = blocking_notifier_call_chain(&bL_activation_notifier, val, NULL); + if (ret & NOTIFY_STOP_MASK) + pr_err("%s: notifier chain failed with status 0x%x\n", + __func__, ret); + return notifier_to_errno(ret); +} + static void bL_switcher_restore_cpus(void) { int i; @@ -425,12 +450,13 @@ static int bL_switcher_enable(void) pr_info("big.LITTLE switcher initializing\n"); + ret = bL_activation_notify(BL_NOTIFY_PRE_ENABLE); + if (ret) + goto error; + ret = bL_switcher_halve_cpus(); - if (ret) { - cpu_hotplug_driver_unlock(); - mutex_unlock(&bL_switcher_activation_lock); - return ret; - } + if (ret) + goto error; for_each_online_cpu(cpu) { struct bL_thread *t = &bL_threads[cpu]; @@ -441,11 +467,18 @@ static int bL_switcher_enable(void) } bL_switcher_active = 1; + bL_activation_notify(BL_NOTIFY_POST_ENABLE); pr_info("big.LITTLE switcher initialized\n"); + goto out; + +error: + pr_warn("big.LITTLE switcher initialization failed\n"); + bL_activation_notify(BL_NOTIFY_POST_DISABLE); +out: cpu_hotplug_driver_unlock(); mutex_unlock(&bL_switcher_activation_lock); - return 0; + return ret; } #ifdef CONFIG_SYSFS @@ -458,11 +491,15 @@ static void bL_switcher_disable(void) mutex_lock(&bL_switcher_activation_lock); cpu_hotplug_driver_lock(); - if (!bL_switcher_active) { - cpu_hotplug_driver_unlock(); - mutex_unlock(&bL_switcher_activation_lock); - return; + + if (!bL_switcher_active) + goto out; + + if (bL_activation_notify(BL_NOTIFY_PRE_DISABLE) != 0) { + bL_activation_notify(BL_NOTIFY_POST_ENABLE); + goto out; } + bL_switcher_active = 0; /* @@ -504,6 +541,9 @@ static void bL_switcher_disable(void) } bL_switcher_restore_cpus(); + bL_activation_notify(BL_NOTIFY_POST_DISABLE); + +out: cpu_hotplug_driver_unlock(); mutex_unlock(&bL_switcher_activation_lock); } diff --git a/arch/arm/include/asm/bL_switcher.h b/arch/arm/include/asm/bL_switcher.h index 05d7c4c..b243ca9 100644 --- a/arch/arm/include/asm/bL_switcher.h +++ b/arch/arm/include/asm/bL_switcher.h @@ -12,9 +12,53 @@ #ifndef ASM_BL_SWITCHER_H #define ASM_BL_SWITCHER_H +#include +#include + int bL_switch_request(unsigned int cpu, unsigned int new_cluster_id); +/* + * Register here to be notified about runtime enabling/disabling of + * the switcher. + * + * The notifier chain is called with the switcher activation lock held: + * the switcher will not be enabled or disabled during callbacks. + * Callbacks must not call bL_switcher_{get,put}_enabled(). + */ +#define BL_NOTIFY_PRE_ENABLE 0 +#define BL_NOTIFY_POST_ENABLE 1 +#define BL_NOTIFY_PRE_DISABLE 2 +#define BL_NOTIFY_POST_DISABLE 3 + +#ifdef CONFIG_BL_SWITCHER + +int bL_switcher_register_notifier(struct notifier_block *nb); +int bL_switcher_unregister_notifier(struct notifier_block *nb); + +/* + * Use these functions to temporarily prevent enabling/disabling of + * the switcher. + * bL_switcher_get_enabled() returns true if the switcher is currently + * enabled. Each call to bL_switcher_get_enabled() must be followed + * by a call to bL_switcher_put_enabled(). These functions are not + * recursive. + */ bool bL_switcher_get_enabled(void); void bL_switcher_put_enabled(void); +#else +static inline int bL_switcher_register_notifier(struct notifier_block *nb) +{ + return 0; +} + +static inline int bL_switcher_unregister_notifier(struct notifier_block *nb) +{ + return 0; +} + +static inline bool bL_switcher_get_enabled(void) { return false; } +static inline void bL_switcher_put_enabled(void) { } +#endif /* CONFIG_BL_SWITCHER */ + #endif -- cgit v0.10.2 From 0577fee283fb385afbcdb78d1f4c398d7326b68f Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Wed, 22 May 2013 19:08:16 +0100 Subject: ARM: bL_switcher: Add switch completion callback for bL_switch_request() There is no explicit way to know when a switch started via bL_switch_request() is complete. This can lead to unpredictable behaviour when the switcher is controlled by a subsystem which makes dynamic decisions (such as cpufreq). The CPU PM notifier is not really suitable for signalling completion, because the CPU could get suspended and resumed for other, independent reasons while a switch request is in flight. Adding a whole new notifier for this seems excessive, and may tempt people to put heavyweight code on this path. This patch implements a new bL_switch_request_cb() function that allows for a per-request lightweight callback, private between the switcher and the caller of bL_switch_request_cb(). Overlapping switches on a single CPU are considered incorrect if they are requested via bL_switch_request_cb() with a callback (they will lead to an unpredictable final state without explicit external synchronisation to force the requests into a particular order). Queuing requests robustly would be overkill because only one subsystem should be attempting to control the switcher at any time. Overlapping requests of this kind will be failed with -EBUSY to indicate that the second request won't take effect and the completer will never be called for it. bL_switch_request() is retained as a wrapper round the new function, with the old, fire-and-forget semantics. In this case the last request will always win. The request may still be denied if a previous request with a completer is still pending. Signed-off-by: Dave Martin Signed-off-by: Nicolas Pitre diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index 0164887..34316be 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -9,6 +9,7 @@ * published by the Free Software Foundation. */ +#include #include #include #include @@ -25,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -224,10 +226,13 @@ static int bL_switch_to(unsigned int new_cluster_id) } struct bL_thread { + spinlock_t lock; struct task_struct *task; wait_queue_head_t wq; int wanted_cluster; struct completion started; + bL_switch_completion_handler completer; + void *completer_cookie; }; static struct bL_thread bL_threads[NR_CPUS]; @@ -237,6 +242,8 @@ static int bL_switcher_thread(void *arg) struct bL_thread *t = arg; struct sched_param param = { .sched_priority = 1 }; int cluster; + bL_switch_completion_handler completer; + void *completer_cookie; sched_setscheduler_nocheck(current, SCHED_FIFO, ¶m); complete(&t->started); @@ -247,9 +254,21 @@ static int bL_switcher_thread(void *arg) wait_event_interruptible(t->wq, t->wanted_cluster != -1 || kthread_should_stop()); - cluster = xchg(&t->wanted_cluster, -1); - if (cluster != -1) + + spin_lock(&t->lock); + cluster = t->wanted_cluster; + completer = t->completer; + completer_cookie = t->completer_cookie; + t->wanted_cluster = -1; + t->completer = NULL; + spin_unlock(&t->lock); + + if (cluster != -1) { bL_switch_to(cluster); + + if (completer) + completer(completer_cookie); + } } while (!kthread_should_stop()); return 0; @@ -270,16 +289,30 @@ static struct task_struct *bL_switcher_thread_create(int cpu, void *arg) } /* - * bL_switch_request - Switch to a specific cluster for the given CPU + * bL_switch_request_cb - Switch to a specific cluster for the given CPU, + * with completion notification via a callback * * @cpu: the CPU to switch * @new_cluster_id: the ID of the cluster to switch to. + * @completer: switch completion callback. if non-NULL, + * @completer(@completer_cookie) will be called on completion of + * the switch, in non-atomic context. + * @completer_cookie: opaque context argument for @completer. * * This function causes a cluster switch on the given CPU by waking up * the appropriate switcher thread. This function may or may not return * before the switch has occurred. + * + * If a @completer callback function is supplied, it will be called when + * the switch is complete. This can be used to determine asynchronously + * when the switch is complete, regardless of when bL_switch_request() + * returns. When @completer is supplied, no new switch request is permitted + * for the affected CPU until after the switch is complete, and @completer + * has returned. */ -int bL_switch_request(unsigned int cpu, unsigned int new_cluster_id) +int bL_switch_request_cb(unsigned int cpu, unsigned int new_cluster_id, + bL_switch_completion_handler completer, + void *completer_cookie) { struct bL_thread *t; @@ -289,16 +322,25 @@ int bL_switch_request(unsigned int cpu, unsigned int new_cluster_id) } t = &bL_threads[cpu]; + if (IS_ERR(t->task)) return PTR_ERR(t->task); if (!t->task) return -ESRCH; + spin_lock(&t->lock); + if (t->completer) { + spin_unlock(&t->lock); + return -EBUSY; + } + t->completer = completer; + t->completer_cookie = completer_cookie; t->wanted_cluster = new_cluster_id; + spin_unlock(&t->lock); wake_up(&t->wq); return 0; } -EXPORT_SYMBOL_GPL(bL_switch_request); +EXPORT_SYMBOL_GPL(bL_switch_request_cb); /* * Activation and configuration code. @@ -460,6 +502,7 @@ static int bL_switcher_enable(void) for_each_online_cpu(cpu) { struct bL_thread *t = &bL_threads[cpu]; + spin_lock_init(&t->lock); init_waitqueue_head(&t->wq); init_completion(&t->started); t->wanted_cluster = -1; diff --git a/arch/arm/include/asm/bL_switcher.h b/arch/arm/include/asm/bL_switcher.h index b243ca9..7d1cce8 100644 --- a/arch/arm/include/asm/bL_switcher.h +++ b/arch/arm/include/asm/bL_switcher.h @@ -15,7 +15,15 @@ #include #include -int bL_switch_request(unsigned int cpu, unsigned int new_cluster_id); +typedef void (*bL_switch_completion_handler)(void *cookie); + +int bL_switch_request_cb(unsigned int cpu, unsigned int new_cluster_id, + bL_switch_completion_handler completer, + void *completer_cookie); +static inline int bL_switch_request(unsigned int cpu, unsigned int new_cluster_id) +{ + return bL_switch_request_cb(cpu, new_cluster_id, NULL, NULL); +} /* * Register here to be notified about runtime enabling/disabling of -- cgit v0.10.2 From 108a9640abfada2599b6cb08c7cc00a4eebf8f8f Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 23 Oct 2012 01:39:08 -0400 Subject: ARM: bL_switcher: synchronize the outbound with the inbound Let's wait for the inbound CPU to come up and snoop some of the outbound CPU cache before bringing the outbound CPU down. That should be more efficient than going down right away. Possible improvements might involve some monitoring of the CCI event counters. Signed-off-by: Nicolas Pitre diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index 34316be..aab7c12 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -55,9 +55,10 @@ static int read_mpidr(void) * bL switcher core code. */ -static void bL_do_switch(void *_unused) +static void bL_do_switch(void *_arg) { unsigned ib_mpidr, ib_cpu, ib_cluster; + long volatile handshake, **handshake_ptr = _arg; pr_debug("%s\n", __func__); @@ -65,6 +66,13 @@ static void bL_do_switch(void *_unused) ib_cpu = MPIDR_AFFINITY_LEVEL(ib_mpidr, 0); ib_cluster = MPIDR_AFFINITY_LEVEL(ib_mpidr, 1); + /* Advertise our handshake location */ + if (handshake_ptr) { + handshake = 0; + *handshake_ptr = &handshake; + } else + handshake = -1; + /* * Our state has been saved at this point. Let's release our * inbound CPU. @@ -83,6 +91,14 @@ static void bL_do_switch(void *_unused) * we have none. */ + /* + * Let's wait until our inbound is alive. + */ + while (!handshake) { + wfe(); + smp_mb(); + } + /* Let's put ourself down. */ mcpm_cpu_power_down(); @@ -130,6 +146,7 @@ static int bL_switch_to(unsigned int new_cluster_id) unsigned int ob_mpidr, ob_cpu, ob_cluster, ib_mpidr, ib_cpu, ib_cluster; struct tick_device *tdev; enum clock_event_mode tdev_mode; + long volatile *handshake_ptr; int ret; this_cpu = smp_processor_id(); @@ -198,7 +215,7 @@ static int bL_switch_to(unsigned int new_cluster_id) cpu_logical_map(that_cpu) = ob_mpidr; /* Let's do the actual CPU switch. */ - ret = cpu_suspend(0, bL_switchpoint); + ret = cpu_suspend((unsigned long)&handshake_ptr, bL_switchpoint); if (ret > 0) panic("%s: cpu_suspend() returned %d\n", __func__, ret); @@ -220,6 +237,9 @@ static int bL_switch_to(unsigned int new_cluster_id) local_fiq_enable(); local_irq_enable(); + *handshake_ptr = 1; + dsb_sev(); + if (ret) pr_err("%s exiting with error %d\n", __func__, ret); return ret; -- cgit v0.10.2 From 5135d875e1457ef946a055003d8f80713e862135 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 27 Nov 2012 21:54:41 -0500 Subject: ARM: SMP: basic IPI triggered completion support We need a mechanism to let an inbound CPU signal that it is alive before even getting into the kernel environment i.e. from early assembly code. Using an IPI is the simplest way to achieve that. This adds some basic infrastructure to register a struct completion pointer to be "completed" when the dedicated IPI for this task is received. Signed-off-by: Nicolas Pitre diff --git a/arch/arm/include/asm/hardirq.h b/arch/arm/include/asm/hardirq.h index 2740c2a..3d7351c 100644 --- a/arch/arm/include/asm/hardirq.h +++ b/arch/arm/include/asm/hardirq.h @@ -5,7 +5,7 @@ #include #include -#define NR_IPI 6 +#define NR_IPI 7 typedef struct { unsigned int __softirq_pending; diff --git a/arch/arm/include/asm/smp.h b/arch/arm/include/asm/smp.h index a8cae71c..22a3b9b 100644 --- a/arch/arm/include/asm/smp.h +++ b/arch/arm/include/asm/smp.h @@ -84,6 +84,8 @@ extern void arch_send_call_function_single_ipi(int cpu); extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); extern void arch_send_wakeup_ipi_mask(const struct cpumask *mask); +extern int register_ipi_completion(struct completion *completion, int cpu); + struct smp_operations { #ifdef CONFIG_SMP /* diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 72024ea..7d80a54 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -66,6 +66,7 @@ enum ipi_msg_type { IPI_CALL_FUNC, IPI_CALL_FUNC_SINGLE, IPI_CPU_STOP, + IPI_COMPLETION, }; static DECLARE_COMPLETION(cpu_running); @@ -456,6 +457,7 @@ static const char *ipi_types[NR_IPI] = { S(IPI_CALL_FUNC, "Function call interrupts"), S(IPI_CALL_FUNC_SINGLE, "Single function call interrupts"), S(IPI_CPU_STOP, "CPU stop interrupts"), + S(IPI_COMPLETION, "completion interrupts"), }; void show_ipi_list(struct seq_file *p, int prec) @@ -515,6 +517,19 @@ static void ipi_cpu_stop(unsigned int cpu) cpu_relax(); } +static DEFINE_PER_CPU(struct completion *, cpu_completion); + +int register_ipi_completion(struct completion *completion, int cpu) +{ + per_cpu(cpu_completion, cpu) = completion; + return IPI_COMPLETION; +} + +static void ipi_complete(unsigned int cpu) +{ + complete(per_cpu(cpu_completion, cpu)); +} + /* * Main handler for inter-processor interrupts */ @@ -565,6 +580,12 @@ void handle_IPI(int ipinr, struct pt_regs *regs) irq_exit(); break; + case IPI_COMPLETION: + irq_enter(); + ipi_complete(cpu); + irq_exit(); + break; + default: printk(KERN_CRIT "CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr); -- cgit v0.10.2 From de885d147ad2c4a66777e3557440247efde1cc8d Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 27 Nov 2012 23:11:20 -0500 Subject: ARM: mcpm: add a simple poke mechanism to the early entry code This allows to poke a predetermined value into a specific address upon entering the early boot code in bL_head.S. Signed-off-by: Nicolas Pitre diff --git a/arch/arm/common/mcpm_entry.c b/arch/arm/common/mcpm_entry.c index 370236d..4a2b32f 100644 --- a/arch/arm/common/mcpm_entry.c +++ b/arch/arm/common/mcpm_entry.c @@ -27,6 +27,18 @@ void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr) sync_cache_w(&mcpm_entry_vectors[cluster][cpu]); } +extern unsigned long mcpm_entry_early_pokes[MAX_NR_CLUSTERS][MAX_CPUS_PER_CLUSTER][2]; + +void mcpm_set_early_poke(unsigned cpu, unsigned cluster, + unsigned long poke_phys_addr, unsigned long poke_val) +{ + unsigned long *poke = &mcpm_entry_early_pokes[cluster][cpu][0]; + poke[0] = poke_phys_addr; + poke[1] = poke_val; + __cpuc_flush_dcache_area((void *)poke, 8); + outer_clean_range(__pa(poke), __pa(poke + 2)); +} + static const struct mcpm_platform_ops *platform_ops; int __init mcpm_platform_register(const struct mcpm_platform_ops *ops) diff --git a/arch/arm/common/mcpm_head.S b/arch/arm/common/mcpm_head.S index 39c96df..49dd535 100644 --- a/arch/arm/common/mcpm_head.S +++ b/arch/arm/common/mcpm_head.S @@ -71,12 +71,19 @@ ENTRY(mcpm_entry_point) * position independent way. */ adr r5, 3f - ldmia r5, {r6, r7, r8, r11} + ldmia r5, {r0, r6, r7, r8, r11} + add r0, r5, r0 @ r0 = mcpm_entry_early_pokes add r6, r5, r6 @ r6 = mcpm_entry_vectors ldr r7, [r5, r7] @ r7 = mcpm_power_up_setup_phys add r8, r5, r8 @ r8 = mcpm_sync add r11, r5, r11 @ r11 = first_man_locks + @ Perform an early poke, if any + add r0, r0, r4, lsl #3 + ldmia r0, {r0, r1} + teq r0, #0 + strne r1, [r0] + mov r0, #MCPM_SYNC_CLUSTER_SIZE mla r8, r0, r10, r8 @ r8 = sync cluster base @@ -195,7 +202,8 @@ mcpm_entry_gated: .align 2 -3: .word mcpm_entry_vectors - . +3: .word mcpm_entry_early_pokes - . + .word mcpm_entry_vectors - 3b .word mcpm_power_up_setup_phys - 3b .word mcpm_sync - 3b .word first_man_locks - 3b @@ -214,6 +222,10 @@ first_man_locks: ENTRY(mcpm_entry_vectors) .space 4 * MAX_NR_CLUSTERS * MAX_CPUS_PER_CLUSTER + .type mcpm_entry_early_pokes, #object +ENTRY(mcpm_entry_early_pokes) + .space 8 * MAX_NR_CLUSTERS * MAX_CPUS_PER_CLUSTER + .type mcpm_power_up_setup_phys, #object ENTRY(mcpm_power_up_setup_phys) .space 4 @ set by mcpm_sync_init() diff --git a/arch/arm/include/asm/mcpm.h b/arch/arm/include/asm/mcpm.h index 0f7b762..7626a7f 100644 --- a/arch/arm/include/asm/mcpm.h +++ b/arch/arm/include/asm/mcpm.h @@ -42,6 +42,14 @@ extern void mcpm_entry_point(void); void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr); /* + * This sets an early poke i.e a value to be poked into some address + * from very early assembly code before the CPU is ungated. The + * address must be physical, and if 0 then nothing will happen. + */ +void mcpm_set_early_poke(unsigned cpu, unsigned cluster, + unsigned long poke_phys_addr, unsigned long poke_val); + +/* * CPU/cluster power operations API for higher subsystems to use. */ -- cgit v0.10.2 From eeb446581ba23a5a36b4f5c7bfa2b1f8f7c9fb66 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 28 Nov 2012 18:17:25 -0500 Subject: ARM: GIC: function to retrieve the physical address of the SGIR In order to have early assembly code signal other CPUs in the system, we need to get the physical address for the SGIR register used to send IPIs. Because the register will be used with a precomputed CPU interface ID number, there is no need for any locking in the assembly code where this register is written to. Signed-off-by: Nicolas Pitre diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 6365b59..09fdf3d5 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -769,6 +769,33 @@ void gic_migrate_target(unsigned int new_cpu_id) } } } + +/* + * gic_get_sgir_physaddr - get the physical address for the SGI register + * + * REturn the physical address of the SGI register to be used + * by some early assembly code when the kernel is not yet available. + */ +static unsigned long gic_dist_physaddr; + +unsigned long gic_get_sgir_physaddr(void) +{ + if (!gic_dist_physaddr) + return 0; + return gic_dist_physaddr + GIC_DIST_SOFTINT; +} + +void __init gic_init_physaddr(struct device_node *node) +{ + struct resource res; + if (of_address_to_resource(node, 0, &res) == 0) { + gic_dist_physaddr = res.start; + pr_info("GIC physical location is %#lx\n", gic_dist_physaddr); + } +} + +#else +#define gic_init_physaddr(node) do { } while (0) #endif static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq, @@ -952,6 +979,8 @@ int __init gic_of_init(struct device_node *node, struct device_node *parent) percpu_offset = 0; gic_init_bases(gic_cnt, -1, dist_base, cpu_base, percpu_offset, node); + if (!gic_cnt) + gic_init_physaddr(node); if (parent) { irq = irq_of_parse_and_map(node, 0); diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index 46544e3..dc308350 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -78,6 +78,7 @@ static inline void gic_init(unsigned int nr, int start, int gic_get_cpu_id(unsigned int cpu); void gic_migrate_target(unsigned int new_cpu_id); +unsigned long gic_get_sgir_physaddr(void); #endif /* __ASSEMBLY */ -- cgit v0.10.2 From 14d2ca615a85e2dbc744c12c296affd35f119fa7 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 28 Nov 2012 18:48:19 -0500 Subject: ARM: GIC: interface to send a SGI directly The regular gic_raise_softirq() takes as input a CPU mask which is not adequate when we need to send an IPI to a CPU which is not represented in the kernel to GIC mapping. That is the case with the b.L switcher when GIC migration to the inbound CPU has not yet occurred. Signed-off-by: Nicolas Pitre diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 09fdf3d5..9031171 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -674,6 +674,20 @@ void gic_raise_softirq(const struct cpumask *mask, unsigned int irq) #ifdef CONFIG_BL_SWITCHER /* + * gic_send_sgi - send a SGI directly to given CPU interface number + * + * cpu_id: the ID for the destination CPU interface + * irq: the IPI number to send a SGI for + */ +void gic_send_sgi(unsigned int cpu_id, unsigned int irq) +{ + BUG_ON(cpu_id >= NR_GIC_CPU_IF); + cpu_id = 1 << cpu_id; + /* this always happens on GIC0 */ + writel_relaxed((cpu_id << 16) | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT); +} + +/* * gic_get_cpu_id - get the CPU interface ID for the specified CPU * * @cpu: the logical CPU number to get the GIC ID for. diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index dc308350..cac496b 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -76,6 +76,7 @@ static inline void gic_init(unsigned int nr, int start, gic_init_bases(nr, start, dist, cpu, 0, NULL); } +void gic_send_sgi(unsigned int cpu_id, unsigned int irq); int gic_get_cpu_id(unsigned int cpu); void gic_migrate_target(unsigned int new_cpu_id); unsigned long gic_get_sgir_physaddr(void); -- cgit v0.10.2 From 6137eba6c2b9bc2b7fd52e77741f50e43db4b5a6 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 13 Jun 2013 23:51:18 -0400 Subject: ARM: bL_switcher: wait until inbound is alive before performing a switch In some cases, a significant delay may be observed between the moment a request for a CPU to come up is made and the moment it is ready to start executing kernel code. This is especially true when a whole cluster has to be powered up which may take in the order of miliseconds. It is therefore a good idea to let the outbound CPU continue to execute code in the mean time, and be notified when the inbound is ready before performing the actual switch. This is achieved by registering a completion block with the appropriate IPI callback, and programming the sending of an IPI by the early assembly code prior to entering the main kernel code. Once the IPI is delivered to the outbound CPU, the completion block is "completed" and the switcher thread is resumed. Signed-off-by: Nicolas Pitre diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index aab7c12..dc53eb8 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -144,10 +144,11 @@ static int bL_switch_to(unsigned int new_cluster_id) { unsigned int mpidr, this_cpu, that_cpu; unsigned int ob_mpidr, ob_cpu, ob_cluster, ib_mpidr, ib_cpu, ib_cluster; + struct completion inbound_alive; struct tick_device *tdev; enum clock_event_mode tdev_mode; long volatile *handshake_ptr; - int ret; + int ipi_nr, ret; this_cpu = smp_processor_id(); ob_mpidr = read_mpidr(); @@ -166,10 +167,18 @@ static int bL_switch_to(unsigned int new_cluster_id) pr_debug("before switch: CPU %d MPIDR %#x -> %#x\n", this_cpu, ob_mpidr, ib_mpidr); + this_cpu = smp_processor_id(); + /* Close the gate for our entry vectors */ mcpm_set_entry_vector(ob_cpu, ob_cluster, NULL); mcpm_set_entry_vector(ib_cpu, ib_cluster, NULL); + /* Install our "inbound alive" notifier. */ + init_completion(&inbound_alive); + ipi_nr = register_ipi_completion(&inbound_alive, this_cpu); + ipi_nr |= ((1 << 16) << bL_gic_id[ob_cpu][ob_cluster]); + mcpm_set_early_poke(ib_cpu, ib_cluster, gic_get_sgir_physaddr(), ipi_nr); + /* * Let's wake up the inbound CPU now in case it requires some delay * to come online, but leave it gated in our entry vector code. @@ -181,6 +190,19 @@ static int bL_switch_to(unsigned int new_cluster_id) } /* + * Raise a SGI on the inbound CPU to make sure it doesn't stall + * in a possible WFI, such as in bL_power_down(). + */ + gic_send_sgi(bL_gic_id[ib_cpu][ib_cluster], 0); + + /* + * Wait for the inbound to come up. This allows for other + * tasks to be scheduled in the mean time. + */ + wait_for_completion(&inbound_alive); + mcpm_set_early_poke(ib_cpu, ib_cluster, 0, 0); + + /* * From this point we are entering the switch critical zone * and can't take any interrupts anymore. */ @@ -190,12 +212,6 @@ static int bL_switch_to(unsigned int new_cluster_id) /* redirect GIC's SGIs to our counterpart */ gic_migrate_target(bL_gic_id[ib_cpu][ib_cluster]); - /* - * Raise a SGI on the inbound CPU to make sure it doesn't stall - * in a possible WFI, such as in mcpm_power_down(). - */ - arch_send_wakeup_ipi_mask(cpumask_of(this_cpu)); - tdev = tick_get_device(this_cpu); if (tdev && !cpumask_equal(tdev->evtdev->cpumask, cpumask_of(this_cpu))) tdev = NULL; -- cgit v0.10.2 From 1bfbddb6f3a0dbb8c3996d1c4d4911d695737c15 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Mon, 14 May 2012 17:40:07 +0100 Subject: ARM: bL_switcher: Basic trace events support This patch adds simple trace events to the b.L switcher code to allow tracing of CPU migration events. To make use of the trace events, you will need: CONFIG_FTRACE=y CONFIG_ENABLE_DEFAULT_TRACERS=y The following events are added: * power:cpu_migrate_begin * power:cpu_migrate_finish each with the following data: u64 timestamp; u32 cpu_hwid; power:cpu_migrate_begin occurs immediately before the switcher-specific migration operations start. power:cpu_migrate_finish occurs immediately when migration is completed. The cpu_hwid field contains the ID fields of the MPIDR. * For power:cpu_migrate_begin, cpu_hwid is the ID of the outbound physical CPU (equivalent to (from_phys_cpu,from_phys_cluster)). * For power:cpu_migrate_finish, cpu_hwid is the ID of the inbound physical CPU (equivalent to (to_phys_cpu,to_phys_cluster)). By design, the cpu_hwid field is masked in the same way as the device tree cpu node reg property, allowing direct correlation to the DT description of the hardware. The timestamp is added in order to minimise timing noise. An accurate system-wide clock should be used for generating this (hopefully getnstimeofday is appropriate, but it could be changed). It could be any monotonic shared clock, since the aim is to allow accurate deltas to be computed. We don't necessarily care about accurate synchronisation with wall clock time. In practice, each switch takes place on a single logical CPU, and the trace infrastructure should guarantee that events are well-ordered with respect to a single logical CPU. Signed-off-by: Dave Martin Signed-off-by: Nicolas Pitre diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index dc53eb8..7002de3 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -33,10 +34,14 @@ #include #include +#include #include #include #include +#define CREATE_TRACE_POINTS +#include + /* * Use our own MPIDR accessors as the generic ones in asm/cputype.h have @@ -52,6 +57,16 @@ static int read_mpidr(void) } /* + * Get a global nanosecond time stamp for tracing. + */ +static s64 get_ns(void) +{ + struct timespec ts; + getnstimeofday(&ts); + return timespec_to_ns(&ts); +} + +/* * bL switcher core code. */ @@ -208,6 +223,7 @@ static int bL_switch_to(unsigned int new_cluster_id) */ local_irq_disable(); local_fiq_disable(); + trace_cpu_migrate_begin(get_ns(), ob_mpidr); /* redirect GIC's SGIs to our counterpart */ gic_migrate_target(bL_gic_id[ib_cpu][ib_cluster]); @@ -250,6 +266,7 @@ static int bL_switch_to(unsigned int new_cluster_id) tdev->evtdev->next_event, 1); } + trace_cpu_migrate_finish(get_ns(), ib_mpidr); local_fiq_enable(); local_irq_enable(); diff --git a/include/trace/events/power_cpu_migrate.h b/include/trace/events/power_cpu_migrate.h new file mode 100644 index 0000000..3694af0 --- /dev/null +++ b/include/trace/events/power_cpu_migrate.h @@ -0,0 +1,66 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM power + +#if !defined(_TRACE_POWER_CPU_MIGRATE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_POWER_CPU_MIGRATE_H + +#include + +#define __cpu_migrate_proto \ + TP_PROTO(u64 timestamp, \ + u32 cpu_hwid) +#define __cpu_migrate_args \ + TP_ARGS(timestamp, \ + cpu_hwid) + +DECLARE_EVENT_CLASS(cpu_migrate, + + __cpu_migrate_proto, + __cpu_migrate_args, + + TP_STRUCT__entry( + __field(u64, timestamp ) + __field(u32, cpu_hwid ) + ), + + TP_fast_assign( + __entry->timestamp = timestamp; + __entry->cpu_hwid = cpu_hwid; + ), + + TP_printk("timestamp=%llu cpu_hwid=0x%08lX", + (unsigned long long)__entry->timestamp, + (unsigned long)__entry->cpu_hwid + ) +); + +#define __define_cpu_migrate_event(name) \ + DEFINE_EVENT(cpu_migrate, cpu_migrate_##name, \ + __cpu_migrate_proto, \ + __cpu_migrate_args \ + ) + +__define_cpu_migrate_event(begin); +__define_cpu_migrate_event(finish); + +#undef __define_cpu_migrate +#undef __cpu_migrate_proto +#undef __cpu_migrate_args + +/* This file can get included multiple times, TRACE_HEADER_MULTI_READ at top */ +#ifndef _PWR_CPU_MIGRATE_EVENT_AVOID_DOUBLE_DEFINING +#define _PWR_CPU_MIGRATE_EVENT_AVOID_DOUBLE_DEFINING + +/* + * Set from_phys_cpu and to_phys_cpu to CPU_MIGRATE_ALL_CPUS to indicate + * a whole-cluster migration: + */ +#define CPU_MIGRATE_ALL_CPUS 0x80000000U +#endif + +#endif /* _TRACE_POWER_CPU_MIGRATE_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE power_cpu_migrate +#include -- cgit v0.10.2 From b09bbe5b1267b6af22a9584d614f5eec5d74f405 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Wed, 6 Feb 2013 15:45:23 +0000 Subject: ARM: bL_switcher/trace: Add trace trigger for trace bootstrapping When tracing switching, an external tracer needs a way to bootstrap its knowledge of the logical<->physical CPU mapping. This patch adds a sysfs attribute trace_trigger. A write to this attribute will generate a power:cpu_migrate_current event for each online CPU, indicating the current physical CPU for each logical CPU. Activating or deactivating the switcher also generates these events, so that the tracer knows about the resulting remapping of affected CPUs. Signed-off-by: Dave Martin diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index 7002de3..f0dc025 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -531,6 +532,25 @@ static int bL_switcher_halve_cpus(void) return 0; } +static void bL_switcher_trace_trigger_cpu(void *__always_unused info) +{ + trace_cpu_migrate_current(get_ns(), read_mpidr()); +} + +static int bL_switcher_trace_trigger(void) +{ + int ret; + + preempt_disable(); + + bL_switcher_trace_trigger_cpu(NULL); + ret = smp_call_function(bL_switcher_trace_trigger_cpu, NULL, true); + + preempt_enable(); + + return ret; +} + static int bL_switcher_enable(void) { int cpu, ret; @@ -553,6 +573,8 @@ static int bL_switcher_enable(void) if (ret) goto error; + bL_switcher_trace_trigger(); + for_each_online_cpu(cpu) { struct bL_thread *t = &bL_threads[cpu]; spin_lock_init(&t->lock); @@ -637,6 +659,8 @@ static void bL_switcher_disable(void) } bL_switcher_restore_cpus(); + bL_switcher_trace_trigger(); + bL_activation_notify(BL_NOTIFY_POST_DISABLE); out: @@ -670,11 +694,23 @@ static ssize_t bL_switcher_active_store(struct kobject *kobj, return (ret >= 0) ? count : ret; } +static ssize_t bL_switcher_trace_trigger_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int ret = bL_switcher_trace_trigger(); + + return ret ? ret : count; +} + static struct kobj_attribute bL_switcher_active_attr = __ATTR(active, 0644, bL_switcher_active_show, bL_switcher_active_store); +static struct kobj_attribute bL_switcher_trace_trigger_attr = + __ATTR(trace_trigger, 0200, NULL, bL_switcher_trace_trigger_store); + static struct attribute *bL_switcher_attrs[] = { &bL_switcher_active_attr.attr, + &bL_switcher_trace_trigger_attr.attr, NULL, }; diff --git a/include/trace/events/power_cpu_migrate.h b/include/trace/events/power_cpu_migrate.h index 3694af0..f76dd4d 100644 --- a/include/trace/events/power_cpu_migrate.h +++ b/include/trace/events/power_cpu_migrate.h @@ -42,6 +42,7 @@ DECLARE_EVENT_CLASS(cpu_migrate, __define_cpu_migrate_event(begin); __define_cpu_migrate_event(finish); +__define_cpu_migrate_event(current); #undef __define_cpu_migrate #undef __cpu_migrate_proto -- cgit v0.10.2 From 29064b88466ee725613db16d8c05b0ec5443a309 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Mon, 11 Feb 2013 14:39:19 +0000 Subject: ARM: bL_switcher/trace: Add kernel trace trigger interface This patch exports a bL_switcher_trace_trigger() function to provide a means for drivers using the trace events to get the current status when starting a trace session. Calling this function is equivalent to pinging the trace_trigger file in sysfs. Signed-off-by: Dave Martin diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index f0dc025..f4878a3 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -537,7 +537,7 @@ static void bL_switcher_trace_trigger_cpu(void *__always_unused info) trace_cpu_migrate_current(get_ns(), read_mpidr()); } -static int bL_switcher_trace_trigger(void) +int bL_switcher_trace_trigger(void) { int ret; @@ -550,6 +550,7 @@ static int bL_switcher_trace_trigger(void) return ret; } +EXPORT_SYMBOL_GPL(bL_switcher_trace_trigger); static int bL_switcher_enable(void) { diff --git a/arch/arm/include/asm/bL_switcher.h b/arch/arm/include/asm/bL_switcher.h index 7d1cce8..8ada5a88 100644 --- a/arch/arm/include/asm/bL_switcher.h +++ b/arch/arm/include/asm/bL_switcher.h @@ -54,6 +54,8 @@ int bL_switcher_unregister_notifier(struct notifier_block *nb); bool bL_switcher_get_enabled(void); void bL_switcher_put_enabled(void); +int bL_switcher_trace_trigger(void); + #else static inline int bL_switcher_register_notifier(struct notifier_block *nb) { @@ -67,6 +69,7 @@ static inline int bL_switcher_unregister_notifier(struct notifier_block *nb) static inline bool bL_switcher_get_enabled(void) { return false; } static inline void bL_switcher_put_enabled(void) { } +static inline int bL_switcher_trace_trigger(void) { return 0; } #endif /* CONFIG_BL_SWITCHER */ #endif -- cgit v0.10.2 From d08e2e09042bd3f7ef66a35cb4bb92794ab26bb2 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Wed, 13 Feb 2013 16:20:44 +0000 Subject: ARM: bL_switcher: Add query interface to discover CPU affinities When the switcher is active, there is no straightforward way to figure out which logical CPU a given physical CPU maps to. This patch provides a function bL_switcher_get_logical_index(mpidr), which is analogous to get_logical_index(). This function returns the logical CPU on which the specified physical CPU is grouped (or -EINVAL if unknown). If the switcher is inactive or not present, -EUNATCH is returned instead. Signed-off-by: Dave Martin Signed-off-by: Nicolas Pitre diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index f4878a3..63bbc4f 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -532,6 +532,26 @@ static int bL_switcher_halve_cpus(void) return 0; } +/* Determine the logical CPU a given physical CPU is grouped on. */ +int bL_switcher_get_logical_index(u32 mpidr) +{ + int cpu; + + if (!bL_switcher_active) + return -EUNATCH; + + mpidr &= MPIDR_HWID_BITMASK; + for_each_online_cpu(cpu) { + int pairing = bL_switcher_cpu_pairing[cpu]; + if (pairing == -1) + continue; + if ((mpidr == cpu_logical_map(cpu)) || + (mpidr == cpu_logical_map(pairing))) + return cpu; + } + return -EINVAL; +} + static void bL_switcher_trace_trigger_cpu(void *__always_unused info) { trace_cpu_migrate_current(get_ns(), read_mpidr()); diff --git a/arch/arm/include/asm/bL_switcher.h b/arch/arm/include/asm/bL_switcher.h index 8ada5a88..1714800 100644 --- a/arch/arm/include/asm/bL_switcher.h +++ b/arch/arm/include/asm/bL_switcher.h @@ -55,6 +55,7 @@ bool bL_switcher_get_enabled(void); void bL_switcher_put_enabled(void); int bL_switcher_trace_trigger(void); +int bL_switcher_get_logical_index(u32 mpidr); #else static inline int bL_switcher_register_notifier(struct notifier_block *nb) @@ -70,6 +71,7 @@ static inline int bL_switcher_unregister_notifier(struct notifier_block *nb) static inline bool bL_switcher_get_enabled(void) { return false; } static inline void bL_switcher_put_enabled(void) { } static inline int bL_switcher_trace_trigger(void) { return 0; } +static inline int bL_switcher_get_logical_index(u32 mpidr) { return -EUNATCH; } #endif /* CONFIG_BL_SWITCHER */ #endif -- cgit v0.10.2 From 49863894db3ed7bd41541b1c17733273966cea71 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 26 Sep 2013 12:36:35 +0100 Subject: ARM: perf: add support for perf registers API This patch implements the functions required for the perf registers API, allowing the perf tool to interface kernel register dumps with libunwind in order to provide userspace backtracing. Cc: Jean Pihet Signed-off-by: Will Deacon diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 1ad6fb6..899d0c6 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -51,6 +51,8 @@ config ARM select HAVE_MOD_ARCH_SPECIFIC if ARM_UNWIND select HAVE_OPROFILE if (HAVE_PERF_EVENTS) select HAVE_PERF_EVENTS + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_SYSCALL_TRACEPOINTS select HAVE_UID16 diff --git a/arch/arm/include/uapi/asm/Kbuild b/arch/arm/include/uapi/asm/Kbuild index 18d76fd..70a1c9d 100644 --- a/arch/arm/include/uapi/asm/Kbuild +++ b/arch/arm/include/uapi/asm/Kbuild @@ -7,6 +7,7 @@ header-y += hwcap.h header-y += ioctls.h header-y += kvm_para.h header-y += mman.h +header-y += perf_regs.h header-y += posix_types.h header-y += ptrace.h header-y += setup.h diff --git a/arch/arm/include/uapi/asm/perf_regs.h b/arch/arm/include/uapi/asm/perf_regs.h new file mode 100644 index 0000000..ce59448 --- /dev/null +++ b/arch/arm/include/uapi/asm/perf_regs.h @@ -0,0 +1,23 @@ +#ifndef _ASM_ARM_PERF_REGS_H +#define _ASM_ARM_PERF_REGS_H + +enum perf_event_arm_regs { + PERF_REG_ARM_R0, + PERF_REG_ARM_R1, + PERF_REG_ARM_R2, + PERF_REG_ARM_R3, + PERF_REG_ARM_R4, + PERF_REG_ARM_R5, + PERF_REG_ARM_R6, + PERF_REG_ARM_R7, + PERF_REG_ARM_R8, + PERF_REG_ARM_R9, + PERF_REG_ARM_R10, + PERF_REG_ARM_FP, + PERF_REG_ARM_IP, + PERF_REG_ARM_SP, + PERF_REG_ARM_LR, + PERF_REG_ARM_PC, + PERF_REG_ARM_MAX, +}; +#endif /* _ASM_ARM_PERF_REGS_H */ diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index 5140df5f..9b818ca 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile @@ -78,6 +78,7 @@ obj-$(CONFIG_CPU_XSC3) += xscale-cp0.o obj-$(CONFIG_CPU_MOHAWK) += xscale-cp0.o obj-$(CONFIG_CPU_PJ4) += pj4-cp0.o obj-$(CONFIG_IWMMXT) += iwmmxt.o +obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_HW_PERF_EVENTS) += perf_event.o perf_event_cpu.o AFLAGS_iwmmxt.o := -Wa,-mcpu=iwmmxt obj-$(CONFIG_ARM_CPU_TOPOLOGY) += topology.o diff --git a/arch/arm/kernel/perf_regs.c b/arch/arm/kernel/perf_regs.c new file mode 100644 index 0000000..6e4379c --- /dev/null +++ b/arch/arm/kernel/perf_regs.c @@ -0,0 +1,30 @@ + +#include +#include +#include +#include +#include +#include + +u64 perf_reg_value(struct pt_regs *regs, int idx) +{ + if (WARN_ON_ONCE((u32)idx >= PERF_REG_ARM_MAX)) + return 0; + + return regs->uregs[idx]; +} + +#define REG_RESERVED (~((1ULL << PERF_REG_ARM_MAX) - 1)) + +int perf_reg_validate(u64 mask) +{ + if (!mask || mask & REG_RESERVED) + return -EINVAL; + + return 0; +} + +u64 perf_reg_abi(struct task_struct *task) +{ + return PERF_SAMPLE_REGS_ABI_32; +} -- cgit v0.10.2 From 7495f3742dda97612a77d92fa62f85cb7591ab14 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 26 Sep 2013 12:36:36 +0100 Subject: ARM: perf: wire up perf_regs and unwind support for ARM This patch hooks in the perf_regs and libunwind code for ARM. Cc: Jean Pihet Signed-off-by: Will Deacon diff --git a/tools/perf/arch/arm/Makefile b/tools/perf/arch/arm/Makefile index 15130b5..fe9b61e 100644 --- a/tools/perf/arch/arm/Makefile +++ b/tools/perf/arch/arm/Makefile @@ -2,3 +2,6 @@ ifndef NO_DWARF PERF_HAVE_DWARF_REGS := 1 LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o endif +ifndef NO_LIBUNWIND +LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/unwind.o +endif diff --git a/tools/perf/arch/arm/include/perf_regs.h b/tools/perf/arch/arm/include/perf_regs.h new file mode 100644 index 0000000..2a1cfde --- /dev/null +++ b/tools/perf/arch/arm/include/perf_regs.h @@ -0,0 +1,54 @@ +#ifndef ARCH_PERF_REGS_H +#define ARCH_PERF_REGS_H + +#include +#include "../../util/types.h" +#include + +#define PERF_REGS_MASK ((1ULL << PERF_REG_ARM_MAX) - 1) +#define PERF_REG_IP PERF_REG_ARM_PC +#define PERF_REG_SP PERF_REG_ARM_SP + +static inline const char *perf_reg_name(int id) +{ + switch (id) { + case PERF_REG_ARM_R0: + return "r0"; + case PERF_REG_ARM_R1: + return "r1"; + case PERF_REG_ARM_R2: + return "r2"; + case PERF_REG_ARM_R3: + return "r3"; + case PERF_REG_ARM_R4: + return "r4"; + case PERF_REG_ARM_R5: + return "r5"; + case PERF_REG_ARM_R6: + return "r6"; + case PERF_REG_ARM_R7: + return "r7"; + case PERF_REG_ARM_R8: + return "r8"; + case PERF_REG_ARM_R9: + return "r9"; + case PERF_REG_ARM_R10: + return "r10"; + case PERF_REG_ARM_FP: + return "fp"; + case PERF_REG_ARM_IP: + return "ip"; + case PERF_REG_ARM_SP: + return "sp"; + case PERF_REG_ARM_LR: + return "lr"; + case PERF_REG_ARM_PC: + return "pc"; + default: + return NULL; + } + + return NULL; +} + +#endif /* ARCH_PERF_REGS_H */ diff --git a/tools/perf/arch/arm/util/unwind.c b/tools/perf/arch/arm/util/unwind.c new file mode 100644 index 0000000..da3dc95 --- /dev/null +++ b/tools/perf/arch/arm/util/unwind.c @@ -0,0 +1,48 @@ + +#include +#include +#include "perf_regs.h" +#include "../../util/unwind.h" + +int unwind__arch_reg_id(int regnum) +{ + switch (regnum) { + case UNW_ARM_R0: + return PERF_REG_ARM_R0; + case UNW_ARM_R1: + return PERF_REG_ARM_R1; + case UNW_ARM_R2: + return PERF_REG_ARM_R2; + case UNW_ARM_R3: + return PERF_REG_ARM_R3; + case UNW_ARM_R4: + return PERF_REG_ARM_R4; + case UNW_ARM_R5: + return PERF_REG_ARM_R5; + case UNW_ARM_R6: + return PERF_REG_ARM_R6; + case UNW_ARM_R7: + return PERF_REG_ARM_R7; + case UNW_ARM_R8: + return PERF_REG_ARM_R8; + case UNW_ARM_R9: + return PERF_REG_ARM_R9; + case UNW_ARM_R10: + return PERF_REG_ARM_R10; + case UNW_ARM_R11: + return PERF_REG_ARM_FP; + case UNW_ARM_R12: + return PERF_REG_ARM_IP; + case UNW_ARM_R13: + return PERF_REG_ARM_SP; + case UNW_ARM_R14: + return PERF_REG_ARM_LR; + case UNW_ARM_R15: + return PERF_REG_ARM_PC; + default: + pr_err("unwind: invalid reg id %d\n", regnum); + return -EINVAL; + } + + return -EINVAL; +} diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile index 5f6f9b3..4796ce5 100644 --- a/tools/perf/config/Makefile +++ b/tools/perf/config/Makefile @@ -29,6 +29,10 @@ ifeq ($(ARCH),x86_64) NO_PERF_REGS := 0 LIBUNWIND_LIBS = -lunwind -lunwind-x86_64 endif +ifeq ($(ARCH),arm) + NO_PERF_REGS := 0 + LIBUNWIND_LIBS = -lunwind -lunwind-arm +endif ifeq ($(NO_PERF_REGS),0) CFLAGS += -DHAVE_PERF_REGS @@ -208,8 +212,7 @@ ifeq ($(call try-cc,$(SOURCE_ELF_MMAP),$(FLAGS_LIBELF),-DLIBELF_MMAP),y) endif # try-cc endif # NO_LIBELF -# There's only x86 (both 32 and 64) support for CFI unwind so far -ifneq ($(ARCH),x86) +ifeq ($(LIBUNWIND_LIBS),) NO_LIBUNWIND := 1 endif -- cgit v0.10.2 From 405ffbd4988118c56ff127792ebddae77d2c7a43 Mon Sep 17 00:00:00 2001 From: Jean Pihet Date: Thu, 26 Sep 2013 12:36:37 +0100 Subject: perf tools: Check libunwind for availability of dwarf parsing feature The newly added dwarf unwinding feature [1] requires: . a recent version (>= 1.1) of libunwind, . libunwind to be configured with --enable-debug-frame. [1] http://www.spinics.net/lists/kernel/msg1598951.html Add the corresponding API tests in the feature check list. Acked-by: Jiri Olsa Signed-off-by: Jean Pihet Signed-off-by: Will Deacon diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile index 4796ce5..75b93d7 100644 --- a/tools/perf/config/Makefile +++ b/tools/perf/config/Makefile @@ -226,9 +226,13 @@ endif FLAGS_UNWIND=$(LIBUNWIND_CFLAGS) $(CFLAGS) $(LIBUNWIND_LDFLAGS) $(LDFLAGS) $(EXTLIBS) $(LIBUNWIND_LIBS) ifneq ($(call try-cc,$(SOURCE_LIBUNWIND),$(FLAGS_UNWIND),libunwind),y) - msg := $(warning No libunwind found, disabling post unwind support. Please install libunwind-dev[el] >= 0.99); + msg := $(warning No libunwind found, disabling post unwind support. Please install libunwind-dev[el] >= 1.1); NO_LIBUNWIND := 1 endif # Libunwind support +ifneq ($(call try-cc,$(SOURCE_LIBUNWIND_DEBUG_FRAME),$(FLAGS_UNWIND),libunwind debug_frame),y) + msg := $(warning No debug_frame support found in libunwind); +CFLAGS += -DNO_LIBUNWIND_DEBUG_FRAME +endif # debug_frame support in libunwind endif # NO_LIBUNWIND ifndef NO_LIBUNWIND diff --git a/tools/perf/config/feature-tests.mak b/tools/perf/config/feature-tests.mak index d5a8dd4..40b21c0 100644 --- a/tools/perf/config/feature-tests.mak +++ b/tools/perf/config/feature-tests.mak @@ -185,7 +185,6 @@ extern int UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as, unw_proc_info_t *pi, int need_unwind_info, void *arg); - #define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table) int main(void) @@ -197,6 +196,26 @@ int main(void) return 0; } endef + +define SOURCE_LIBUNWIND_DEBUG_FRAME +#include +#include + +extern int +UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug, + unw_word_t ip, unw_word_t segbase, + const char *obj_name, unw_word_t start, + unw_word_t end); + +#define dwarf_find_debug_frame UNW_OBJ(dwarf_find_debug_frame) + +int main(void) +{ + dwarf_find_debug_frame(0, NULL, 0, 0, NULL, 0, 0); + return 0; +} +endef + endif ifndef NO_BACKTRACE -- cgit v0.10.2 From ab255e72204fc5127f0adf40ba103d9335fefa30 Mon Sep 17 00:00:00 2001 From: Jean Pihet Date: Thu, 26 Sep 2013 12:36:38 +0100 Subject: perf: parse the .debug_frame section in case .eh_frame is not present On ARM the debug info is not present in the .eh_frame sections but in .debug_frame instead, in dwarf format. Use libunwind to load and parse the debug info. Dependencies: . if present, libunwind >= 1.1 is needed to prevent a segfault when parsing the dwarf info, . libunwind needs to be configured with --enable-debug-frame. Note: --enable-debug-frame is automatically selected on ARM. Acked-by: Jiri Olsa Signed-off-by: Jean Pihet Signed-off-by: Will Deacon diff --git a/tools/perf/util/unwind.c b/tools/perf/util/unwind.c index 2f891f7..5390d0b 100644 --- a/tools/perf/util/unwind.c +++ b/tools/perf/util/unwind.c @@ -39,6 +39,15 @@ UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as, #define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table) +extern int +UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug, + unw_word_t ip, + unw_word_t segbase, + const char *obj_name, unw_word_t start, + unw_word_t end); + +#define dwarf_find_debug_frame UNW_OBJ(dwarf_find_debug_frame) + #define DW_EH_PE_FORMAT_MASK 0x0f /* format of the encoded value */ #define DW_EH_PE_APPL_MASK 0x70 /* how the value is to be applied */ @@ -245,8 +254,9 @@ static int unwind_spec_ehframe(struct dso *dso, struct machine *machine, return 0; } -static int read_unwind_spec(struct dso *dso, struct machine *machine, - u64 *table_data, u64 *segbase, u64 *fde_count) +static int read_unwind_spec_eh_frame(struct dso *dso, struct machine *machine, + u64 *table_data, u64 *segbase, + u64 *fde_count) { int ret = -EINVAL, fd; u64 offset; @@ -255,6 +265,7 @@ static int read_unwind_spec(struct dso *dso, struct machine *machine, if (fd < 0) return -EINVAL; + /* Check the .eh_frame section for unwinding info */ offset = elf_section_offset(fd, ".eh_frame_hdr"); close(fd); @@ -263,10 +274,29 @@ static int read_unwind_spec(struct dso *dso, struct machine *machine, table_data, segbase, fde_count); - /* TODO .debug_frame check if eh_frame_hdr fails */ return ret; } +#ifndef NO_LIBUNWIND_DEBUG_FRAME +static int read_unwind_spec_debug_frame(struct dso *dso, + struct machine *machine, u64 *offset) +{ + int fd = dso__data_fd(dso, machine); + + if (fd < 0) + return -EINVAL; + + /* Check the .debug_frame section for unwinding info */ + *offset = elf_section_offset(fd, ".debug_frame"); + close(fd); + + if (*offset) + return 0; + + return -EINVAL; +} +#endif + static struct map *find_map(unw_word_t ip, struct unwind_info *ui) { struct addr_location al; @@ -291,20 +321,33 @@ find_proc_info(unw_addr_space_t as, unw_word_t ip, unw_proc_info_t *pi, pr_debug("unwind: find_proc_info dso %s\n", map->dso->name); - if (read_unwind_spec(map->dso, ui->machine, - &table_data, &segbase, &fde_count)) - return -EINVAL; + /* Check the .eh_frame section for unwinding info */ + if (!read_unwind_spec_eh_frame(map->dso, ui->machine, + &table_data, &segbase, &fde_count)) { + memset(&di, 0, sizeof(di)); + di.format = UNW_INFO_FORMAT_REMOTE_TABLE; + di.start_ip = map->start; + di.end_ip = map->end; + di.u.rti.segbase = map->start + segbase; + di.u.rti.table_data = map->start + table_data; + di.u.rti.table_len = fde_count * sizeof(struct table_entry) + / sizeof(unw_word_t); + return dwarf_search_unwind_table(as, ip, &di, pi, + need_unwind_info, arg); + } + +#ifndef NO_LIBUNWIND_DEBUG_FRAME + /* Check the .debug_frame section for unwinding info */ + if (!read_unwind_spec_debug_frame(map->dso, ui->machine, &segbase)) { + memset(&di, 0, sizeof(di)); + dwarf_find_debug_frame(0, &di, ip, 0, map->dso->name, + map->start, map->end); + return dwarf_search_unwind_table(as, ip, &di, pi, + need_unwind_info, arg); + } +#endif - memset(&di, 0, sizeof(di)); - di.format = UNW_INFO_FORMAT_REMOTE_TABLE; - di.start_ip = map->start; - di.end_ip = map->end; - di.u.rti.segbase = map->start + segbase; - di.u.rti.table_data = map->start + table_data; - di.u.rti.table_len = fde_count * sizeof(struct table_entry) - / sizeof(unw_word_t); - return dwarf_search_unwind_table(as, ip, &di, pi, - need_unwind_info, arg); + return -EINVAL; } static int access_fpreg(unw_addr_space_t __maybe_unused as, -- cgit v0.10.2 From e744dff72bcd4d9588af91e1feb662702222ca12 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 26 Jun 2013 16:47:35 +0100 Subject: ARM: prefetch: remove redundant "cc" clobber The pld instruction does not affect the condition flags, so don't bother clobbering them. Acked-by: Nicolas Pitre Signed-off-by: Will Deacon diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h index 413f387..514a989 100644 --- a/arch/arm/include/asm/processor.h +++ b/arch/arm/include/asm/processor.h @@ -97,9 +97,7 @@ static inline void prefetch(const void *ptr) { __asm__ __volatile__( "pld\t%a0" - : - : "p" (ptr) - : "cc"); + :: "p" (ptr)); } #define ARCH_HAS_PREFETCHW -- cgit v0.10.2 From 27a84793e42084392181ef2ef51a954f1cf0c519 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Jul 2013 12:10:42 +0100 Subject: ARM: smp_on_up: move inline asm ALT_SMP patching macro out of spinlock.h Patching UP/SMP alternatives inside inline assembly blocks is useful outside of the spinlock implementation, where it is used for sev and wfe. This patch lifts the macro into processor.h and gives it a scarier name to (a) avoid conflicts in the global namespace and (b) to try and deter its usage unless you "know what you're doing". The W macro for generating wide instructions when targetting Thumb-2 is also made available under the name WASM, to reduce the potential for conflicts with other headers. Acked-by: Nicolas Pitre Signed-off-by: Will Deacon diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h index 514a989..26164c9 100644 --- a/arch/arm/include/asm/processor.h +++ b/arch/arm/include/asm/processor.h @@ -22,6 +22,7 @@ #include #include #include +#include #ifdef __KERNEL__ #define STACK_TOP ((current->personality & ADDR_LIMIT_32BIT) ? \ @@ -87,6 +88,17 @@ unsigned long get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) task_pt_regs(tsk)->ARM_pc #define KSTK_ESP(tsk) task_pt_regs(tsk)->ARM_sp +#ifdef CONFIG_SMP +#define __ALT_SMP_ASM(smp, up) \ + "9998: " smp "\n" \ + " .pushsection \".alt.smp.init\", \"a\"\n" \ + " .long 9998b\n" \ + " " up "\n" \ + " .popsection\n" +#else +#define __ALT_SMP_ASM(smp, up) up +#endif + /* * Prefetching support - only ARMv5. */ diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h index 4f2c280..e1ce452 100644 --- a/arch/arm/include/asm/spinlock.h +++ b/arch/arm/include/asm/spinlock.h @@ -11,15 +11,7 @@ * sev and wfe are ARMv6K extensions. Uniprocessor ARMv6 may not have the K * extensions, so when running on UP, we have to patch these instructions away. */ -#define ALT_SMP(smp, up) \ - "9998: " smp "\n" \ - " .pushsection \".alt.smp.init\", \"a\"\n" \ - " .long 9998b\n" \ - " " up "\n" \ - " .popsection\n" - #ifdef CONFIG_THUMB2_KERNEL -#define SEV ALT_SMP("sev.w", "nop.w") /* * For Thumb-2, special care is needed to ensure that the conditional WFE * instruction really does assemble to exactly 4 bytes (as required by @@ -31,17 +23,18 @@ * the assembler won't change IT instructions which are explicitly present * in the input. */ -#define WFE(cond) ALT_SMP( \ +#define WFE(cond) __ALT_SMP_ASM( \ "it " cond "\n\t" \ "wfe" cond ".n", \ \ "nop.w" \ ) #else -#define SEV ALT_SMP("sev", "nop") -#define WFE(cond) ALT_SMP("wfe" cond, "nop") +#define WFE(cond) __ALT_SMP_ASM("wfe" cond, "nop") #endif +#define SEV __ALT_SMP_ASM(WASM(sev), WASM(nop)) + static inline void dsb_sev(void) { #if __LINUX_ARM_ARCH__ >= 7 diff --git a/arch/arm/include/asm/unified.h b/arch/arm/include/asm/unified.h index f5989f4..b88beab 100644 --- a/arch/arm/include/asm/unified.h +++ b/arch/arm/include/asm/unified.h @@ -38,6 +38,8 @@ #ifdef __ASSEMBLY__ #define W(instr) instr.w #define BSYM(sym) sym + 1 +#else +#define WASM(instr) #instr ".w" #endif #else /* !CONFIG_THUMB2_KERNEL */ @@ -50,6 +52,8 @@ #ifdef __ASSEMBLY__ #define W(instr) instr #define BSYM(sym) sym +#else +#define WASM(instr) #instr #endif #endif /* CONFIG_THUMB2_KERNEL */ -- cgit v0.10.2 From d8f57aa4bc5860df68d4c332d2a89c131417ee7b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 26 Jun 2013 17:03:40 +0100 Subject: ARM: prefetch: add support for prefetchw using pldw on SMP ARMv7+ CPUs SMP ARMv7 CPUs implement the pldw instruction, which allows them to prefetch data cachelines in an exclusive state. This patch defines the prefetchw macro using pldw for CPUs that support it. Acked-by: Nicolas Pitre Signed-off-by: Will Deacon diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h index 26164c9..c3d5fc1 100644 --- a/arch/arm/include/asm/processor.h +++ b/arch/arm/include/asm/processor.h @@ -112,12 +112,19 @@ static inline void prefetch(const void *ptr) :: "p" (ptr)); } +#if __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP) #define ARCH_HAS_PREFETCHW -#define prefetchw(ptr) prefetch(ptr) - -#define ARCH_HAS_SPINLOCK_PREFETCH -#define spin_lock_prefetch(x) do { } while (0) - +static inline void prefetchw(const void *ptr) +{ + __asm__ __volatile__( + ".arch_extension mp\n" + __ALT_SMP_ASM( + WASM(pldw) "\t%a0", + WASM(pld) "\t%a0" + ) + :: "p" (ptr)); +} +#endif #endif #define HAVE_ARCH_PICK_MMAP_LAYOUT -- cgit v0.10.2 From 9bb17be062de6f5a9c9643258951aa0935652ec3 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Jul 2013 14:54:33 +0100 Subject: ARM: locks: prefetch the destination word for write prior to strex The cost of changing a cacheline from shared to exclusive state can be significant, especially when this is triggered by an exclusive store, since it may result in having to retry the transaction. This patch prefixes our {spin,read,write}_[try]lock implementations with pldw instructions (on CPUs which support them) to try and grab the line in exclusive state from the start. arch_rwlock_t is changed to avoid using a volatile member, since this generates compiler warnings when falling back on the __builtin_prefetch intrinsic which expects a const void * argument. Acked-by: Nicolas Pitre Signed-off-by: Will Deacon diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h index e1ce452..4999007 100644 --- a/arch/arm/include/asm/spinlock.h +++ b/arch/arm/include/asm/spinlock.h @@ -5,7 +5,7 @@ #error SMP not supported on pre-ARMv6 CPUs #endif -#include +#include /* * sev and wfe are ARMv6K extensions. Uniprocessor ARMv6 may not have the K @@ -70,6 +70,7 @@ static inline void arch_spin_lock(arch_spinlock_t *lock) u32 newval; arch_spinlock_t lockval; + prefetchw(&lock->slock); __asm__ __volatile__( "1: ldrex %0, [%3]\n" " add %1, %0, %4\n" @@ -93,6 +94,7 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock) unsigned long contended, res; u32 slock; + prefetchw(&lock->slock); do { __asm__ __volatile__( " ldrex %0, [%3]\n" @@ -145,6 +147,7 @@ static inline void arch_write_lock(arch_rwlock_t *rw) { unsigned long tmp; + prefetchw(&rw->lock); __asm__ __volatile__( "1: ldrex %0, [%1]\n" " teq %0, #0\n" @@ -163,6 +166,7 @@ static inline int arch_write_trylock(arch_rwlock_t *rw) { unsigned long contended, res; + prefetchw(&rw->lock); do { __asm__ __volatile__( " ldrex %0, [%2]\n" @@ -196,7 +200,7 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) } /* write_can_lock - would write_trylock() succeed? */ -#define arch_write_can_lock(x) ((x)->lock == 0) +#define arch_write_can_lock(x) (ACCESS_ONCE((x)->lock) == 0) /* * Read locks are a bit more hairy: @@ -214,6 +218,7 @@ static inline void arch_read_lock(arch_rwlock_t *rw) { unsigned long tmp, tmp2; + prefetchw(&rw->lock); __asm__ __volatile__( "1: ldrex %0, [%2]\n" " adds %0, %0, #1\n" @@ -234,6 +239,7 @@ static inline void arch_read_unlock(arch_rwlock_t *rw) smp_mb(); + prefetchw(&rw->lock); __asm__ __volatile__( "1: ldrex %0, [%2]\n" " sub %0, %0, #1\n" @@ -252,6 +258,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) { unsigned long contended, res; + prefetchw(&rw->lock); do { __asm__ __volatile__( " ldrex %0, [%2]\n" @@ -273,7 +280,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) } /* read_can_lock - would read_trylock() succeed? */ -#define arch_read_can_lock(x) ((x)->lock < 0x80000000) +#define arch_read_can_lock(x) (ACCESS_ONCE((x)->lock) < 0x80000000) #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) diff --git a/arch/arm/include/asm/spinlock_types.h b/arch/arm/include/asm/spinlock_types.h index b262d2f..47663fc 100644 --- a/arch/arm/include/asm/spinlock_types.h +++ b/arch/arm/include/asm/spinlock_types.h @@ -25,7 +25,7 @@ typedef struct { #define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } } typedef struct { - volatile unsigned int lock; + u32 lock; } arch_rwlock_t; #define __ARCH_RW_LOCK_UNLOCKED { 0 } -- cgit v0.10.2 From f38d999c4d16fc0fce4270374f15fbb2d8713c09 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 4 Jul 2013 11:43:18 +0100 Subject: ARM: atomics: prefetch the destination word for write prior to strex The cost of changing a cacheline from shared to exclusive state can be significant, especially when this is triggered by an exclusive store, since it may result in having to retry the transaction. This patch prefixes our atomic access implementations with pldw instructions (on CPUs which support them) to try and grab the line in exclusive state from the start. Only the barrier-less functions are updated, since memory barriers can limit the usefulness of prefetching data. Acked-by: Nicolas Pitre Signed-off-by: Will Deacon diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h index da1c77d..55ffc3b 100644 --- a/arch/arm/include/asm/atomic.h +++ b/arch/arm/include/asm/atomic.h @@ -12,6 +12,7 @@ #define __ASM_ARM_ATOMIC_H #include +#include #include #include #include @@ -41,6 +42,7 @@ static inline void atomic_add(int i, atomic_t *v) unsigned long tmp; int result; + prefetchw(&v->counter); __asm__ __volatile__("@ atomic_add\n" "1: ldrex %0, [%3]\n" " add %0, %0, %4\n" @@ -79,6 +81,7 @@ static inline void atomic_sub(int i, atomic_t *v) unsigned long tmp; int result; + prefetchw(&v->counter); __asm__ __volatile__("@ atomic_sub\n" "1: ldrex %0, [%3]\n" " sub %0, %0, %4\n" @@ -138,6 +141,7 @@ static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr) { unsigned long tmp, tmp2; + prefetchw(addr); __asm__ __volatile__("@ atomic_clear_mask\n" "1: ldrex %0, [%3]\n" " bic %0, %0, %4\n" @@ -283,6 +287,7 @@ static inline void atomic64_set(atomic64_t *v, u64 i) { u64 tmp; + prefetchw(&v->counter); __asm__ __volatile__("@ atomic64_set\n" "1: ldrexd %0, %H0, [%2]\n" " strexd %0, %3, %H3, [%2]\n" @@ -299,6 +304,7 @@ static inline void atomic64_add(u64 i, atomic64_t *v) u64 result; unsigned long tmp; + prefetchw(&v->counter); __asm__ __volatile__("@ atomic64_add\n" "1: ldrexd %0, %H0, [%3]\n" " adds %0, %0, %4\n" @@ -339,6 +345,7 @@ static inline void atomic64_sub(u64 i, atomic64_t *v) u64 result; unsigned long tmp; + prefetchw(&v->counter); __asm__ __volatile__("@ atomic64_sub\n" "1: ldrexd %0, %H0, [%3]\n" " subs %0, %0, %4\n" -- cgit v0.10.2 From d779c07dd72098a7416d907494f958213b7726f3 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 27 Jun 2013 12:01:51 +0100 Subject: ARM: bitops: prefetch the destination word for write prior to strex The cost of changing a cacheline from shared to exclusive state can be significant, especially when this is triggered by an exclusive store, since it may result in having to retry the transaction. This patch prefixes our atomic bitops implementation with prefetchw, to try and grab the line in exclusive state from the start. The testop macro is left alone, since the barrier semantics limit the usefulness of prefetching data. Acked-by: Nicolas Pitre Signed-off-by: Will Deacon diff --git a/arch/arm/lib/bitops.h b/arch/arm/lib/bitops.h index d6408d1..e0c68d5 100644 --- a/arch/arm/lib/bitops.h +++ b/arch/arm/lib/bitops.h @@ -10,6 +10,11 @@ UNWIND( .fnstart ) and r3, r0, #31 @ Get bit offset mov r0, r0, lsr #5 add r1, r1, r0, lsl #2 @ Get word offset +#if __LINUX_ARM_ARCH__ >= 7 + .arch_extension mp + ALT_SMP(W(pldw) [r1]) + ALT_UP(W(nop)) +#endif mov r3, r2, lsl r3 1: ldrex r2, [r1] \instr r2, r2, r3 -- cgit v0.10.2 From cf154b7e22e9f6714f0a806c023c329b890132a2 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 20 Sep 2013 09:57:37 +0200 Subject: ARM: pull in from asm-generic Signed-off-by: Ard Biesheuvel diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index d3db398..6577b8a 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -24,6 +24,7 @@ generic-y += sembuf.h generic-y += serial.h generic-y += shmbuf.h generic-y += siginfo.h +generic-y += simd.h generic-y += sizes.h generic-y += socket.h generic-y += sockios.h -- cgit v0.10.2 From 5ce26f3b5ae8fafdd28375a004f7b8e924e9bacb Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 15 Sep 2013 17:10:43 +0200 Subject: ARM: move AES typedefs and function prototypes to separate header Put the struct definitions for AES keys and the asm function prototypes in a separate header and export the asm functions from the module. This allows other drivers to use them directly. Signed-off-by: Ard Biesheuvel diff --git a/arch/arm/crypto/aes_glue.c b/arch/arm/crypto/aes_glue.c index 59f7877..3003fa1 100644 --- a/arch/arm/crypto/aes_glue.c +++ b/arch/arm/crypto/aes_glue.c @@ -6,22 +6,12 @@ #include #include -#define AES_MAXNR 14 +#include "aes_glue.h" -typedef struct { - unsigned int rd_key[4 *(AES_MAXNR + 1)]; - int rounds; -} AES_KEY; - -struct AES_CTX { - AES_KEY enc_key; - AES_KEY dec_key; -}; - -asmlinkage void AES_encrypt(const u8 *in, u8 *out, AES_KEY *ctx); -asmlinkage void AES_decrypt(const u8 *in, u8 *out, AES_KEY *ctx); -asmlinkage int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key); -asmlinkage int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key); +EXPORT_SYMBOL(AES_encrypt); +EXPORT_SYMBOL(AES_decrypt); +EXPORT_SYMBOL(private_AES_set_encrypt_key); +EXPORT_SYMBOL(private_AES_set_decrypt_key); static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { @@ -81,7 +71,7 @@ static struct crypto_alg aes_alg = { .cipher = { .cia_min_keysize = AES_MIN_KEY_SIZE, .cia_max_keysize = AES_MAX_KEY_SIZE, - .cia_setkey = aes_set_key, + .cia_setkey = aes_set_key, .cia_encrypt = aes_encrypt, .cia_decrypt = aes_decrypt } diff --git a/arch/arm/crypto/aes_glue.h b/arch/arm/crypto/aes_glue.h new file mode 100644 index 0000000..cca3e51 --- /dev/null +++ b/arch/arm/crypto/aes_glue.h @@ -0,0 +1,19 @@ + +#define AES_MAXNR 14 + +struct AES_KEY { + unsigned int rd_key[4 * (AES_MAXNR + 1)]; + int rounds; +}; + +struct AES_CTX { + struct AES_KEY enc_key; + struct AES_KEY dec_key; +}; + +asmlinkage void AES_encrypt(const u8 *in, u8 *out, struct AES_KEY *ctx); +asmlinkage void AES_decrypt(const u8 *in, u8 *out, struct AES_KEY *ctx); +asmlinkage int private_AES_set_decrypt_key(const unsigned char *userKey, + const int bits, struct AES_KEY *key); +asmlinkage int private_AES_set_encrypt_key(const unsigned char *userKey, + const int bits, struct AES_KEY *key); -- cgit v0.10.2 From e4e7f10bfc4069925e99cc4b428c3434e30b6c3f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 16 Sep 2013 18:31:38 +0200 Subject: ARM: add support for bit sliced AES using NEON instructions Bit sliced AES gives around 45% speedup on Cortex-A15 for encryption and around 25% for decryption. This implementation of the AES algorithm does not rely on any lookup tables so it is believed to be invulnerable to cache timing attacks. This algorithm processes up to 8 blocks in parallel in constant time. This means that it is not usable by chaining modes that are strictly sequential in nature, such as CBC encryption. CBC decryption, however, can benefit from this implementation and runs about 25% faster. The other chaining modes implemented in this module, XTS and CTR, can execute fully in parallel in both directions. The core code has been adopted from the OpenSSL project (in collaboration with the original author, on cc). For ease of maintenance, this version is identical to the upstream OpenSSL code, i.e., all modifications that were required to make it suitable for inclusion into the kernel have been made upstream. The original can be found here: http://git.openssl.org/gitweb/?p=openssl.git;a=commit;h=6f6a6130 Note to integrators: While this implementation is significantly faster than the existing table based ones (generic or ARM asm), especially in CTR mode, the effects on power efficiency are unclear as of yet. This code does fundamentally more work, by calculating values that the table based code obtains by a simple lookup; only by doing all of that work in a SIMD fashion, it manages to perform better. Cc: Andy Polyakov Acked-by: Nicolas Pitre Signed-off-by: Ard Biesheuvel diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile index a2c8385..81cda39 100644 --- a/arch/arm/crypto/Makefile +++ b/arch/arm/crypto/Makefile @@ -3,7 +3,17 @@ # obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o +obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o -aes-arm-y := aes-armv4.o aes_glue.o -sha1-arm-y := sha1-armv4-large.o sha1_glue.o +aes-arm-y := aes-armv4.o aes_glue.o +aes-arm-bs-y := aesbs-core.o aesbs-glue.o +sha1-arm-y := sha1-armv4-large.o sha1_glue.o + +quiet_cmd_perl = PERL $@ + cmd_perl = $(PERL) $(<) > $(@) + +$(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl + $(call cmd,perl) + +.PRECIOUS: $(obj)/aesbs-core.S diff --git a/arch/arm/crypto/aesbs-core.S_shipped b/arch/arm/crypto/aesbs-core.S_shipped new file mode 100644 index 0000000..64205d4 --- /dev/null +++ b/arch/arm/crypto/aesbs-core.S_shipped @@ -0,0 +1,2544 @@ + +@ ==================================================================== +@ Written by Andy Polyakov for the OpenSSL +@ project. The module is, however, dual licensed under OpenSSL and +@ CRYPTOGAMS licenses depending on where you obtain it. For further +@ details see http://www.openssl.org/~appro/cryptogams/. +@ +@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel +@ . Permission to use under GPL terms is +@ granted. +@ ==================================================================== + +@ Bit-sliced AES for ARM NEON +@ +@ February 2012. +@ +@ This implementation is direct adaptation of bsaes-x86_64 module for +@ ARM NEON. Except that this module is endian-neutral [in sense that +@ it can be compiled for either endianness] by courtesy of vld1.8's +@ neutrality. Initial version doesn't implement interface to OpenSSL, +@ only low-level primitives and unsupported entry points, just enough +@ to collect performance results, which for Cortex-A8 core are: +@ +@ encrypt 19.5 cycles per byte processed with 128-bit key +@ decrypt 22.1 cycles per byte processed with 128-bit key +@ key conv. 440 cycles per 128-bit key/0.18 of 8x block +@ +@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7, +@ which is [much] worse than anticipated (for further details see +@ http://www.openssl.org/~appro/Snapdragon-S4.html). +@ +@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code +@ manages in 20.0 cycles]. +@ +@ When comparing to x86_64 results keep in mind that NEON unit is +@ [mostly] single-issue and thus can't [fully] benefit from +@ instruction-level parallelism. And when comparing to aes-armv4 +@ results keep in mind key schedule conversion overhead (see +@ bsaes-x86_64.pl for further details)... +@ +@ + +@ April-August 2013 +@ +@ Add CBC, CTR and XTS subroutines, adapt for kernel use. +@ +@ + +#ifndef __KERNEL__ +# include "arm_arch.h" + +# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} +# define VFP_ABI_POP vldmia sp!,{d8-d15} +# define VFP_ABI_FRAME 0x40 +#else +# define VFP_ABI_PUSH +# define VFP_ABI_POP +# define VFP_ABI_FRAME 0 +# define BSAES_ASM_EXTENDED_KEY +# define XTS_CHAIN_TWEAK +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +#endif + +#ifdef __thumb__ +# define adrl adr +#endif + +#if __ARM_ARCH__>=7 +.text +.syntax unified @ ARMv7-capable assembler is expected to handle this +#ifdef __thumb2__ +.thumb +#else +.code 32 +#endif + +.fpu neon + +.type _bsaes_decrypt8,%function +.align 4 +_bsaes_decrypt8: + adr r6,_bsaes_decrypt8 + vldmia r4!, {q9} @ round 0 key + add r6,r6,#.LM0ISR-_bsaes_decrypt8 + + vldmia r6!, {q8} @ .LM0ISR + veor q10, q0, q9 @ xor with round0 key + veor q11, q1, q9 + vtbl.8 d0, {q10}, d16 + vtbl.8 d1, {q10}, d17 + veor q12, q2, q9 + vtbl.8 d2, {q11}, d16 + vtbl.8 d3, {q11}, d17 + veor q13, q3, q9 + vtbl.8 d4, {q12}, d16 + vtbl.8 d5, {q12}, d17 + veor q14, q4, q9 + vtbl.8 d6, {q13}, d16 + vtbl.8 d7, {q13}, d17 + veor q15, q5, q9 + vtbl.8 d8, {q14}, d16 + vtbl.8 d9, {q14}, d17 + veor q10, q6, q9 + vtbl.8 d10, {q15}, d16 + vtbl.8 d11, {q15}, d17 + veor q11, q7, q9 + vtbl.8 d12, {q10}, d16 + vtbl.8 d13, {q10}, d17 + vtbl.8 d14, {q11}, d16 + vtbl.8 d15, {q11}, d17 + vmov.i8 q8,#0x55 @ compose .LBS0 + vmov.i8 q9,#0x33 @ compose .LBS1 + vshr.u64 q10, q6, #1 + vshr.u64 q11, q4, #1 + veor q10, q10, q7 + veor q11, q11, q5 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #1 + veor q5, q5, q11 + vshl.u64 q11, q11, #1 + veor q6, q6, q10 + veor q4, q4, q11 + vshr.u64 q10, q2, #1 + vshr.u64 q11, q0, #1 + veor q10, q10, q3 + veor q11, q11, q1 + vand q10, q10, q8 + vand q11, q11, q8 + veor q3, q3, q10 + vshl.u64 q10, q10, #1 + veor q1, q1, q11 + vshl.u64 q11, q11, #1 + veor q2, q2, q10 + veor q0, q0, q11 + vmov.i8 q8,#0x0f @ compose .LBS2 + vshr.u64 q10, q5, #2 + vshr.u64 q11, q4, #2 + veor q10, q10, q7 + veor q11, q11, q6 + vand q10, q10, q9 + vand q11, q11, q9 + veor q7, q7, q10 + vshl.u64 q10, q10, #2 + veor q6, q6, q11 + vshl.u64 q11, q11, #2 + veor q5, q5, q10 + veor q4, q4, q11 + vshr.u64 q10, q1, #2 + vshr.u64 q11, q0, #2 + veor q10, q10, q3 + veor q11, q11, q2 + vand q10, q10, q9 + vand q11, q11, q9 + veor q3, q3, q10 + vshl.u64 q10, q10, #2 + veor q2, q2, q11 + vshl.u64 q11, q11, #2 + veor q1, q1, q10 + veor q0, q0, q11 + vshr.u64 q10, q3, #4 + vshr.u64 q11, q2, #4 + veor q10, q10, q7 + veor q11, q11, q6 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #4 + veor q6, q6, q11 + vshl.u64 q11, q11, #4 + veor q3, q3, q10 + veor q2, q2, q11 + vshr.u64 q10, q1, #4 + vshr.u64 q11, q0, #4 + veor q10, q10, q5 + veor q11, q11, q4 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #4 + veor q4, q4, q11 + vshl.u64 q11, q11, #4 + veor q1, q1, q10 + veor q0, q0, q11 + sub r5,r5,#1 + b .Ldec_sbox +.align 4 +.Ldec_loop: + vldmia r4!, {q8-q11} + veor q8, q8, q0 + veor q9, q9, q1 + vtbl.8 d0, {q8}, d24 + vtbl.8 d1, {q8}, d25 + vldmia r4!, {q8} + veor q10, q10, q2 + vtbl.8 d2, {q9}, d24 + vtbl.8 d3, {q9}, d25 + vldmia r4!, {q9} + veor q11, q11, q3 + vtbl.8 d4, {q10}, d24 + vtbl.8 d5, {q10}, d25 + vldmia r4!, {q10} + vtbl.8 d6, {q11}, d24 + vtbl.8 d7, {q11}, d25 + vldmia r4!, {q11} + veor q8, q8, q4 + veor q9, q9, q5 + vtbl.8 d8, {q8}, d24 + vtbl.8 d9, {q8}, d25 + veor q10, q10, q6 + vtbl.8 d10, {q9}, d24 + vtbl.8 d11, {q9}, d25 + veor q11, q11, q7 + vtbl.8 d12, {q10}, d24 + vtbl.8 d13, {q10}, d25 + vtbl.8 d14, {q11}, d24 + vtbl.8 d15, {q11}, d25 +.Ldec_sbox: + veor q1, q1, q4 + veor q3, q3, q4 + + veor q4, q4, q7 + veor q1, q1, q6 + veor q2, q2, q7 + veor q6, q6, q4 + + veor q0, q0, q1 + veor q2, q2, q5 + veor q7, q7, q6 + veor q3, q3, q0 + veor q5, q5, q0 + veor q1, q1, q3 + veor q11, q3, q0 + veor q10, q7, q4 + veor q9, q1, q6 + veor q13, q4, q0 + vmov q8, q10 + veor q12, q5, q2 + + vorr q10, q10, q9 + veor q15, q11, q8 + vand q14, q11, q12 + vorr q11, q11, q12 + veor q12, q12, q9 + vand q8, q8, q9 + veor q9, q6, q2 + vand q15, q15, q12 + vand q13, q13, q9 + veor q9, q3, q7 + veor q12, q1, q5 + veor q11, q11, q13 + veor q10, q10, q13 + vand q13, q9, q12 + vorr q9, q9, q12 + veor q11, q11, q15 + veor q8, q8, q13 + veor q10, q10, q14 + veor q9, q9, q15 + veor q8, q8, q14 + vand q12, q4, q6 + veor q9, q9, q14 + vand q13, q0, q2 + vand q14, q7, q1 + vorr q15, q3, q5 + veor q11, q11, q12 + veor q9, q9, q14 + veor q8, q8, q15 + veor q10, q10, q13 + + @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3 + + @ new smaller inversion + + vand q14, q11, q9 + vmov q12, q8 + + veor q13, q10, q14 + veor q15, q8, q14 + veor q14, q8, q14 @ q14=q15 + + vbsl q13, q9, q8 + vbsl q15, q11, q10 + veor q11, q11, q10 + + vbsl q12, q13, q14 + vbsl q8, q14, q13 + + vand q14, q12, q15 + veor q9, q9, q8 + + veor q14, q14, q11 + veor q12, q5, q2 + veor q8, q1, q6 + veor q10, q15, q14 + vand q10, q10, q5 + veor q5, q5, q1 + vand q11, q1, q15 + vand q5, q5, q14 + veor q1, q11, q10 + veor q5, q5, q11 + veor q15, q15, q13 + veor q14, q14, q9 + veor q11, q15, q14 + veor q10, q13, q9 + vand q11, q11, q12 + vand q10, q10, q2 + veor q12, q12, q8 + veor q2, q2, q6 + vand q8, q8, q15 + vand q6, q6, q13 + vand q12, q12, q14 + vand q2, q2, q9 + veor q8, q8, q12 + veor q2, q2, q6 + veor q12, q12, q11 + veor q6, q6, q10 + veor q5, q5, q12 + veor q2, q2, q12 + veor q1, q1, q8 + veor q6, q6, q8 + + veor q12, q3, q0 + veor q8, q7, q4 + veor q11, q15, q14 + veor q10, q13, q9 + vand q11, q11, q12 + vand q10, q10, q0 + veor q12, q12, q8 + veor q0, q0, q4 + vand q8, q8, q15 + vand q4, q4, q13 + vand q12, q12, q14 + vand q0, q0, q9 + veor q8, q8, q12 + veor q0, q0, q4 + veor q12, q12, q11 + veor q4, q4, q10 + veor q15, q15, q13 + veor q14, q14, q9 + veor q10, q15, q14 + vand q10, q10, q3 + veor q3, q3, q7 + vand q11, q7, q15 + vand q3, q3, q14 + veor q7, q11, q10 + veor q3, q3, q11 + veor q3, q3, q12 + veor q0, q0, q12 + veor q7, q7, q8 + veor q4, q4, q8 + veor q1, q1, q7 + veor q6, q6, q5 + + veor q4, q4, q1 + veor q2, q2, q7 + veor q5, q5, q7 + veor q4, q4, q2 + veor q7, q7, q0 + veor q4, q4, q5 + veor q3, q3, q6 + veor q6, q6, q1 + veor q3, q3, q4 + + veor q4, q4, q0 + veor q7, q7, q3 + subs r5,r5,#1 + bcc .Ldec_done + @ multiplication by 0x05-0x00-0x04-0x00 + vext.8 q8, q0, q0, #8 + vext.8 q14, q3, q3, #8 + vext.8 q15, q5, q5, #8 + veor q8, q8, q0 + vext.8 q9, q1, q1, #8 + veor q14, q14, q3 + vext.8 q10, q6, q6, #8 + veor q15, q15, q5 + vext.8 q11, q4, q4, #8 + veor q9, q9, q1 + vext.8 q12, q2, q2, #8 + veor q10, q10, q6 + vext.8 q13, q7, q7, #8 + veor q11, q11, q4 + veor q12, q12, q2 + veor q13, q13, q7 + + veor q0, q0, q14 + veor q1, q1, q14 + veor q6, q6, q8 + veor q2, q2, q10 + veor q4, q4, q9 + veor q1, q1, q15 + veor q6, q6, q15 + veor q2, q2, q14 + veor q7, q7, q11 + veor q4, q4, q14 + veor q3, q3, q12 + veor q2, q2, q15 + veor q7, q7, q15 + veor q5, q5, q13 + vext.8 q8, q0, q0, #12 @ x0 <<< 32 + vext.8 q9, q1, q1, #12 + veor q0, q0, q8 @ x0 ^ (x0 <<< 32) + vext.8 q10, q6, q6, #12 + veor q1, q1, q9 + vext.8 q11, q4, q4, #12 + veor q6, q6, q10 + vext.8 q12, q2, q2, #12 + veor q4, q4, q11 + vext.8 q13, q7, q7, #12 + veor q2, q2, q12 + vext.8 q14, q3, q3, #12 + veor q7, q7, q13 + vext.8 q15, q5, q5, #12 + veor q3, q3, q14 + + veor q9, q9, q0 + veor q5, q5, q15 + vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64) + veor q10, q10, q1 + veor q8, q8, q5 + veor q9, q9, q5 + vext.8 q1, q1, q1, #8 + veor q13, q13, q2 + veor q0, q0, q8 + veor q14, q14, q7 + veor q1, q1, q9 + vext.8 q8, q2, q2, #8 + veor q12, q12, q4 + vext.8 q9, q7, q7, #8 + veor q15, q15, q3 + vext.8 q2, q4, q4, #8 + veor q11, q11, q6 + vext.8 q7, q5, q5, #8 + veor q12, q12, q5 + vext.8 q4, q3, q3, #8 + veor q11, q11, q5 + vext.8 q3, q6, q6, #8 + veor q5, q9, q13 + veor q11, q11, q2 + veor q7, q7, q15 + veor q6, q4, q14 + veor q4, q8, q12 + veor q2, q3, q10 + vmov q3, q11 + @ vmov q5, q9 + vldmia r6, {q12} @ .LISR + ite eq @ Thumb2 thing, sanity check in ARM + addeq r6,r6,#0x10 + bne .Ldec_loop + vldmia r6, {q12} @ .LISRM0 + b .Ldec_loop +.align 4 +.Ldec_done: + vmov.i8 q8,#0x55 @ compose .LBS0 + vmov.i8 q9,#0x33 @ compose .LBS1 + vshr.u64 q10, q3, #1 + vshr.u64 q11, q2, #1 + veor q10, q10, q5 + veor q11, q11, q7 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #1 + veor q7, q7, q11 + vshl.u64 q11, q11, #1 + veor q3, q3, q10 + veor q2, q2, q11 + vshr.u64 q10, q6, #1 + vshr.u64 q11, q0, #1 + veor q10, q10, q4 + veor q11, q11, q1 + vand q10, q10, q8 + vand q11, q11, q8 + veor q4, q4, q10 + vshl.u64 q10, q10, #1 + veor q1, q1, q11 + vshl.u64 q11, q11, #1 + veor q6, q6, q10 + veor q0, q0, q11 + vmov.i8 q8,#0x0f @ compose .LBS2 + vshr.u64 q10, q7, #2 + vshr.u64 q11, q2, #2 + veor q10, q10, q5 + veor q11, q11, q3 + vand q10, q10, q9 + vand q11, q11, q9 + veor q5, q5, q10 + vshl.u64 q10, q10, #2 + veor q3, q3, q11 + vshl.u64 q11, q11, #2 + veor q7, q7, q10 + veor q2, q2, q11 + vshr.u64 q10, q1, #2 + vshr.u64 q11, q0, #2 + veor q10, q10, q4 + veor q11, q11, q6 + vand q10, q10, q9 + vand q11, q11, q9 + veor q4, q4, q10 + vshl.u64 q10, q10, #2 + veor q6, q6, q11 + vshl.u64 q11, q11, #2 + veor q1, q1, q10 + veor q0, q0, q11 + vshr.u64 q10, q4, #4 + vshr.u64 q11, q6, #4 + veor q10, q10, q5 + veor q11, q11, q3 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #4 + veor q3, q3, q11 + vshl.u64 q11, q11, #4 + veor q4, q4, q10 + veor q6, q6, q11 + vshr.u64 q10, q1, #4 + vshr.u64 q11, q0, #4 + veor q10, q10, q7 + veor q11, q11, q2 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #4 + veor q2, q2, q11 + vshl.u64 q11, q11, #4 + veor q1, q1, q10 + veor q0, q0, q11 + vldmia r4, {q8} @ last round key + veor q6, q6, q8 + veor q4, q4, q8 + veor q2, q2, q8 + veor q7, q7, q8 + veor q3, q3, q8 + veor q5, q5, q8 + veor q0, q0, q8 + veor q1, q1, q8 + bx lr +.size _bsaes_decrypt8,.-_bsaes_decrypt8 + +.type _bsaes_const,%object +.align 6 +_bsaes_const: +.LM0ISR: @ InvShiftRows constants + .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 +.LISR: + .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 +.LISRM0: + .quad 0x01040b0e0205080f, 0x0306090c00070a0d +.LM0SR: @ ShiftRows constants + .quad 0x0a0e02060f03070b, 0x0004080c05090d01 +.LSR: + .quad 0x0504070600030201, 0x0f0e0d0c0a09080b +.LSRM0: + .quad 0x0304090e00050a0f, 0x01060b0c0207080d +.LM0: + .quad 0x02060a0e03070b0f, 0x0004080c0105090d +.LREVM0SR: + .quad 0x090d01050c000408, 0x03070b0f060a0e02 +.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by " +.align 6 +.size _bsaes_const,.-_bsaes_const + +.type _bsaes_encrypt8,%function +.align 4 +_bsaes_encrypt8: + adr r6,_bsaes_encrypt8 + vldmia r4!, {q9} @ round 0 key + sub r6,r6,#_bsaes_encrypt8-.LM0SR + + vldmia r6!, {q8} @ .LM0SR +_bsaes_encrypt8_alt: + veor q10, q0, q9 @ xor with round0 key + veor q11, q1, q9 + vtbl.8 d0, {q10}, d16 + vtbl.8 d1, {q10}, d17 + veor q12, q2, q9 + vtbl.8 d2, {q11}, d16 + vtbl.8 d3, {q11}, d17 + veor q13, q3, q9 + vtbl.8 d4, {q12}, d16 + vtbl.8 d5, {q12}, d17 + veor q14, q4, q9 + vtbl.8 d6, {q13}, d16 + vtbl.8 d7, {q13}, d17 + veor q15, q5, q9 + vtbl.8 d8, {q14}, d16 + vtbl.8 d9, {q14}, d17 + veor q10, q6, q9 + vtbl.8 d10, {q15}, d16 + vtbl.8 d11, {q15}, d17 + veor q11, q7, q9 + vtbl.8 d12, {q10}, d16 + vtbl.8 d13, {q10}, d17 + vtbl.8 d14, {q11}, d16 + vtbl.8 d15, {q11}, d17 +_bsaes_encrypt8_bitslice: + vmov.i8 q8,#0x55 @ compose .LBS0 + vmov.i8 q9,#0x33 @ compose .LBS1 + vshr.u64 q10, q6, #1 + vshr.u64 q11, q4, #1 + veor q10, q10, q7 + veor q11, q11, q5 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #1 + veor q5, q5, q11 + vshl.u64 q11, q11, #1 + veor q6, q6, q10 + veor q4, q4, q11 + vshr.u64 q10, q2, #1 + vshr.u64 q11, q0, #1 + veor q10, q10, q3 + veor q11, q11, q1 + vand q10, q10, q8 + vand q11, q11, q8 + veor q3, q3, q10 + vshl.u64 q10, q10, #1 + veor q1, q1, q11 + vshl.u64 q11, q11, #1 + veor q2, q2, q10 + veor q0, q0, q11 + vmov.i8 q8,#0x0f @ compose .LBS2 + vshr.u64 q10, q5, #2 + vshr.u64 q11, q4, #2 + veor q10, q10, q7 + veor q11, q11, q6 + vand q10, q10, q9 + vand q11, q11, q9 + veor q7, q7, q10 + vshl.u64 q10, q10, #2 + veor q6, q6, q11 + vshl.u64 q11, q11, #2 + veor q5, q5, q10 + veor q4, q4, q11 + vshr.u64 q10, q1, #2 + vshr.u64 q11, q0, #2 + veor q10, q10, q3 + veor q11, q11, q2 + vand q10, q10, q9 + vand q11, q11, q9 + veor q3, q3, q10 + vshl.u64 q10, q10, #2 + veor q2, q2, q11 + vshl.u64 q11, q11, #2 + veor q1, q1, q10 + veor q0, q0, q11 + vshr.u64 q10, q3, #4 + vshr.u64 q11, q2, #4 + veor q10, q10, q7 + veor q11, q11, q6 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #4 + veor q6, q6, q11 + vshl.u64 q11, q11, #4 + veor q3, q3, q10 + veor q2, q2, q11 + vshr.u64 q10, q1, #4 + vshr.u64 q11, q0, #4 + veor q10, q10, q5 + veor q11, q11, q4 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #4 + veor q4, q4, q11 + vshl.u64 q11, q11, #4 + veor q1, q1, q10 + veor q0, q0, q11 + sub r5,r5,#1 + b .Lenc_sbox +.align 4 +.Lenc_loop: + vldmia r4!, {q8-q11} + veor q8, q8, q0 + veor q9, q9, q1 + vtbl.8 d0, {q8}, d24 + vtbl.8 d1, {q8}, d25 + vldmia r4!, {q8} + veor q10, q10, q2 + vtbl.8 d2, {q9}, d24 + vtbl.8 d3, {q9}, d25 + vldmia r4!, {q9} + veor q11, q11, q3 + vtbl.8 d4, {q10}, d24 + vtbl.8 d5, {q10}, d25 + vldmia r4!, {q10} + vtbl.8 d6, {q11}, d24 + vtbl.8 d7, {q11}, d25 + vldmia r4!, {q11} + veor q8, q8, q4 + veor q9, q9, q5 + vtbl.8 d8, {q8}, d24 + vtbl.8 d9, {q8}, d25 + veor q10, q10, q6 + vtbl.8 d10, {q9}, d24 + vtbl.8 d11, {q9}, d25 + veor q11, q11, q7 + vtbl.8 d12, {q10}, d24 + vtbl.8 d13, {q10}, d25 + vtbl.8 d14, {q11}, d24 + vtbl.8 d15, {q11}, d25 +.Lenc_sbox: + veor q2, q2, q1 + veor q5, q5, q6 + veor q3, q3, q0 + veor q6, q6, q2 + veor q5, q5, q0 + + veor q6, q6, q3 + veor q3, q3, q7 + veor q7, q7, q5 + veor q3, q3, q4 + veor q4, q4, q5 + + veor q2, q2, q7 + veor q3, q3, q1 + veor q1, q1, q5 + veor q11, q7, q4 + veor q10, q1, q2 + veor q9, q5, q3 + veor q13, q2, q4 + vmov q8, q10 + veor q12, q6, q0 + + vorr q10, q10, q9 + veor q15, q11, q8 + vand q14, q11, q12 + vorr q11, q11, q12 + veor q12, q12, q9 + vand q8, q8, q9 + veor q9, q3, q0 + vand q15, q15, q12 + vand q13, q13, q9 + veor q9, q7, q1 + veor q12, q5, q6 + veor q11, q11, q13 + veor q10, q10, q13 + vand q13, q9, q12 + vorr q9, q9, q12 + veor q11, q11, q15 + veor q8, q8, q13 + veor q10, q10, q14 + veor q9, q9, q15 + veor q8, q8, q14 + vand q12, q2, q3 + veor q9, q9, q14 + vand q13, q4, q0 + vand q14, q1, q5 + vorr q15, q7, q6 + veor q11, q11, q12 + veor q9, q9, q14 + veor q8, q8, q15 + veor q10, q10, q13 + + @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3 + + @ new smaller inversion + + vand q14, q11, q9 + vmov q12, q8 + + veor q13, q10, q14 + veor q15, q8, q14 + veor q14, q8, q14 @ q14=q15 + + vbsl q13, q9, q8 + vbsl q15, q11, q10 + veor q11, q11, q10 + + vbsl q12, q13, q14 + vbsl q8, q14, q13 + + vand q14, q12, q15 + veor q9, q9, q8 + + veor q14, q14, q11 + veor q12, q6, q0 + veor q8, q5, q3 + veor q10, q15, q14 + vand q10, q10, q6 + veor q6, q6, q5 + vand q11, q5, q15 + vand q6, q6, q14 + veor q5, q11, q10 + veor q6, q6, q11 + veor q15, q15, q13 + veor q14, q14, q9 + veor q11, q15, q14 + veor q10, q13, q9 + vand q11, q11, q12 + vand q10, q10, q0 + veor q12, q12, q8 + veor q0, q0, q3 + vand q8, q8, q15 + vand q3, q3, q13 + vand q12, q12, q14 + vand q0, q0, q9 + veor q8, q8, q12 + veor q0, q0, q3 + veor q12, q12, q11 + veor q3, q3, q10 + veor q6, q6, q12 + veor q0, q0, q12 + veor q5, q5, q8 + veor q3, q3, q8 + + veor q12, q7, q4 + veor q8, q1, q2 + veor q11, q15, q14 + veor q10, q13, q9 + vand q11, q11, q12 + vand q10, q10, q4 + veor q12, q12, q8 + veor q4, q4, q2 + vand q8, q8, q15 + vand q2, q2, q13 + vand q12, q12, q14 + vand q4, q4, q9 + veor q8, q8, q12 + veor q4, q4, q2 + veor q12, q12, q11 + veor q2, q2, q10 + veor q15, q15, q13 + veor q14, q14, q9 + veor q10, q15, q14 + vand q10, q10, q7 + veor q7, q7, q1 + vand q11, q1, q15 + vand q7, q7, q14 + veor q1, q11, q10 + veor q7, q7, q11 + veor q7, q7, q12 + veor q4, q4, q12 + veor q1, q1, q8 + veor q2, q2, q8 + veor q7, q7, q0 + veor q1, q1, q6 + veor q6, q6, q0 + veor q4, q4, q7 + veor q0, q0, q1 + + veor q1, q1, q5 + veor q5, q5, q2 + veor q2, q2, q3 + veor q3, q3, q5 + veor q4, q4, q5 + + veor q6, q6, q3 + subs r5,r5,#1 + bcc .Lenc_done + vext.8 q8, q0, q0, #12 @ x0 <<< 32 + vext.8 q9, q1, q1, #12 + veor q0, q0, q8 @ x0 ^ (x0 <<< 32) + vext.8 q10, q4, q4, #12 + veor q1, q1, q9 + vext.8 q11, q6, q6, #12 + veor q4, q4, q10 + vext.8 q12, q3, q3, #12 + veor q6, q6, q11 + vext.8 q13, q7, q7, #12 + veor q3, q3, q12 + vext.8 q14, q2, q2, #12 + veor q7, q7, q13 + vext.8 q15, q5, q5, #12 + veor q2, q2, q14 + + veor q9, q9, q0 + veor q5, q5, q15 + vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64) + veor q10, q10, q1 + veor q8, q8, q5 + veor q9, q9, q5 + vext.8 q1, q1, q1, #8 + veor q13, q13, q3 + veor q0, q0, q8 + veor q14, q14, q7 + veor q1, q1, q9 + vext.8 q8, q3, q3, #8 + veor q12, q12, q6 + vext.8 q9, q7, q7, #8 + veor q15, q15, q2 + vext.8 q3, q6, q6, #8 + veor q11, q11, q4 + vext.8 q7, q5, q5, #8 + veor q12, q12, q5 + vext.8 q6, q2, q2, #8 + veor q11, q11, q5 + vext.8 q2, q4, q4, #8 + veor q5, q9, q13 + veor q4, q8, q12 + veor q3, q3, q11 + veor q7, q7, q15 + veor q6, q6, q14 + @ vmov q4, q8 + veor q2, q2, q10 + @ vmov q5, q9 + vldmia r6, {q12} @ .LSR + ite eq @ Thumb2 thing, samity check in ARM + addeq r6,r6,#0x10 + bne .Lenc_loop + vldmia r6, {q12} @ .LSRM0 + b .Lenc_loop +.align 4 +.Lenc_done: + vmov.i8 q8,#0x55 @ compose .LBS0 + vmov.i8 q9,#0x33 @ compose .LBS1 + vshr.u64 q10, q2, #1 + vshr.u64 q11, q3, #1 + veor q10, q10, q5 + veor q11, q11, q7 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #1 + veor q7, q7, q11 + vshl.u64 q11, q11, #1 + veor q2, q2, q10 + veor q3, q3, q11 + vshr.u64 q10, q4, #1 + vshr.u64 q11, q0, #1 + veor q10, q10, q6 + veor q11, q11, q1 + vand q10, q10, q8 + vand q11, q11, q8 + veor q6, q6, q10 + vshl.u64 q10, q10, #1 + veor q1, q1, q11 + vshl.u64 q11, q11, #1 + veor q4, q4, q10 + veor q0, q0, q11 + vmov.i8 q8,#0x0f @ compose .LBS2 + vshr.u64 q10, q7, #2 + vshr.u64 q11, q3, #2 + veor q10, q10, q5 + veor q11, q11, q2 + vand q10, q10, q9 + vand q11, q11, q9 + veor q5, q5, q10 + vshl.u64 q10, q10, #2 + veor q2, q2, q11 + vshl.u64 q11, q11, #2 + veor q7, q7, q10 + veor q3, q3, q11 + vshr.u64 q10, q1, #2 + vshr.u64 q11, q0, #2 + veor q10, q10, q6 + veor q11, q11, q4 + vand q10, q10, q9 + vand q11, q11, q9 + veor q6, q6, q10 + vshl.u64 q10, q10, #2 + veor q4, q4, q11 + vshl.u64 q11, q11, #2 + veor q1, q1, q10 + veor q0, q0, q11 + vshr.u64 q10, q6, #4 + vshr.u64 q11, q4, #4 + veor q10, q10, q5 + veor q11, q11, q2 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #4 + veor q2, q2, q11 + vshl.u64 q11, q11, #4 + veor q6, q6, q10 + veor q4, q4, q11 + vshr.u64 q10, q1, #4 + vshr.u64 q11, q0, #4 + veor q10, q10, q7 + veor q11, q11, q3 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #4 + veor q3, q3, q11 + vshl.u64 q11, q11, #4 + veor q1, q1, q10 + veor q0, q0, q11 + vldmia r4, {q8} @ last round key + veor q4, q4, q8 + veor q6, q6, q8 + veor q3, q3, q8 + veor q7, q7, q8 + veor q2, q2, q8 + veor q5, q5, q8 + veor q0, q0, q8 + veor q1, q1, q8 + bx lr +.size _bsaes_encrypt8,.-_bsaes_encrypt8 +.type _bsaes_key_convert,%function +.align 4 +_bsaes_key_convert: + adr r6,_bsaes_key_convert + vld1.8 {q7}, [r4]! @ load round 0 key + sub r6,r6,#_bsaes_key_convert-.LM0 + vld1.8 {q15}, [r4]! @ load round 1 key + + vmov.i8 q8, #0x01 @ bit masks + vmov.i8 q9, #0x02 + vmov.i8 q10, #0x04 + vmov.i8 q11, #0x08 + vmov.i8 q12, #0x10 + vmov.i8 q13, #0x20 + vldmia r6, {q14} @ .LM0 + +#ifdef __ARMEL__ + vrev32.8 q7, q7 + vrev32.8 q15, q15 +#endif + sub r5,r5,#1 + vstmia r12!, {q7} @ save round 0 key + b .Lkey_loop + +.align 4 +.Lkey_loop: + vtbl.8 d14,{q15},d28 + vtbl.8 d15,{q15},d29 + vmov.i8 q6, #0x40 + vmov.i8 q15, #0x80 + + vtst.8 q0, q7, q8 + vtst.8 q1, q7, q9 + vtst.8 q2, q7, q10 + vtst.8 q3, q7, q11 + vtst.8 q4, q7, q12 + vtst.8 q5, q7, q13 + vtst.8 q6, q7, q6 + vtst.8 q7, q7, q15 + vld1.8 {q15}, [r4]! @ load next round key + vmvn q0, q0 @ "pnot" + vmvn q1, q1 + vmvn q5, q5 + vmvn q6, q6 +#ifdef __ARMEL__ + vrev32.8 q15, q15 +#endif + subs r5,r5,#1 + vstmia r12!,{q0-q7} @ write bit-sliced round key + bne .Lkey_loop + + vmov.i8 q7,#0x63 @ compose .L63 + @ don't save last round key + bx lr +.size _bsaes_key_convert,.-_bsaes_key_convert +.extern AES_cbc_encrypt +.extern AES_decrypt + +.global bsaes_cbc_encrypt +.type bsaes_cbc_encrypt,%function +.align 5 +bsaes_cbc_encrypt: +#ifndef __KERNEL__ + cmp r2, #128 +#ifndef __thumb__ + blo AES_cbc_encrypt +#else + bhs 1f + b AES_cbc_encrypt +1: +#endif +#endif + + @ it is up to the caller to make sure we are called with enc == 0 + + mov ip, sp + stmdb sp!, {r4-r10, lr} + VFP_ABI_PUSH + ldr r8, [ip] @ IV is 1st arg on the stack + mov r2, r2, lsr#4 @ len in 16 byte blocks + sub sp, #0x10 @ scratch space to carry over the IV + mov r9, sp @ save sp + + ldr r10, [r3, #240] @ get # of rounds +#ifndef BSAES_ASM_EXTENDED_KEY + @ allocate the key schedule on the stack + sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key + add r12, #96 @ sifze of bit-slices key schedule + + @ populate the key schedule + mov r4, r3 @ pass key + mov r5, r10 @ pass # of rounds + mov sp, r12 @ sp is sp + bl _bsaes_key_convert + vldmia sp, {q6} + vstmia r12, {q15} @ save last round key + veor q7, q7, q6 @ fix up round 0 key + vstmia sp, {q7} +#else + ldr r12, [r3, #244] + eors r12, #1 + beq 0f + + @ populate the key schedule + str r12, [r3, #244] + mov r4, r3 @ pass key + mov r5, r10 @ pass # of rounds + add r12, r3, #248 @ pass key schedule + bl _bsaes_key_convert + add r4, r3, #248 + vldmia r4, {q6} + vstmia r12, {q15} @ save last round key + veor q7, q7, q6 @ fix up round 0 key + vstmia r4, {q7} + +.align 2 +0: +#endif + + vld1.8 {q15}, [r8] @ load IV + b .Lcbc_dec_loop + +.align 4 +.Lcbc_dec_loop: + subs r2, r2, #0x8 + bmi .Lcbc_dec_loop_finish + + vld1.8 {q0-q1}, [r0]! @ load input + vld1.8 {q2-q3}, [r0]! +#ifndef BSAES_ASM_EXTENDED_KEY + mov r4, sp @ pass the key +#else + add r4, r3, #248 +#endif + vld1.8 {q4-q5}, [r0]! + mov r5, r10 + vld1.8 {q6-q7}, [r0] + sub r0, r0, #0x60 + vstmia r9, {q15} @ put aside IV + + bl _bsaes_decrypt8 + + vldmia r9, {q14} @ reload IV + vld1.8 {q8-q9}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q10-q11}, [r0]! + veor q1, q1, q8 + veor q6, q6, q9 + vld1.8 {q12-q13}, [r0]! + veor q4, q4, q10 + veor q2, q2, q11 + vld1.8 {q14-q15}, [r0]! + veor q7, q7, q12 + vst1.8 {q0-q1}, [r1]! @ write output + veor q3, q3, q13 + vst1.8 {q6}, [r1]! + veor q5, q5, q14 + vst1.8 {q4}, [r1]! + vst1.8 {q2}, [r1]! + vst1.8 {q7}, [r1]! + vst1.8 {q3}, [r1]! + vst1.8 {q5}, [r1]! + + b .Lcbc_dec_loop + +.Lcbc_dec_loop_finish: + adds r2, r2, #8 + beq .Lcbc_dec_done + + vld1.8 {q0}, [r0]! @ load input + cmp r2, #2 + blo .Lcbc_dec_one + vld1.8 {q1}, [r0]! +#ifndef BSAES_ASM_EXTENDED_KEY + mov r4, sp @ pass the key +#else + add r4, r3, #248 +#endif + mov r5, r10 + vstmia r9, {q15} @ put aside IV + beq .Lcbc_dec_two + vld1.8 {q2}, [r0]! + cmp r2, #4 + blo .Lcbc_dec_three + vld1.8 {q3}, [r0]! + beq .Lcbc_dec_four + vld1.8 {q4}, [r0]! + cmp r2, #6 + blo .Lcbc_dec_five + vld1.8 {q5}, [r0]! + beq .Lcbc_dec_six + vld1.8 {q6}, [r0]! + sub r0, r0, #0x70 + + bl _bsaes_decrypt8 + + vldmia r9, {q14} @ reload IV + vld1.8 {q8-q9}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q10-q11}, [r0]! + veor q1, q1, q8 + veor q6, q6, q9 + vld1.8 {q12-q13}, [r0]! + veor q4, q4, q10 + veor q2, q2, q11 + vld1.8 {q15}, [r0]! + veor q7, q7, q12 + vst1.8 {q0-q1}, [r1]! @ write output + veor q3, q3, q13 + vst1.8 {q6}, [r1]! + vst1.8 {q4}, [r1]! + vst1.8 {q2}, [r1]! + vst1.8 {q7}, [r1]! + vst1.8 {q3}, [r1]! + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_six: + sub r0, r0, #0x60 + bl _bsaes_decrypt8 + vldmia r9,{q14} @ reload IV + vld1.8 {q8-q9}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q10-q11}, [r0]! + veor q1, q1, q8 + veor q6, q6, q9 + vld1.8 {q12}, [r0]! + veor q4, q4, q10 + veor q2, q2, q11 + vld1.8 {q15}, [r0]! + veor q7, q7, q12 + vst1.8 {q0-q1}, [r1]! @ write output + vst1.8 {q6}, [r1]! + vst1.8 {q4}, [r1]! + vst1.8 {q2}, [r1]! + vst1.8 {q7}, [r1]! + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_five: + sub r0, r0, #0x50 + bl _bsaes_decrypt8 + vldmia r9, {q14} @ reload IV + vld1.8 {q8-q9}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q10-q11}, [r0]! + veor q1, q1, q8 + veor q6, q6, q9 + vld1.8 {q15}, [r0]! + veor q4, q4, q10 + vst1.8 {q0-q1}, [r1]! @ write output + veor q2, q2, q11 + vst1.8 {q6}, [r1]! + vst1.8 {q4}, [r1]! + vst1.8 {q2}, [r1]! + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_four: + sub r0, r0, #0x40 + bl _bsaes_decrypt8 + vldmia r9, {q14} @ reload IV + vld1.8 {q8-q9}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q10}, [r0]! + veor q1, q1, q8 + veor q6, q6, q9 + vld1.8 {q15}, [r0]! + veor q4, q4, q10 + vst1.8 {q0-q1}, [r1]! @ write output + vst1.8 {q6}, [r1]! + vst1.8 {q4}, [r1]! + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_three: + sub r0, r0, #0x30 + bl _bsaes_decrypt8 + vldmia r9, {q14} @ reload IV + vld1.8 {q8-q9}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q15}, [r0]! + veor q1, q1, q8 + veor q6, q6, q9 + vst1.8 {q0-q1}, [r1]! @ write output + vst1.8 {q6}, [r1]! + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_two: + sub r0, r0, #0x20 + bl _bsaes_decrypt8 + vldmia r9, {q14} @ reload IV + vld1.8 {q8}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q15}, [r0]! @ reload input + veor q1, q1, q8 + vst1.8 {q0-q1}, [r1]! @ write output + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_one: + sub r0, r0, #0x10 + mov r10, r1 @ save original out pointer + mov r1, r9 @ use the iv scratch space as out buffer + mov r2, r3 + vmov q4,q15 @ just in case ensure that IV + vmov q5,q0 @ and input are preserved + bl AES_decrypt + vld1.8 {q0}, [r9,:64] @ load result + veor q0, q0, q4 @ ^= IV + vmov q15, q5 @ q5 holds input + vst1.8 {q0}, [r10] @ write output + +.Lcbc_dec_done: +#ifndef BSAES_ASM_EXTENDED_KEY + vmov.i32 q0, #0 + vmov.i32 q1, #0 +.Lcbc_dec_bzero: @ wipe key schedule [if any] + vstmia sp!, {q0-q1} + cmp sp, r9 + bne .Lcbc_dec_bzero +#endif + + mov sp, r9 + add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb + vst1.8 {q15}, [r8] @ return IV + VFP_ABI_POP + ldmia sp!, {r4-r10, pc} +.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt +.extern AES_encrypt +.global bsaes_ctr32_encrypt_blocks +.type bsaes_ctr32_encrypt_blocks,%function +.align 5 +bsaes_ctr32_encrypt_blocks: + cmp r2, #8 @ use plain AES for + blo .Lctr_enc_short @ small sizes + + mov ip, sp + stmdb sp!, {r4-r10, lr} + VFP_ABI_PUSH + ldr r8, [ip] @ ctr is 1st arg on the stack + sub sp, sp, #0x10 @ scratch space to carry over the ctr + mov r9, sp @ save sp + + ldr r10, [r3, #240] @ get # of rounds +#ifndef BSAES_ASM_EXTENDED_KEY + @ allocate the key schedule on the stack + sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key + add r12, #96 @ size of bit-sliced key schedule + + @ populate the key schedule + mov r4, r3 @ pass key + mov r5, r10 @ pass # of rounds + mov sp, r12 @ sp is sp + bl _bsaes_key_convert + veor q7,q7,q15 @ fix up last round key + vstmia r12, {q7} @ save last round key + + vld1.8 {q0}, [r8] @ load counter + add r8, r6, #.LREVM0SR-.LM0 @ borrow r8 + vldmia sp, {q4} @ load round0 key +#else + ldr r12, [r3, #244] + eors r12, #1 + beq 0f + + @ populate the key schedule + str r12, [r3, #244] + mov r4, r3 @ pass key + mov r5, r10 @ pass # of rounds + add r12, r3, #248 @ pass key schedule + bl _bsaes_key_convert + veor q7,q7,q15 @ fix up last round key + vstmia r12, {q7} @ save last round key + +.align 2 +0: add r12, r3, #248 + vld1.8 {q0}, [r8] @ load counter + adrl r8, .LREVM0SR @ borrow r8 + vldmia r12, {q4} @ load round0 key + sub sp, #0x10 @ place for adjusted round0 key +#endif + + vmov.i32 q8,#1 @ compose 1<<96 + veor q9,q9,q9 + vrev32.8 q0,q0 + vext.8 q8,q9,q8,#4 + vrev32.8 q4,q4 + vadd.u32 q9,q8,q8 @ compose 2<<96 + vstmia sp, {q4} @ save adjusted round0 key + b .Lctr_enc_loop + +.align 4 +.Lctr_enc_loop: + vadd.u32 q10, q8, q9 @ compose 3<<96 + vadd.u32 q1, q0, q8 @ +1 + vadd.u32 q2, q0, q9 @ +2 + vadd.u32 q3, q0, q10 @ +3 + vadd.u32 q4, q1, q10 + vadd.u32 q5, q2, q10 + vadd.u32 q6, q3, q10 + vadd.u32 q7, q4, q10 + vadd.u32 q10, q5, q10 @ next counter + + @ Borrow prologue from _bsaes_encrypt8 to use the opportunity + @ to flip byte order in 32-bit counter + + vldmia sp, {q9} @ load round0 key +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x10 @ pass next round key +#else + add r4, r3, #264 +#endif + vldmia r8, {q8} @ .LREVM0SR + mov r5, r10 @ pass rounds + vstmia r9, {q10} @ save next counter + sub r6, r8, #.LREVM0SR-.LSR @ pass constants + + bl _bsaes_encrypt8_alt + + subs r2, r2, #8 + blo .Lctr_enc_loop_done + + vld1.8 {q8-q9}, [r0]! @ load input + vld1.8 {q10-q11}, [r0]! + veor q0, q8 + veor q1, q9 + vld1.8 {q12-q13}, [r0]! + veor q4, q10 + veor q6, q11 + vld1.8 {q14-q15}, [r0]! + veor q3, q12 + vst1.8 {q0-q1}, [r1]! @ write output + veor q7, q13 + veor q2, q14 + vst1.8 {q4}, [r1]! + veor q5, q15 + vst1.8 {q6}, [r1]! + vmov.i32 q8, #1 @ compose 1<<96 + vst1.8 {q3}, [r1]! + veor q9, q9, q9 + vst1.8 {q7}, [r1]! + vext.8 q8, q9, q8, #4 + vst1.8 {q2}, [r1]! + vadd.u32 q9,q8,q8 @ compose 2<<96 + vst1.8 {q5}, [r1]! + vldmia r9, {q0} @ load counter + + bne .Lctr_enc_loop + b .Lctr_enc_done + +.align 4 +.Lctr_enc_loop_done: + add r2, r2, #8 + vld1.8 {q8}, [r0]! @ load input + veor q0, q8 + vst1.8 {q0}, [r1]! @ write output + cmp r2, #2 + blo .Lctr_enc_done + vld1.8 {q9}, [r0]! + veor q1, q9 + vst1.8 {q1}, [r1]! + beq .Lctr_enc_done + vld1.8 {q10}, [r0]! + veor q4, q10 + vst1.8 {q4}, [r1]! + cmp r2, #4 + blo .Lctr_enc_done + vld1.8 {q11}, [r0]! + veor q6, q11 + vst1.8 {q6}, [r1]! + beq .Lctr_enc_done + vld1.8 {q12}, [r0]! + veor q3, q12 + vst1.8 {q3}, [r1]! + cmp r2, #6 + blo .Lctr_enc_done + vld1.8 {q13}, [r0]! + veor q7, q13 + vst1.8 {q7}, [r1]! + beq .Lctr_enc_done + vld1.8 {q14}, [r0] + veor q2, q14 + vst1.8 {q2}, [r1]! + +.Lctr_enc_done: + vmov.i32 q0, #0 + vmov.i32 q1, #0 +#ifndef BSAES_ASM_EXTENDED_KEY +.Lctr_enc_bzero: @ wipe key schedule [if any] + vstmia sp!, {q0-q1} + cmp sp, r9 + bne .Lctr_enc_bzero +#else + vstmia sp, {q0-q1} +#endif + + mov sp, r9 + add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb + VFP_ABI_POP + ldmia sp!, {r4-r10, pc} @ return + +.align 4 +.Lctr_enc_short: + ldr ip, [sp] @ ctr pointer is passed on stack + stmdb sp!, {r4-r8, lr} + + mov r4, r0 @ copy arguments + mov r5, r1 + mov r6, r2 + mov r7, r3 + ldr r8, [ip, #12] @ load counter LSW + vld1.8 {q1}, [ip] @ load whole counter value +#ifdef __ARMEL__ + rev r8, r8 +#endif + sub sp, sp, #0x10 + vst1.8 {q1}, [sp,:64] @ copy counter value + sub sp, sp, #0x10 + +.Lctr_enc_short_loop: + add r0, sp, #0x10 @ input counter value + mov r1, sp @ output on the stack + mov r2, r7 @ key + + bl AES_encrypt + + vld1.8 {q0}, [r4]! @ load input + vld1.8 {q1}, [sp,:64] @ load encrypted counter + add r8, r8, #1 +#ifdef __ARMEL__ + rev r0, r8 + str r0, [sp, #0x1c] @ next counter value +#else + str r8, [sp, #0x1c] @ next counter value +#endif + veor q0,q0,q1 + vst1.8 {q0}, [r5]! @ store output + subs r6, r6, #1 + bne .Lctr_enc_short_loop + + vmov.i32 q0, #0 + vmov.i32 q1, #0 + vstmia sp!, {q0-q1} + + ldmia sp!, {r4-r8, pc} +.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks +.globl bsaes_xts_encrypt +.type bsaes_xts_encrypt,%function +.align 4 +bsaes_xts_encrypt: + mov ip, sp + stmdb sp!, {r4-r10, lr} @ 0x20 + VFP_ABI_PUSH + mov r6, sp @ future r3 + + mov r7, r0 + mov r8, r1 + mov r9, r2 + mov r10, r3 + + sub r0, sp, #0x10 @ 0x10 + bic r0, #0xf @ align at 16 bytes + mov sp, r0 + +#ifdef XTS_CHAIN_TWEAK + ldr r0, [ip] @ pointer to input tweak +#else + @ generate initial tweak + ldr r0, [ip, #4] @ iv[] + mov r1, sp + ldr r2, [ip, #0] @ key2 + bl AES_encrypt + mov r0,sp @ pointer to initial tweak +#endif + + ldr r1, [r10, #240] @ get # of rounds + mov r3, r6 +#ifndef BSAES_ASM_EXTENDED_KEY + @ allocate the key schedule on the stack + sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key + @ add r12, #96 @ size of bit-sliced key schedule + sub r12, #48 @ place for tweak[9] + + @ populate the key schedule + mov r4, r10 @ pass key + mov r5, r1 @ pass # of rounds + mov sp, r12 + add r12, #0x90 @ pass key schedule + bl _bsaes_key_convert + veor q7, q7, q15 @ fix up last round key + vstmia r12, {q7} @ save last round key +#else + ldr r12, [r10, #244] + eors r12, #1 + beq 0f + + str r12, [r10, #244] + mov r4, r10 @ pass key + mov r5, r1 @ pass # of rounds + add r12, r10, #248 @ pass key schedule + bl _bsaes_key_convert + veor q7, q7, q15 @ fix up last round key + vstmia r12, {q7} + +.align 2 +0: sub sp, #0x90 @ place for tweak[9] +#endif + + vld1.8 {q8}, [r0] @ initial tweak + adr r2, .Lxts_magic + + subs r9, #0x80 + blo .Lxts_enc_short + b .Lxts_enc_loop + +.align 4 +.Lxts_enc_loop: + vldmia r2, {q5} @ load XTS magic + vshr.s64 q6, q8, #63 + mov r0, sp + vand q6, q6, q5 + vadd.u64 q9, q8, q8 + vst1.64 {q8}, [r0,:128]! + vswp d13,d12 + vshr.s64 q7, q9, #63 + veor q9, q9, q6 + vand q7, q7, q5 + vadd.u64 q10, q9, q9 + vst1.64 {q9}, [r0,:128]! + vswp d15,d14 + vshr.s64 q6, q10, #63 + veor q10, q10, q7 + vand q6, q6, q5 + vld1.8 {q0}, [r7]! + vadd.u64 q11, q10, q10 + vst1.64 {q10}, [r0,:128]! + vswp d13,d12 + vshr.s64 q7, q11, #63 + veor q11, q11, q6 + vand q7, q7, q5 + vld1.8 {q1}, [r7]! + veor q0, q0, q8 + vadd.u64 q12, q11, q11 + vst1.64 {q11}, [r0,:128]! + vswp d15,d14 + vshr.s64 q6, q12, #63 + veor q12, q12, q7 + vand q6, q6, q5 + vld1.8 {q2}, [r7]! + veor q1, q1, q9 + vadd.u64 q13, q12, q12 + vst1.64 {q12}, [r0,:128]! + vswp d13,d12 + vshr.s64 q7, q13, #63 + veor q13, q13, q6 + vand q7, q7, q5 + vld1.8 {q3}, [r7]! + veor q2, q2, q10 + vadd.u64 q14, q13, q13 + vst1.64 {q13}, [r0,:128]! + vswp d15,d14 + vshr.s64 q6, q14, #63 + veor q14, q14, q7 + vand q6, q6, q5 + vld1.8 {q4}, [r7]! + veor q3, q3, q11 + vadd.u64 q15, q14, q14 + vst1.64 {q14}, [r0,:128]! + vswp d13,d12 + vshr.s64 q7, q15, #63 + veor q15, q15, q6 + vand q7, q7, q5 + vld1.8 {q5}, [r7]! + veor q4, q4, q12 + vadd.u64 q8, q15, q15 + vst1.64 {q15}, [r0,:128]! + vswp d15,d14 + veor q8, q8, q7 + vst1.64 {q8}, [r0,:128] @ next round tweak + + vld1.8 {q6-q7}, [r7]! + veor q5, q5, q13 +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, r10, #248 @ pass key schedule +#endif + veor q6, q6, q14 + mov r5, r1 @ pass rounds + veor q7, q7, q15 + mov r0, sp + + bl _bsaes_encrypt8 + + vld1.64 {q8-q9}, [r0,:128]! + vld1.64 {q10-q11}, [r0,:128]! + veor q0, q0, q8 + vld1.64 {q12-q13}, [r0,:128]! + veor q1, q1, q9 + veor q8, q4, q10 + vst1.8 {q0-q1}, [r8]! + veor q9, q6, q11 + vld1.64 {q14-q15}, [r0,:128]! + veor q10, q3, q12 + vst1.8 {q8-q9}, [r8]! + veor q11, q7, q13 + veor q12, q2, q14 + vst1.8 {q10-q11}, [r8]! + veor q13, q5, q15 + vst1.8 {q12-q13}, [r8]! + + vld1.64 {q8}, [r0,:128] @ next round tweak + + subs r9, #0x80 + bpl .Lxts_enc_loop + +.Lxts_enc_short: + adds r9, #0x70 + bmi .Lxts_enc_done + + vldmia r2, {q5} @ load XTS magic + vshr.s64 q7, q8, #63 + mov r0, sp + vand q7, q7, q5 + vadd.u64 q9, q8, q8 + vst1.64 {q8}, [r0,:128]! + vswp d15,d14 + vshr.s64 q6, q9, #63 + veor q9, q9, q7 + vand q6, q6, q5 + vadd.u64 q10, q9, q9 + vst1.64 {q9}, [r0,:128]! + vswp d13,d12 + vshr.s64 q7, q10, #63 + veor q10, q10, q6 + vand q7, q7, q5 + vld1.8 {q0}, [r7]! + subs r9, #0x10 + bmi .Lxts_enc_1 + vadd.u64 q11, q10, q10 + vst1.64 {q10}, [r0,:128]! + vswp d15,d14 + vshr.s64 q6, q11, #63 + veor q11, q11, q7 + vand q6, q6, q5 + vld1.8 {q1}, [r7]! + subs r9, #0x10 + bmi .Lxts_enc_2 + veor q0, q0, q8 + vadd.u64 q12, q11, q11 + vst1.64 {q11}, [r0,:128]! + vswp d13,d12 + vshr.s64 q7, q12, #63 + veor q12, q12, q6 + vand q7, q7, q5 + vld1.8 {q2}, [r7]! + subs r9, #0x10 + bmi .Lxts_enc_3 + veor q1, q1, q9 + vadd.u64 q13, q12, q12 + vst1.64 {q12}, [r0,:128]! + vswp d15,d14 + vshr.s64 q6, q13, #63 + veor q13, q13, q7 + vand q6, q6, q5 + vld1.8 {q3}, [r7]! + subs r9, #0x10 + bmi .Lxts_enc_4 + veor q2, q2, q10 + vadd.u64 q14, q13, q13 + vst1.64 {q13}, [r0,:128]! + vswp d13,d12 + vshr.s64 q7, q14, #63 + veor q14, q14, q6 + vand q7, q7, q5 + vld1.8 {q4}, [r7]! + subs r9, #0x10 + bmi .Lxts_enc_5 + veor q3, q3, q11 + vadd.u64 q15, q14, q14 + vst1.64 {q14}, [r0,:128]! + vswp d15,d14 + vshr.s64 q6, q15, #63 + veor q15, q15, q7 + vand q6, q6, q5 + vld1.8 {q5}, [r7]! + subs r9, #0x10 + bmi .Lxts_enc_6 + veor q4, q4, q12 + sub r9, #0x10 + vst1.64 {q15}, [r0,:128] @ next round tweak + + vld1.8 {q6}, [r7]! + veor q5, q5, q13 +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, r10, #248 @ pass key schedule +#endif + veor q6, q6, q14 + mov r5, r1 @ pass rounds + mov r0, sp + + bl _bsaes_encrypt8 + + vld1.64 {q8-q9}, [r0,:128]! + vld1.64 {q10-q11}, [r0,:128]! + veor q0, q0, q8 + vld1.64 {q12-q13}, [r0,:128]! + veor q1, q1, q9 + veor q8, q4, q10 + vst1.8 {q0-q1}, [r8]! + veor q9, q6, q11 + vld1.64 {q14}, [r0,:128]! + veor q10, q3, q12 + vst1.8 {q8-q9}, [r8]! + veor q11, q7, q13 + veor q12, q2, q14 + vst1.8 {q10-q11}, [r8]! + vst1.8 {q12}, [r8]! + + vld1.64 {q8}, [r0,:128] @ next round tweak + b .Lxts_enc_done +.align 4 +.Lxts_enc_6: + vst1.64 {q14}, [r0,:128] @ next round tweak + + veor q4, q4, q12 +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, r10, #248 @ pass key schedule +#endif + veor q5, q5, q13 + mov r5, r1 @ pass rounds + mov r0, sp + + bl _bsaes_encrypt8 + + vld1.64 {q8-q9}, [r0,:128]! + vld1.64 {q10-q11}, [r0,:128]! + veor q0, q0, q8 + vld1.64 {q12-q13}, [r0,:128]! + veor q1, q1, q9 + veor q8, q4, q10 + vst1.8 {q0-q1}, [r8]! + veor q9, q6, q11 + veor q10, q3, q12 + vst1.8 {q8-q9}, [r8]! + veor q11, q7, q13 + vst1.8 {q10-q11}, [r8]! + + vld1.64 {q8}, [r0,:128] @ next round tweak + b .Lxts_enc_done + +@ put this in range for both ARM and Thumb mode adr instructions +.align 5 +.Lxts_magic: + .quad 1, 0x87 + +.align 5 +.Lxts_enc_5: + vst1.64 {q13}, [r0,:128] @ next round tweak + + veor q3, q3, q11 +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, r10, #248 @ pass key schedule +#endif + veor q4, q4, q12 + mov r5, r1 @ pass rounds + mov r0, sp + + bl _bsaes_encrypt8 + + vld1.64 {q8-q9}, [r0,:128]! + vld1.64 {q10-q11}, [r0,:128]! + veor q0, q0, q8 + vld1.64 {q12}, [r0,:128]! + veor q1, q1, q9 + veor q8, q4, q10 + vst1.8 {q0-q1}, [r8]! + veor q9, q6, q11 + veor q10, q3, q12 + vst1.8 {q8-q9}, [r8]! + vst1.8 {q10}, [r8]! + + vld1.64 {q8}, [r0,:128] @ next round tweak + b .Lxts_enc_done +.align 4 +.Lxts_enc_4: + vst1.64 {q12}, [r0,:128] @ next round tweak + + veor q2, q2, q10 +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, r10, #248 @ pass key schedule +#endif + veor q3, q3, q11 + mov r5, r1 @ pass rounds + mov r0, sp + + bl _bsaes_encrypt8 + + vld1.64 {q8-q9}, [r0,:128]! + vld1.64 {q10-q11}, [r0,:128]! + veor q0, q0, q8 + veor q1, q1, q9 + veor q8, q4, q10 + vst1.8 {q0-q1}, [r8]! + veor q9, q6, q11 + vst1.8 {q8-q9}, [r8]! + + vld1.64 {q8}, [r0,:128] @ next round tweak + b .Lxts_enc_done +.align 4 +.Lxts_enc_3: + vst1.64 {q11}, [r0,:128] @ next round tweak + + veor q1, q1, q9 +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, r10, #248 @ pass key schedule +#endif + veor q2, q2, q10 + mov r5, r1 @ pass rounds + mov r0, sp + + bl _bsaes_encrypt8 + + vld1.64 {q8-q9}, [r0,:128]! + vld1.64 {q10}, [r0,:128]! + veor q0, q0, q8 + veor q1, q1, q9 + veor q8, q4, q10 + vst1.8 {q0-q1}, [r8]! + vst1.8 {q8}, [r8]! + + vld1.64 {q8}, [r0,:128] @ next round tweak + b .Lxts_enc_done +.align 4 +.Lxts_enc_2: + vst1.64 {q10}, [r0,:128] @ next round tweak + + veor q0, q0, q8 +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, r10, #248 @ pass key schedule +#endif + veor q1, q1, q9 + mov r5, r1 @ pass rounds + mov r0, sp + + bl _bsaes_encrypt8 + + vld1.64 {q8-q9}, [r0,:128]! + veor q0, q0, q8 + veor q1, q1, q9 + vst1.8 {q0-q1}, [r8]! + + vld1.64 {q8}, [r0,:128] @ next round tweak + b .Lxts_enc_done +.align 4 +.Lxts_enc_1: + mov r0, sp + veor q0, q8 + mov r1, sp + vst1.8 {q0}, [sp,:128] + mov r2, r10 + mov r4, r3 @ preserve fp + + bl AES_encrypt + + vld1.8 {q0}, [sp,:128] + veor q0, q0, q8 + vst1.8 {q0}, [r8]! + mov r3, r4 + + vmov q8, q9 @ next round tweak + +.Lxts_enc_done: +#ifndef XTS_CHAIN_TWEAK + adds r9, #0x10 + beq .Lxts_enc_ret + sub r6, r8, #0x10 + +.Lxts_enc_steal: + ldrb r0, [r7], #1 + ldrb r1, [r8, #-0x10] + strb r0, [r8, #-0x10] + strb r1, [r8], #1 + + subs r9, #1 + bhi .Lxts_enc_steal + + vld1.8 {q0}, [r6] + mov r0, sp + veor q0, q0, q8 + mov r1, sp + vst1.8 {q0}, [sp,:128] + mov r2, r10 + mov r4, r3 @ preserve fp + + bl AES_encrypt + + vld1.8 {q0}, [sp,:128] + veor q0, q0, q8 + vst1.8 {q0}, [r6] + mov r3, r4 +#endif + +.Lxts_enc_ret: + bic r0, r3, #0xf + vmov.i32 q0, #0 + vmov.i32 q1, #0 +#ifdef XTS_CHAIN_TWEAK + ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak +#endif +.Lxts_enc_bzero: @ wipe key schedule [if any] + vstmia sp!, {q0-q1} + cmp sp, r0 + bne .Lxts_enc_bzero + + mov sp, r3 +#ifdef XTS_CHAIN_TWEAK + vst1.8 {q8}, [r1] +#endif + VFP_ABI_POP + ldmia sp!, {r4-r10, pc} @ return + +.size bsaes_xts_encrypt,.-bsaes_xts_encrypt + +.globl bsaes_xts_decrypt +.type bsaes_xts_decrypt,%function +.align 4 +bsaes_xts_decrypt: + mov ip, sp + stmdb sp!, {r4-r10, lr} @ 0x20 + VFP_ABI_PUSH + mov r6, sp @ future r3 + + mov r7, r0 + mov r8, r1 + mov r9, r2 + mov r10, r3 + + sub r0, sp, #0x10 @ 0x10 + bic r0, #0xf @ align at 16 bytes + mov sp, r0 + +#ifdef XTS_CHAIN_TWEAK + ldr r0, [ip] @ pointer to input tweak +#else + @ generate initial tweak + ldr r0, [ip, #4] @ iv[] + mov r1, sp + ldr r2, [ip, #0] @ key2 + bl AES_encrypt + mov r0, sp @ pointer to initial tweak +#endif + + ldr r1, [r10, #240] @ get # of rounds + mov r3, r6 +#ifndef BSAES_ASM_EXTENDED_KEY + @ allocate the key schedule on the stack + sub r12, sp, r1, lsl#7 @ 128 bytes per inner round key + @ add r12, #96 @ size of bit-sliced key schedule + sub r12, #48 @ place for tweak[9] + + @ populate the key schedule + mov r4, r10 @ pass key + mov r5, r1 @ pass # of rounds + mov sp, r12 + add r12, #0x90 @ pass key schedule + bl _bsaes_key_convert + add r4, sp, #0x90 + vldmia r4, {q6} + vstmia r12, {q15} @ save last round key + veor q7, q7, q6 @ fix up round 0 key + vstmia r4, {q7} +#else + ldr r12, [r10, #244] + eors r12, #1 + beq 0f + + str r12, [r10, #244] + mov r4, r10 @ pass key + mov r5, r1 @ pass # of rounds + add r12, r10, #248 @ pass key schedule + bl _bsaes_key_convert + add r4, r10, #248 + vldmia r4, {q6} + vstmia r12, {q15} @ save last round key + veor q7, q7, q6 @ fix up round 0 key + vstmia r4, {q7} + +.align 2 +0: sub sp, #0x90 @ place for tweak[9] +#endif + vld1.8 {q8}, [r0] @ initial tweak + adr r2, .Lxts_magic + + tst r9, #0xf @ if not multiple of 16 + it ne @ Thumb2 thing, sanity check in ARM + subne r9, #0x10 @ subtract another 16 bytes + subs r9, #0x80 + + blo .Lxts_dec_short + b .Lxts_dec_loop + +.align 4 +.Lxts_dec_loop: + vldmia r2, {q5} @ load XTS magic + vshr.s64 q6, q8, #63 + mov r0, sp + vand q6, q6, q5 + vadd.u64 q9, q8, q8 + vst1.64 {q8}, [r0,:128]! + vswp d13,d12 + vshr.s64 q7, q9, #63 + veor q9, q9, q6 + vand q7, q7, q5 + vadd.u64 q10, q9, q9 + vst1.64 {q9}, [r0,:128]! + vswp d15,d14 + vshr.s64 q6, q10, #63 + veor q10, q10, q7 + vand q6, q6, q5 + vld1.8 {q0}, [r7]! + vadd.u64 q11, q10, q10 + vst1.64 {q10}, [r0,:128]! + vswp d13,d12 + vshr.s64 q7, q11, #63 + veor q11, q11, q6 + vand q7, q7, q5 + vld1.8 {q1}, [r7]! + veor q0, q0, q8 + vadd.u64 q12, q11, q11 + vst1.64 {q11}, [r0,:128]! + vswp d15,d14 + vshr.s64 q6, q12, #63 + veor q12, q12, q7 + vand q6, q6, q5 + vld1.8 {q2}, [r7]! + veor q1, q1, q9 + vadd.u64 q13, q12, q12 + vst1.64 {q12}, [r0,:128]! + vswp d13,d12 + vshr.s64 q7, q13, #63 + veor q13, q13, q6 + vand q7, q7, q5 + vld1.8 {q3}, [r7]! + veor q2, q2, q10 + vadd.u64 q14, q13, q13 + vst1.64 {q13}, [r0,:128]! + vswp d15,d14 + vshr.s64 q6, q14, #63 + veor q14, q14, q7 + vand q6, q6, q5 + vld1.8 {q4}, [r7]! + veor q3, q3, q11 + vadd.u64 q15, q14, q14 + vst1.64 {q14}, [r0,:128]! + vswp d13,d12 + vshr.s64 q7, q15, #63 + veor q15, q15, q6 + vand q7, q7, q5 + vld1.8 {q5}, [r7]! + veor q4, q4, q12 + vadd.u64 q8, q15, q15 + vst1.64 {q15}, [r0,:128]! + vswp d15,d14 + veor q8, q8, q7 + vst1.64 {q8}, [r0,:128] @ next round tweak + + vld1.8 {q6-q7}, [r7]! + veor q5, q5, q13 +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, r10, #248 @ pass key schedule +#endif + veor q6, q6, q14 + mov r5, r1 @ pass rounds + veor q7, q7, q15 + mov r0, sp + + bl _bsaes_decrypt8 + + vld1.64 {q8-q9}, [r0,:128]! + vld1.64 {q10-q11}, [r0,:128]! + veor q0, q0, q8 + vld1.64 {q12-q13}, [r0,:128]! + veor q1, q1, q9 + veor q8, q6, q10 + vst1.8 {q0-q1}, [r8]! + veor q9, q4, q11 + vld1.64 {q14-q15}, [r0,:128]! + veor q10, q2, q12 + vst1.8 {q8-q9}, [r8]! + veor q11, q7, q13 + veor q12, q3, q14 + vst1.8 {q10-q11}, [r8]! + veor q13, q5, q15 + vst1.8 {q12-q13}, [r8]! + + vld1.64 {q8}, [r0,:128] @ next round tweak + + subs r9, #0x80 + bpl .Lxts_dec_loop + +.Lxts_dec_short: + adds r9, #0x70 + bmi .Lxts_dec_done + + vldmia r2, {q5} @ load XTS magic + vshr.s64 q7, q8, #63 + mov r0, sp + vand q7, q7, q5 + vadd.u64 q9, q8, q8 + vst1.64 {q8}, [r0,:128]! + vswp d15,d14 + vshr.s64 q6, q9, #63 + veor q9, q9, q7 + vand q6, q6, q5 + vadd.u64 q10, q9, q9 + vst1.64 {q9}, [r0,:128]! + vswp d13,d12 + vshr.s64 q7, q10, #63 + veor q10, q10, q6 + vand q7, q7, q5 + vld1.8 {q0}, [r7]! + subs r9, #0x10 + bmi .Lxts_dec_1 + vadd.u64 q11, q10, q10 + vst1.64 {q10}, [r0,:128]! + vswp d15,d14 + vshr.s64 q6, q11, #63 + veor q11, q11, q7 + vand q6, q6, q5 + vld1.8 {q1}, [r7]! + subs r9, #0x10 + bmi .Lxts_dec_2 + veor q0, q0, q8 + vadd.u64 q12, q11, q11 + vst1.64 {q11}, [r0,:128]! + vswp d13,d12 + vshr.s64 q7, q12, #63 + veor q12, q12, q6 + vand q7, q7, q5 + vld1.8 {q2}, [r7]! + subs r9, #0x10 + bmi .Lxts_dec_3 + veor q1, q1, q9 + vadd.u64 q13, q12, q12 + vst1.64 {q12}, [r0,:128]! + vswp d15,d14 + vshr.s64 q6, q13, #63 + veor q13, q13, q7 + vand q6, q6, q5 + vld1.8 {q3}, [r7]! + subs r9, #0x10 + bmi .Lxts_dec_4 + veor q2, q2, q10 + vadd.u64 q14, q13, q13 + vst1.64 {q13}, [r0,:128]! + vswp d13,d12 + vshr.s64 q7, q14, #63 + veor q14, q14, q6 + vand q7, q7, q5 + vld1.8 {q4}, [r7]! + subs r9, #0x10 + bmi .Lxts_dec_5 + veor q3, q3, q11 + vadd.u64 q15, q14, q14 + vst1.64 {q14}, [r0,:128]! + vswp d15,d14 + vshr.s64 q6, q15, #63 + veor q15, q15, q7 + vand q6, q6, q5 + vld1.8 {q5}, [r7]! + subs r9, #0x10 + bmi .Lxts_dec_6 + veor q4, q4, q12 + sub r9, #0x10 + vst1.64 {q15}, [r0,:128] @ next round tweak + + vld1.8 {q6}, [r7]! + veor q5, q5, q13 +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, r10, #248 @ pass key schedule +#endif + veor q6, q6, q14 + mov r5, r1 @ pass rounds + mov r0, sp + + bl _bsaes_decrypt8 + + vld1.64 {q8-q9}, [r0,:128]! + vld1.64 {q10-q11}, [r0,:128]! + veor q0, q0, q8 + vld1.64 {q12-q13}, [r0,:128]! + veor q1, q1, q9 + veor q8, q6, q10 + vst1.8 {q0-q1}, [r8]! + veor q9, q4, q11 + vld1.64 {q14}, [r0,:128]! + veor q10, q2, q12 + vst1.8 {q8-q9}, [r8]! + veor q11, q7, q13 + veor q12, q3, q14 + vst1.8 {q10-q11}, [r8]! + vst1.8 {q12}, [r8]! + + vld1.64 {q8}, [r0,:128] @ next round tweak + b .Lxts_dec_done +.align 4 +.Lxts_dec_6: + vst1.64 {q14}, [r0,:128] @ next round tweak + + veor q4, q4, q12 +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, r10, #248 @ pass key schedule +#endif + veor q5, q5, q13 + mov r5, r1 @ pass rounds + mov r0, sp + + bl _bsaes_decrypt8 + + vld1.64 {q8-q9}, [r0,:128]! + vld1.64 {q10-q11}, [r0,:128]! + veor q0, q0, q8 + vld1.64 {q12-q13}, [r0,:128]! + veor q1, q1, q9 + veor q8, q6, q10 + vst1.8 {q0-q1}, [r8]! + veor q9, q4, q11 + veor q10, q2, q12 + vst1.8 {q8-q9}, [r8]! + veor q11, q7, q13 + vst1.8 {q10-q11}, [r8]! + + vld1.64 {q8}, [r0,:128] @ next round tweak + b .Lxts_dec_done +.align 4 +.Lxts_dec_5: + vst1.64 {q13}, [r0,:128] @ next round tweak + + veor q3, q3, q11 +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, r10, #248 @ pass key schedule +#endif + veor q4, q4, q12 + mov r5, r1 @ pass rounds + mov r0, sp + + bl _bsaes_decrypt8 + + vld1.64 {q8-q9}, [r0,:128]! + vld1.64 {q10-q11}, [r0,:128]! + veor q0, q0, q8 + vld1.64 {q12}, [r0,:128]! + veor q1, q1, q9 + veor q8, q6, q10 + vst1.8 {q0-q1}, [r8]! + veor q9, q4, q11 + veor q10, q2, q12 + vst1.8 {q8-q9}, [r8]! + vst1.8 {q10}, [r8]! + + vld1.64 {q8}, [r0,:128] @ next round tweak + b .Lxts_dec_done +.align 4 +.Lxts_dec_4: + vst1.64 {q12}, [r0,:128] @ next round tweak + + veor q2, q2, q10 +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, r10, #248 @ pass key schedule +#endif + veor q3, q3, q11 + mov r5, r1 @ pass rounds + mov r0, sp + + bl _bsaes_decrypt8 + + vld1.64 {q8-q9}, [r0,:128]! + vld1.64 {q10-q11}, [r0,:128]! + veor q0, q0, q8 + veor q1, q1, q9 + veor q8, q6, q10 + vst1.8 {q0-q1}, [r8]! + veor q9, q4, q11 + vst1.8 {q8-q9}, [r8]! + + vld1.64 {q8}, [r0,:128] @ next round tweak + b .Lxts_dec_done +.align 4 +.Lxts_dec_3: + vst1.64 {q11}, [r0,:128] @ next round tweak + + veor q1, q1, q9 +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, r10, #248 @ pass key schedule +#endif + veor q2, q2, q10 + mov r5, r1 @ pass rounds + mov r0, sp + + bl _bsaes_decrypt8 + + vld1.64 {q8-q9}, [r0,:128]! + vld1.64 {q10}, [r0,:128]! + veor q0, q0, q8 + veor q1, q1, q9 + veor q8, q6, q10 + vst1.8 {q0-q1}, [r8]! + vst1.8 {q8}, [r8]! + + vld1.64 {q8}, [r0,:128] @ next round tweak + b .Lxts_dec_done +.align 4 +.Lxts_dec_2: + vst1.64 {q10}, [r0,:128] @ next round tweak + + veor q0, q0, q8 +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, r10, #248 @ pass key schedule +#endif + veor q1, q1, q9 + mov r5, r1 @ pass rounds + mov r0, sp + + bl _bsaes_decrypt8 + + vld1.64 {q8-q9}, [r0,:128]! + veor q0, q0, q8 + veor q1, q1, q9 + vst1.8 {q0-q1}, [r8]! + + vld1.64 {q8}, [r0,:128] @ next round tweak + b .Lxts_dec_done +.align 4 +.Lxts_dec_1: + mov r0, sp + veor q0, q8 + mov r1, sp + vst1.8 {q0}, [sp,:128] + mov r2, r10 + mov r4, r3 @ preserve fp + mov r5, r2 @ preserve magic + + bl AES_decrypt + + vld1.8 {q0}, [sp,:128] + veor q0, q0, q8 + vst1.8 {q0}, [r8]! + mov r3, r4 + mov r2, r5 + + vmov q8, q9 @ next round tweak + +.Lxts_dec_done: +#ifndef XTS_CHAIN_TWEAK + adds r9, #0x10 + beq .Lxts_dec_ret + + @ calculate one round of extra tweak for the stolen ciphertext + vldmia r2, {q5} + vshr.s64 q6, q8, #63 + vand q6, q6, q5 + vadd.u64 q9, q8, q8 + vswp d13,d12 + veor q9, q9, q6 + + @ perform the final decryption with the last tweak value + vld1.8 {q0}, [r7]! + mov r0, sp + veor q0, q0, q9 + mov r1, sp + vst1.8 {q0}, [sp,:128] + mov r2, r10 + mov r4, r3 @ preserve fp + + bl AES_decrypt + + vld1.8 {q0}, [sp,:128] + veor q0, q0, q9 + vst1.8 {q0}, [r8] + + mov r6, r8 +.Lxts_dec_steal: + ldrb r1, [r8] + ldrb r0, [r7], #1 + strb r1, [r8, #0x10] + strb r0, [r8], #1 + + subs r9, #1 + bhi .Lxts_dec_steal + + vld1.8 {q0}, [r6] + mov r0, sp + veor q0, q8 + mov r1, sp + vst1.8 {q0}, [sp,:128] + mov r2, r10 + + bl AES_decrypt + + vld1.8 {q0}, [sp,:128] + veor q0, q0, q8 + vst1.8 {q0}, [r6] + mov r3, r4 +#endif + +.Lxts_dec_ret: + bic r0, r3, #0xf + vmov.i32 q0, #0 + vmov.i32 q1, #0 +#ifdef XTS_CHAIN_TWEAK + ldr r1, [r3, #0x20+VFP_ABI_FRAME] @ chain tweak +#endif +.Lxts_dec_bzero: @ wipe key schedule [if any] + vstmia sp!, {q0-q1} + cmp sp, r0 + bne .Lxts_dec_bzero + + mov sp, r3 +#ifdef XTS_CHAIN_TWEAK + vst1.8 {q8}, [r1] +#endif + VFP_ABI_POP + ldmia sp!, {r4-r10, pc} @ return + +.size bsaes_xts_decrypt,.-bsaes_xts_decrypt +#endif diff --git a/arch/arm/crypto/aesbs-glue.c b/arch/arm/crypto/aesbs-glue.c new file mode 100644 index 0000000..4522366 --- /dev/null +++ b/arch/arm/crypto/aesbs-glue.c @@ -0,0 +1,434 @@ +/* + * linux/arch/arm/crypto/aesbs-glue.c - glue code for NEON bit sliced AES + * + * Copyright (C) 2013 Linaro Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include + +#include "aes_glue.h" + +#define BIT_SLICED_KEY_MAXSIZE (128 * (AES_MAXNR - 1) + 2 * AES_BLOCK_SIZE) + +struct BS_KEY { + struct AES_KEY rk; + int converted; + u8 __aligned(8) bs[BIT_SLICED_KEY_MAXSIZE]; +} __aligned(8); + +asmlinkage void bsaes_enc_key_convert(u8 out[], struct AES_KEY const *in); +asmlinkage void bsaes_dec_key_convert(u8 out[], struct AES_KEY const *in); + +asmlinkage void bsaes_cbc_encrypt(u8 const in[], u8 out[], u32 bytes, + struct BS_KEY *key, u8 iv[]); + +asmlinkage void bsaes_ctr32_encrypt_blocks(u8 const in[], u8 out[], u32 blocks, + struct BS_KEY *key, u8 const iv[]); + +asmlinkage void bsaes_xts_encrypt(u8 const in[], u8 out[], u32 bytes, + struct BS_KEY *key, u8 tweak[]); + +asmlinkage void bsaes_xts_decrypt(u8 const in[], u8 out[], u32 bytes, + struct BS_KEY *key, u8 tweak[]); + +struct aesbs_cbc_ctx { + struct AES_KEY enc; + struct BS_KEY dec; +}; + +struct aesbs_ctr_ctx { + struct BS_KEY enc; +}; + +struct aesbs_xts_ctx { + struct BS_KEY enc; + struct BS_KEY dec; + struct AES_KEY twkey; +}; + +static int aesbs_cbc_set_key(struct crypto_tfm *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm); + int bits = key_len * 8; + + if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc)) { + tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + ctx->dec.rk = ctx->enc; + private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk); + ctx->dec.converted = 0; + return 0; +} + +static int aesbs_ctr_set_key(struct crypto_tfm *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct aesbs_ctr_ctx *ctx = crypto_tfm_ctx(tfm); + int bits = key_len * 8; + + if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) { + tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + ctx->enc.converted = 0; + return 0; +} + +static int aesbs_xts_set_key(struct crypto_tfm *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm); + int bits = key_len * 4; + + if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) { + tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + ctx->dec.rk = ctx->enc.rk; + private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk); + private_AES_set_encrypt_key(in_key + key_len / 2, bits, &ctx->twkey); + ctx->enc.converted = ctx->dec.converted = 0; + return 0; +} + +static int aesbs_cbc_encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while (walk.nbytes) { + u32 blocks = walk.nbytes / AES_BLOCK_SIZE; + u8 *src = walk.src.virt.addr; + + if (walk.dst.virt.addr == walk.src.virt.addr) { + u8 *iv = walk.iv; + + do { + crypto_xor(src, iv, AES_BLOCK_SIZE); + AES_encrypt(src, src, &ctx->enc); + iv = src; + src += AES_BLOCK_SIZE; + } while (--blocks); + memcpy(walk.iv, iv, AES_BLOCK_SIZE); + } else { + u8 *dst = walk.dst.virt.addr; + + do { + crypto_xor(walk.iv, src, AES_BLOCK_SIZE); + AES_encrypt(walk.iv, dst, &ctx->enc); + memcpy(walk.iv, dst, AES_BLOCK_SIZE); + src += AES_BLOCK_SIZE; + dst += AES_BLOCK_SIZE; + } while (--blocks); + } + err = blkcipher_walk_done(desc, &walk, 0); + } + return err; +} + +static int aesbs_cbc_decrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE); + + while ((walk.nbytes / AES_BLOCK_SIZE) >= 8) { + kernel_neon_begin(); + bsaes_cbc_encrypt(walk.src.virt.addr, walk.dst.virt.addr, + walk.nbytes, &ctx->dec, walk.iv); + kernel_neon_end(); + err = blkcipher_walk_done(desc, &walk, 0); + } + while (walk.nbytes) { + u32 blocks = walk.nbytes / AES_BLOCK_SIZE; + u8 *dst = walk.dst.virt.addr; + u8 *src = walk.src.virt.addr; + u8 bk[2][AES_BLOCK_SIZE]; + u8 *iv = walk.iv; + + do { + if (walk.dst.virt.addr == walk.src.virt.addr) + memcpy(bk[blocks & 1], src, AES_BLOCK_SIZE); + + AES_decrypt(src, dst, &ctx->dec.rk); + crypto_xor(dst, iv, AES_BLOCK_SIZE); + + if (walk.dst.virt.addr == walk.src.virt.addr) + iv = bk[blocks & 1]; + else + iv = src; + + dst += AES_BLOCK_SIZE; + src += AES_BLOCK_SIZE; + } while (--blocks); + err = blkcipher_walk_done(desc, &walk, 0); + } + return err; +} + +static void inc_be128_ctr(__be32 ctr[], u32 addend) +{ + int i; + + for (i = 3; i >= 0; i--, addend = 1) { + u32 n = be32_to_cpu(ctr[i]) + addend; + + ctr[i] = cpu_to_be32(n); + if (n >= addend) + break; + } +} + +static int aesbs_ctr_encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct aesbs_ctr_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + u32 blocks; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE); + + while ((blocks = walk.nbytes / AES_BLOCK_SIZE)) { + u32 tail = walk.nbytes % AES_BLOCK_SIZE; + __be32 *ctr = (__be32 *)walk.iv; + u32 headroom = UINT_MAX - be32_to_cpu(ctr[3]); + + /* avoid 32 bit counter overflow in the NEON code */ + if (unlikely(headroom < blocks)) { + blocks = headroom + 1; + tail = walk.nbytes - blocks * AES_BLOCK_SIZE; + } + kernel_neon_begin(); + bsaes_ctr32_encrypt_blocks(walk.src.virt.addr, + walk.dst.virt.addr, blocks, + &ctx->enc, walk.iv); + kernel_neon_end(); + inc_be128_ctr(ctr, blocks); + + nbytes -= blocks * AES_BLOCK_SIZE; + if (nbytes && nbytes == tail && nbytes <= AES_BLOCK_SIZE) + break; + + err = blkcipher_walk_done(desc, &walk, tail); + } + if (walk.nbytes) { + u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE; + u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE; + u8 ks[AES_BLOCK_SIZE]; + + AES_encrypt(walk.iv, ks, &ctx->enc.rk); + if (tdst != tsrc) + memcpy(tdst, tsrc, nbytes); + crypto_xor(tdst, ks, nbytes); + err = blkcipher_walk_done(desc, &walk, 0); + } + return err; +} + +static int aesbs_xts_encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE); + + /* generate the initial tweak */ + AES_encrypt(walk.iv, walk.iv, &ctx->twkey); + + while (walk.nbytes) { + kernel_neon_begin(); + bsaes_xts_encrypt(walk.src.virt.addr, walk.dst.virt.addr, + walk.nbytes, &ctx->enc, walk.iv); + kernel_neon_end(); + err = blkcipher_walk_done(desc, &walk, 0); + } + return err; +} + +static int aesbs_xts_decrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE); + + /* generate the initial tweak */ + AES_encrypt(walk.iv, walk.iv, &ctx->twkey); + + while (walk.nbytes) { + kernel_neon_begin(); + bsaes_xts_decrypt(walk.src.virt.addr, walk.dst.virt.addr, + walk.nbytes, &ctx->dec, walk.iv); + kernel_neon_end(); + err = blkcipher_walk_done(desc, &walk, 0); + } + return err; +} + +static struct crypto_alg aesbs_algs[] = { { + .cra_name = "__cbc-aes-neonbs", + .cra_driver_name = "__driver-cbc-aes-neonbs", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct aesbs_cbc_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = aesbs_cbc_set_key, + .encrypt = aesbs_cbc_encrypt, + .decrypt = aesbs_cbc_decrypt, + }, +}, { + .cra_name = "__ctr-aes-neonbs", + .cra_driver_name = "__driver-ctr-aes-neonbs", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct aesbs_ctr_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = aesbs_ctr_set_key, + .encrypt = aesbs_ctr_encrypt, + .decrypt = aesbs_ctr_encrypt, + }, +}, { + .cra_name = "__xts-aes-neonbs", + .cra_driver_name = "__driver-xts-aes-neonbs", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct aesbs_xts_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_blkcipher = { + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = aesbs_xts_set_key, + .encrypt = aesbs_xts_encrypt, + .decrypt = aesbs_xts_decrypt, + }, +}, { + .cra_name = "cbc(aes)", + .cra_driver_name = "cbc-aes-neonbs", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = __ablk_encrypt, + .decrypt = ablk_decrypt, + } +}, { + .cra_name = "ctr(aes)", + .cra_driver_name = "ctr-aes-neonbs", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + } +}, { + .cra_name = "xts(aes)", + .cra_driver_name = "xts-aes-neonbs", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_ablkcipher = { + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + } +} }; + +static int __init aesbs_mod_init(void) +{ + if (!cpu_has_neon()) + return -ENODEV; + + return crypto_register_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs)); +} + +static void __exit aesbs_mod_exit(void) +{ + crypto_unregister_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs)); +} + +module_init(aesbs_mod_init); +module_exit(aesbs_mod_exit); + +MODULE_DESCRIPTION("Bit sliced AES in CBC/CTR/XTS modes using NEON"); +MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_LICENSE("GPL"); diff --git a/arch/arm/crypto/bsaes-armv7.pl b/arch/arm/crypto/bsaes-armv7.pl new file mode 100644 index 0000000..f3d96d9 --- /dev/null +++ b/arch/arm/crypto/bsaes-armv7.pl @@ -0,0 +1,2467 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# +# Specific modes and adaptation for Linux kernel by Ard Biesheuvel +# . Permission to use under GPL terms is +# granted. +# ==================================================================== + +# Bit-sliced AES for ARM NEON +# +# February 2012. +# +# This implementation is direct adaptation of bsaes-x86_64 module for +# ARM NEON. Except that this module is endian-neutral [in sense that +# it can be compiled for either endianness] by courtesy of vld1.8's +# neutrality. Initial version doesn't implement interface to OpenSSL, +# only low-level primitives and unsupported entry points, just enough +# to collect performance results, which for Cortex-A8 core are: +# +# encrypt 19.5 cycles per byte processed with 128-bit key +# decrypt 22.1 cycles per byte processed with 128-bit key +# key conv. 440 cycles per 128-bit key/0.18 of 8x block +# +# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7, +# which is [much] worse than anticipated (for further details see +# http://www.openssl.org/~appro/Snapdragon-S4.html). +# +# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code +# manages in 20.0 cycles]. +# +# When comparing to x86_64 results keep in mind that NEON unit is +# [mostly] single-issue and thus can't [fully] benefit from +# instruction-level parallelism. And when comparing to aes-armv4 +# results keep in mind key schedule conversion overhead (see +# bsaes-x86_64.pl for further details)... +# +# + +# April-August 2013 +# +# Add CBC, CTR and XTS subroutines, adapt for kernel use. +# +# + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +my ($inp,$out,$len,$key)=("r0","r1","r2","r3"); +my @XMM=map("q$_",(0..15)); + +{ +my ($key,$rounds,$const)=("r4","r5","r6"); + +sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } +sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } + +sub Sbox { +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb +# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb +my @b=@_[0..7]; +my @t=@_[8..11]; +my @s=@_[12..15]; + &InBasisChange (@b); + &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); + &OutBasisChange (@b[7,1,4,2,6,5,0,3]); +} + +sub InBasisChange { +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb +# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb +my @b=@_[0..7]; +$code.=<<___; + veor @b[2], @b[2], @b[1] + veor @b[5], @b[5], @b[6] + veor @b[3], @b[3], @b[0] + veor @b[6], @b[6], @b[2] + veor @b[5], @b[5], @b[0] + + veor @b[6], @b[6], @b[3] + veor @b[3], @b[3], @b[7] + veor @b[7], @b[7], @b[5] + veor @b[3], @b[3], @b[4] + veor @b[4], @b[4], @b[5] + + veor @b[2], @b[2], @b[7] + veor @b[3], @b[3], @b[1] + veor @b[1], @b[1], @b[5] +___ +} + +sub OutBasisChange { +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb +# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb +my @b=@_[0..7]; +$code.=<<___; + veor @b[0], @b[0], @b[6] + veor @b[1], @b[1], @b[4] + veor @b[4], @b[4], @b[6] + veor @b[2], @b[2], @b[0] + veor @b[6], @b[6], @b[1] + + veor @b[1], @b[1], @b[5] + veor @b[5], @b[5], @b[3] + veor @b[3], @b[3], @b[7] + veor @b[7], @b[7], @b[5] + veor @b[2], @b[2], @b[5] + + veor @b[4], @b[4], @b[7] +___ +} + +sub InvSbox { +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb +# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb +my @b=@_[0..7]; +my @t=@_[8..11]; +my @s=@_[12..15]; + &InvInBasisChange (@b); + &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); + &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]); +} + +sub InvInBasisChange { # OutBasisChange in reverse (with twist) +my @b=@_[5,1,2,6,3,7,0,4]; +$code.=<<___ + veor @b[1], @b[1], @b[7] + veor @b[4], @b[4], @b[7] + + veor @b[7], @b[7], @b[5] + veor @b[1], @b[1], @b[3] + veor @b[2], @b[2], @b[5] + veor @b[3], @b[3], @b[7] + + veor @b[6], @b[6], @b[1] + veor @b[2], @b[2], @b[0] + veor @b[5], @b[5], @b[3] + veor @b[4], @b[4], @b[6] + veor @b[0], @b[0], @b[6] + veor @b[1], @b[1], @b[4] +___ +} + +sub InvOutBasisChange { # InBasisChange in reverse +my @b=@_[2,5,7,3,6,1,0,4]; +$code.=<<___; + veor @b[1], @b[1], @b[5] + veor @b[2], @b[2], @b[7] + + veor @b[3], @b[3], @b[1] + veor @b[4], @b[4], @b[5] + veor @b[7], @b[7], @b[5] + veor @b[3], @b[3], @b[4] + veor @b[5], @b[5], @b[0] + veor @b[3], @b[3], @b[7] + veor @b[6], @b[6], @b[2] + veor @b[2], @b[2], @b[1] + veor @b[6], @b[6], @b[3] + + veor @b[3], @b[3], @b[0] + veor @b[5], @b[5], @b[6] +___ +} + +sub Mul_GF4 { +#;************************************************************* +#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * +#;************************************************************* +my ($x0,$x1,$y0,$y1,$t0,$t1)=@_; +$code.=<<___; + veor $t0, $y0, $y1 + vand $t0, $t0, $x0 + veor $x0, $x0, $x1 + vand $t1, $x1, $y0 + vand $x0, $x0, $y1 + veor $x1, $t1, $t0 + veor $x0, $x0, $t1 +___ +} + +sub Mul_GF4_N { # not used, see next subroutine +# multiply and scale by N +my ($x0,$x1,$y0,$y1,$t0)=@_; +$code.=<<___; + veor $t0, $y0, $y1 + vand $t0, $t0, $x0 + veor $x0, $x0, $x1 + vand $x1, $x1, $y0 + vand $x0, $x0, $y1 + veor $x1, $x1, $x0 + veor $x0, $x0, $t0 +___ +} + +sub Mul_GF4_N_GF4 { +# interleaved Mul_GF4_N and Mul_GF4 +my ($x0,$x1,$y0,$y1,$t0, + $x2,$x3,$y2,$y3,$t1)=@_; +$code.=<<___; + veor $t0, $y0, $y1 + veor $t1, $y2, $y3 + vand $t0, $t0, $x0 + vand $t1, $t1, $x2 + veor $x0, $x0, $x1 + veor $x2, $x2, $x3 + vand $x1, $x1, $y0 + vand $x3, $x3, $y2 + vand $x0, $x0, $y1 + vand $x2, $x2, $y3 + veor $x1, $x1, $x0 + veor $x2, $x2, $x3 + veor $x0, $x0, $t0 + veor $x3, $x3, $t1 +___ +} +sub Mul_GF16_2 { +my @x=@_[0..7]; +my @y=@_[8..11]; +my @t=@_[12..15]; +$code.=<<___; + veor @t[0], @x[0], @x[2] + veor @t[1], @x[1], @x[3] +___ + &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]); +$code.=<<___; + veor @y[0], @y[0], @y[2] + veor @y[1], @y[1], @y[3] +___ + Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], + @x[2], @x[3], @y[2], @y[3], @t[2]); +$code.=<<___; + veor @x[0], @x[0], @t[0] + veor @x[2], @x[2], @t[0] + veor @x[1], @x[1], @t[1] + veor @x[3], @x[3], @t[1] + + veor @t[0], @x[4], @x[6] + veor @t[1], @x[5], @x[7] +___ + &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], + @x[6], @x[7], @y[2], @y[3], @t[2]); +$code.=<<___; + veor @y[0], @y[0], @y[2] + veor @y[1], @y[1], @y[3] +___ + &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]); +$code.=<<___; + veor @x[4], @x[4], @t[0] + veor @x[6], @x[6], @t[0] + veor @x[5], @x[5], @t[1] + veor @x[7], @x[7], @t[1] +___ +} +sub Inv_GF256 { +#;******************************************************************** +#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * +#;******************************************************************** +my @x=@_[0..7]; +my @t=@_[8..11]; +my @s=@_[12..15]; +# direct optimizations from hardware +$code.=<<___; + veor @t[3], @x[4], @x[6] + veor @t[2], @x[5], @x[7] + veor @t[1], @x[1], @x[3] + veor @s[1], @x[7], @x[6] + vmov @t[0], @t[2] + veor @s[0], @x[0], @x[2] + + vorr @t[2], @t[2], @t[1] + veor @s[3], @t[3], @t[0] + vand @s[2], @t[3], @s[0] + vorr @t[3], @t[3], @s[0] + veor @s[0], @s[0], @t[1] + vand @t[0], @t[0], @t[1] + veor @t[1], @x[3], @x[2] + vand @s[3], @s[3], @s[0] + vand @s[1], @s[1], @t[1] + veor @t[1], @x[4], @x[5] + veor @s[0], @x[1], @x[0] + veor @t[3], @t[3], @s[1] + veor @t[2], @t[2], @s[1] + vand @s[1], @t[1], @s[0] + vorr @t[1], @t[1], @s[0] + veor @t[3], @t[3], @s[3] + veor @t[0], @t[0], @s[1] + veor @t[2], @t[2], @s[2] + veor @t[1], @t[1], @s[3] + veor @t[0], @t[0], @s[2] + vand @s[0], @x[7], @x[3] + veor @t[1], @t[1], @s[2] + vand @s[1], @x[6], @x[2] + vand @s[2], @x[5], @x[1] + vorr @s[3], @x[4], @x[0] + veor @t[3], @t[3], @s[0] + veor @t[1], @t[1], @s[2] + veor @t[0], @t[0], @s[3] + veor @t[2], @t[2], @s[1] + + @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 + + @ new smaller inversion + + vand @s[2], @t[3], @t[1] + vmov @s[0], @t[0] + + veor @s[1], @t[2], @s[2] + veor @s[3], @t[0], @s[2] + veor @s[2], @t[0], @s[2] @ @s[2]=@s[3] + + vbsl @s[1], @t[1], @t[0] + vbsl @s[3], @t[3], @t[2] + veor @t[3], @t[3], @t[2] + + vbsl @s[0], @s[1], @s[2] + vbsl @t[0], @s[2], @s[1] + + vand @s[2], @s[0], @s[3] + veor @t[1], @t[1], @t[0] + + veor @s[2], @s[2], @t[3] +___ +# output in s3, s2, s1, t1 + +# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3 + +# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 + &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); + +### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb +} + +# AES linear components + +sub ShiftRows { +my @x=@_[0..7]; +my @t=@_[8..11]; +my $mask=pop; +$code.=<<___; + vldmia $key!, {@t[0]-@t[3]} + veor @t[0], @t[0], @x[0] + veor @t[1], @t[1], @x[1] + vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)` + vldmia $key!, {@t[0]} + veor @t[2], @t[2], @x[2] + vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)` + vldmia $key!, {@t[1]} + veor @t[3], @t[3], @x[3] + vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)` + vldmia $key!, {@t[2]} + vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)` + vldmia $key!, {@t[3]} + veor @t[0], @t[0], @x[4] + veor @t[1], @t[1], @x[5] + vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)` + veor @t[2], @t[2], @x[6] + vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)` + veor @t[3], @t[3], @x[7] + vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)` + vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)` + vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)` +___ +} + +sub MixColumns { +# modified to emit output in order suitable for feeding back to aesenc[last] +my @x=@_[0..7]; +my @t=@_[8..15]; +my $inv=@_[16]; # optional +$code.=<<___; + vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32 + vext.8 @t[1], @x[1], @x[1], #12 + veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32) + vext.8 @t[2], @x[2], @x[2], #12 + veor @x[1], @x[1], @t[1] + vext.8 @t[3], @x[3], @x[3], #12 + veor @x[2], @x[2], @t[2] + vext.8 @t[4], @x[4], @x[4], #12 + veor @x[3], @x[3], @t[3] + vext.8 @t[5], @x[5], @x[5], #12 + veor @x[4], @x[4], @t[4] + vext.8 @t[6], @x[6], @x[6], #12 + veor @x[5], @x[5], @t[5] + vext.8 @t[7], @x[7], @x[7], #12 + veor @x[6], @x[6], @t[6] + + veor @t[1], @t[1], @x[0] + veor @x[7], @x[7], @t[7] + vext.8 @x[0], @x[0], @x[0], #8 @ (x0 ^ (x0 <<< 32)) <<< 64) + veor @t[2], @t[2], @x[1] + veor @t[0], @t[0], @x[7] + veor @t[1], @t[1], @x[7] + vext.8 @x[1], @x[1], @x[1], #8 + veor @t[5], @t[5], @x[4] + veor @x[0], @x[0], @t[0] + veor @t[6], @t[6], @x[5] + veor @x[1], @x[1], @t[1] + vext.8 @t[0], @x[4], @x[4], #8 + veor @t[4], @t[4], @x[3] + vext.8 @t[1], @x[5], @x[5], #8 + veor @t[7], @t[7], @x[6] + vext.8 @x[4], @x[3], @x[3], #8 + veor @t[3], @t[3], @x[2] + vext.8 @x[5], @x[7], @x[7], #8 + veor @t[4], @t[4], @x[7] + vext.8 @x[3], @x[6], @x[6], #8 + veor @t[3], @t[3], @x[7] + vext.8 @x[6], @x[2], @x[2], #8 + veor @x[7], @t[1], @t[5] +___ +$code.=<<___ if (!$inv); + veor @x[2], @t[0], @t[4] + veor @x[4], @x[4], @t[3] + veor @x[5], @x[5], @t[7] + veor @x[3], @x[3], @t[6] + @ vmov @x[2], @t[0] + veor @x[6], @x[6], @t[2] + @ vmov @x[7], @t[1] +___ +$code.=<<___ if ($inv); + veor @t[3], @t[3], @x[4] + veor @x[5], @x[5], @t[7] + veor @x[2], @x[3], @t[6] + veor @x[3], @t[0], @t[4] + veor @x[4], @x[6], @t[2] + vmov @x[6], @t[3] + @ vmov @x[7], @t[1] +___ +} + +sub InvMixColumns_orig { +my @x=@_[0..7]; +my @t=@_[8..15]; + +$code.=<<___; + @ multiplication by 0x0e + vext.8 @t[7], @x[7], @x[7], #12 + vmov @t[2], @x[2] + veor @x[2], @x[2], @x[5] @ 2 5 + veor @x[7], @x[7], @x[5] @ 7 5 + vext.8 @t[0], @x[0], @x[0], #12 + vmov @t[5], @x[5] + veor @x[5], @x[5], @x[0] @ 5 0 [1] + veor @x[0], @x[0], @x[1] @ 0 1 + vext.8 @t[1], @x[1], @x[1], #12 + veor @x[1], @x[1], @x[2] @ 1 25 + veor @x[0], @x[0], @x[6] @ 01 6 [2] + vext.8 @t[3], @x[3], @x[3], #12 + veor @x[1], @x[1], @x[3] @ 125 3 [4] + veor @x[2], @x[2], @x[0] @ 25 016 [3] + veor @x[3], @x[3], @x[7] @ 3 75 + veor @x[7], @x[7], @x[6] @ 75 6 [0] + vext.8 @t[6], @x[6], @x[6], #12 + vmov @t[4], @x[4] + veor @x[6], @x[6], @x[4] @ 6 4 + veor @x[4], @x[4], @x[3] @ 4 375 [6] + veor @x[3], @x[3], @x[7] @ 375 756=36 + veor @x[6], @x[6], @t[5] @ 64 5 [7] + veor @x[3], @x[3], @t[2] @ 36 2 + vext.8 @t[5], @t[5], @t[5], #12 + veor @x[3], @x[3], @t[4] @ 362 4 [5] +___ + my @y = @x[7,5,0,2,1,3,4,6]; +$code.=<<___; + @ multiplication by 0x0b + veor @y[1], @y[1], @y[0] + veor @y[0], @y[0], @t[0] + vext.8 @t[2], @t[2], @t[2], #12 + veor @y[1], @y[1], @t[1] + veor @y[0], @y[0], @t[5] + vext.8 @t[4], @t[4], @t[4], #12 + veor @y[1], @y[1], @t[6] + veor @y[0], @y[0], @t[7] + veor @t[7], @t[7], @t[6] @ clobber t[7] + + veor @y[3], @y[3], @t[0] + veor @y[1], @y[1], @y[0] + vext.8 @t[0], @t[0], @t[0], #12 + veor @y[2], @y[2], @t[1] + veor @y[4], @y[4], @t[1] + vext.8 @t[1], @t[1], @t[1], #12 + veor @y[2], @y[2], @t[2] + veor @y[3], @y[3], @t[2] + veor @y[5], @y[5], @t[2] + veor @y[2], @y[2], @t[7] + vext.8 @t[2], @t[2], @t[2], #12 + veor @y[3], @y[3], @t[3] + veor @y[6], @y[6], @t[3] + veor @y[4], @y[4], @t[3] + veor @y[7], @y[7], @t[4] + vext.8 @t[3], @t[3], @t[3], #12 + veor @y[5], @y[5], @t[4] + veor @y[7], @y[7], @t[7] + veor @t[7], @t[7], @t[5] @ clobber t[7] even more + veor @y[3], @y[3], @t[5] + veor @y[4], @y[4], @t[4] + + veor @y[5], @y[5], @t[7] + vext.8 @t[4], @t[4], @t[4], #12 + veor @y[6], @y[6], @t[7] + veor @y[4], @y[4], @t[7] + + veor @t[7], @t[7], @t[5] + vext.8 @t[5], @t[5], @t[5], #12 + + @ multiplication by 0x0d + veor @y[4], @y[4], @y[7] + veor @t[7], @t[7], @t[6] @ restore t[7] + veor @y[7], @y[7], @t[4] + vext.8 @t[6], @t[6], @t[6], #12 + veor @y[2], @y[2], @t[0] + veor @y[7], @y[7], @t[5] + vext.8 @t[7], @t[7], @t[7], #12 + veor @y[2], @y[2], @t[2] + + veor @y[3], @y[3], @y[1] + veor @y[1], @y[1], @t[1] + veor @y[0], @y[0], @t[0] + veor @y[3], @y[3], @t[0] + veor @y[1], @y[1], @t[5] + veor @y[0], @y[0], @t[5] + vext.8 @t[0], @t[0], @t[0], #12 + veor @y[1], @y[1], @t[7] + veor @y[0], @y[0], @t[6] + veor @y[3], @y[3], @y[1] + veor @y[4], @y[4], @t[1] + vext.8 @t[1], @t[1], @t[1], #12 + + veor @y[7], @y[7], @t[7] + veor @y[4], @y[4], @t[2] + veor @y[5], @y[5], @t[2] + veor @y[2], @y[2], @t[6] + veor @t[6], @t[6], @t[3] @ clobber t[6] + vext.8 @t[2], @t[2], @t[2], #12 + veor @y[4], @y[4], @y[7] + veor @y[3], @y[3], @t[6] + + veor @y[6], @y[6], @t[6] + veor @y[5], @y[5], @t[5] + vext.8 @t[5], @t[5], @t[5], #12 + veor @y[6], @y[6], @t[4] + vext.8 @t[4], @t[4], @t[4], #12 + veor @y[5], @y[5], @t[6] + veor @y[6], @y[6], @t[7] + vext.8 @t[7], @t[7], @t[7], #12 + veor @t[6], @t[6], @t[3] @ restore t[6] + vext.8 @t[3], @t[3], @t[3], #12 + + @ multiplication by 0x09 + veor @y[4], @y[4], @y[1] + veor @t[1], @t[1], @y[1] @ t[1]=y[1] + veor @t[0], @t[0], @t[5] @ clobber t[0] + vext.8 @t[6], @t[6], @t[6], #12 + veor @t[1], @t[1], @t[5] + veor @y[3], @y[3], @t[0] + veor @t[0], @t[0], @y[0] @ t[0]=y[0] + veor @t[1], @t[1], @t[6] + veor @t[6], @t[6], @t[7] @ clobber t[6] + veor @y[4], @y[4], @t[1] + veor @y[7], @y[7], @t[4] + veor @y[6], @y[6], @t[3] + veor @y[5], @y[5], @t[2] + veor @t[4], @t[4], @y[4] @ t[4]=y[4] + veor @t[3], @t[3], @y[3] @ t[3]=y[3] + veor @t[5], @t[5], @y[5] @ t[5]=y[5] + veor @t[2], @t[2], @y[2] @ t[2]=y[2] + veor @t[3], @t[3], @t[7] + veor @XMM[5], @t[5], @t[6] + veor @XMM[6], @t[6], @y[6] @ t[6]=y[6] + veor @XMM[2], @t[2], @t[6] + veor @XMM[7], @t[7], @y[7] @ t[7]=y[7] + + vmov @XMM[0], @t[0] + vmov @XMM[1], @t[1] + @ vmov @XMM[2], @t[2] + vmov @XMM[3], @t[3] + vmov @XMM[4], @t[4] + @ vmov @XMM[5], @t[5] + @ vmov @XMM[6], @t[6] + @ vmov @XMM[7], @t[7] +___ +} + +sub InvMixColumns { +my @x=@_[0..7]; +my @t=@_[8..15]; + +# Thanks to Jussi Kivilinna for providing pointer to +# +# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 | +# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 | +# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 | +# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 | + +$code.=<<___; + @ multiplication by 0x05-0x00-0x04-0x00 + vext.8 @t[0], @x[0], @x[0], #8 + vext.8 @t[6], @x[6], @x[6], #8 + vext.8 @t[7], @x[7], @x[7], #8 + veor @t[0], @t[0], @x[0] + vext.8 @t[1], @x[1], @x[1], #8 + veor @t[6], @t[6], @x[6] + vext.8 @t[2], @x[2], @x[2], #8 + veor @t[7], @t[7], @x[7] + vext.8 @t[3], @x[3], @x[3], #8 + veor @t[1], @t[1], @x[1] + vext.8 @t[4], @x[4], @x[4], #8 + veor @t[2], @t[2], @x[2] + vext.8 @t[5], @x[5], @x[5], #8 + veor @t[3], @t[3], @x[3] + veor @t[4], @t[4], @x[4] + veor @t[5], @t[5], @x[5] + + veor @x[0], @x[0], @t[6] + veor @x[1], @x[1], @t[6] + veor @x[2], @x[2], @t[0] + veor @x[4], @x[4], @t[2] + veor @x[3], @x[3], @t[1] + veor @x[1], @x[1], @t[7] + veor @x[2], @x[2], @t[7] + veor @x[4], @x[4], @t[6] + veor @x[5], @x[5], @t[3] + veor @x[3], @x[3], @t[6] + veor @x[6], @x[6], @t[4] + veor @x[4], @x[4], @t[7] + veor @x[5], @x[5], @t[7] + veor @x[7], @x[7], @t[5] +___ + &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6 +} + +sub swapmove { +my ($a,$b,$n,$mask,$t)=@_; +$code.=<<___; + vshr.u64 $t, $b, #$n + veor $t, $t, $a + vand $t, $t, $mask + veor $a, $a, $t + vshl.u64 $t, $t, #$n + veor $b, $b, $t +___ +} +sub swapmove2x { +my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; +$code.=<<___; + vshr.u64 $t0, $b0, #$n + vshr.u64 $t1, $b1, #$n + veor $t0, $t0, $a0 + veor $t1, $t1, $a1 + vand $t0, $t0, $mask + vand $t1, $t1, $mask + veor $a0, $a0, $t0 + vshl.u64 $t0, $t0, #$n + veor $a1, $a1, $t1 + vshl.u64 $t1, $t1, #$n + veor $b0, $b0, $t0 + veor $b1, $b1, $t1 +___ +} + +sub bitslice { +my @x=reverse(@_[0..7]); +my ($t0,$t1,$t2,$t3)=@_[8..11]; +$code.=<<___; + vmov.i8 $t0,#0x55 @ compose .LBS0 + vmov.i8 $t1,#0x33 @ compose .LBS1 +___ + &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3); + &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); +$code.=<<___; + vmov.i8 $t0,#0x0f @ compose .LBS2 +___ + &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3); + &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); + + &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); + &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3); +} + +$code.=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" + +# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} +# define VFP_ABI_POP vldmia sp!,{d8-d15} +# define VFP_ABI_FRAME 0x40 +#else +# define VFP_ABI_PUSH +# define VFP_ABI_POP +# define VFP_ABI_FRAME 0 +# define BSAES_ASM_EXTENDED_KEY +# define XTS_CHAIN_TWEAK +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +#endif + +#ifdef __thumb__ +# define adrl adr +#endif + +#if __ARM_ARCH__>=7 +.text +.syntax unified @ ARMv7-capable assembler is expected to handle this +#ifdef __thumb2__ +.thumb +#else +.code 32 +#endif + +.fpu neon + +.type _bsaes_decrypt8,%function +.align 4 +_bsaes_decrypt8: + adr $const,_bsaes_decrypt8 + vldmia $key!, {@XMM[9]} @ round 0 key + add $const,$const,#.LM0ISR-_bsaes_decrypt8 + + vldmia $const!, {@XMM[8]} @ .LM0ISR + veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key + veor @XMM[11], @XMM[1], @XMM[9] + vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])` + veor @XMM[12], @XMM[2], @XMM[9] + vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])` + veor @XMM[13], @XMM[3], @XMM[9] + vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])` + veor @XMM[14], @XMM[4], @XMM[9] + vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])` + veor @XMM[15], @XMM[5], @XMM[9] + vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])` + veor @XMM[10], @XMM[6], @XMM[9] + vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])` + veor @XMM[11], @XMM[7], @XMM[9] + vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])` + vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])` +___ + &bitslice (@XMM[0..7, 8..11]); +$code.=<<___; + sub $rounds,$rounds,#1 + b .Ldec_sbox +.align 4 +.Ldec_loop: +___ + &ShiftRows (@XMM[0..7, 8..12]); +$code.=".Ldec_sbox:\n"; + &InvSbox (@XMM[0..7, 8..15]); +$code.=<<___; + subs $rounds,$rounds,#1 + bcc .Ldec_done +___ + &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]); +$code.=<<___; + vldmia $const, {@XMM[12]} @ .LISR + ite eq @ Thumb2 thing, sanity check in ARM + addeq $const,$const,#0x10 + bne .Ldec_loop + vldmia $const, {@XMM[12]} @ .LISRM0 + b .Ldec_loop +.align 4 +.Ldec_done: +___ + &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]); +$code.=<<___; + vldmia $key, {@XMM[8]} @ last round key + veor @XMM[6], @XMM[6], @XMM[8] + veor @XMM[4], @XMM[4], @XMM[8] + veor @XMM[2], @XMM[2], @XMM[8] + veor @XMM[7], @XMM[7], @XMM[8] + veor @XMM[3], @XMM[3], @XMM[8] + veor @XMM[5], @XMM[5], @XMM[8] + veor @XMM[0], @XMM[0], @XMM[8] + veor @XMM[1], @XMM[1], @XMM[8] + bx lr +.size _bsaes_decrypt8,.-_bsaes_decrypt8 + +.type _bsaes_const,%object +.align 6 +_bsaes_const: +.LM0ISR: @ InvShiftRows constants + .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 +.LISR: + .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 +.LISRM0: + .quad 0x01040b0e0205080f, 0x0306090c00070a0d +.LM0SR: @ ShiftRows constants + .quad 0x0a0e02060f03070b, 0x0004080c05090d01 +.LSR: + .quad 0x0504070600030201, 0x0f0e0d0c0a09080b +.LSRM0: + .quad 0x0304090e00050a0f, 0x01060b0c0207080d +.LM0: + .quad 0x02060a0e03070b0f, 0x0004080c0105090d +.LREVM0SR: + .quad 0x090d01050c000408, 0x03070b0f060a0e02 +.asciz "Bit-sliced AES for NEON, CRYPTOGAMS by " +.align 6 +.size _bsaes_const,.-_bsaes_const + +.type _bsaes_encrypt8,%function +.align 4 +_bsaes_encrypt8: + adr $const,_bsaes_encrypt8 + vldmia $key!, {@XMM[9]} @ round 0 key + sub $const,$const,#_bsaes_encrypt8-.LM0SR + + vldmia $const!, {@XMM[8]} @ .LM0SR +_bsaes_encrypt8_alt: + veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key + veor @XMM[11], @XMM[1], @XMM[9] + vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])` + veor @XMM[12], @XMM[2], @XMM[9] + vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])` + veor @XMM[13], @XMM[3], @XMM[9] + vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])` + veor @XMM[14], @XMM[4], @XMM[9] + vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])` + veor @XMM[15], @XMM[5], @XMM[9] + vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])` + veor @XMM[10], @XMM[6], @XMM[9] + vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])` + veor @XMM[11], @XMM[7], @XMM[9] + vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])` + vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])` + vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])` +_bsaes_encrypt8_bitslice: +___ + &bitslice (@XMM[0..7, 8..11]); +$code.=<<___; + sub $rounds,$rounds,#1 + b .Lenc_sbox +.align 4 +.Lenc_loop: +___ + &ShiftRows (@XMM[0..7, 8..12]); +$code.=".Lenc_sbox:\n"; + &Sbox (@XMM[0..7, 8..15]); +$code.=<<___; + subs $rounds,$rounds,#1 + bcc .Lenc_done +___ + &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); +$code.=<<___; + vldmia $const, {@XMM[12]} @ .LSR + ite eq @ Thumb2 thing, samity check in ARM + addeq $const,$const,#0x10 + bne .Lenc_loop + vldmia $const, {@XMM[12]} @ .LSRM0 + b .Lenc_loop +.align 4 +.Lenc_done: +___ + # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb + &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]); +$code.=<<___; + vldmia $key, {@XMM[8]} @ last round key + veor @XMM[4], @XMM[4], @XMM[8] + veor @XMM[6], @XMM[6], @XMM[8] + veor @XMM[3], @XMM[3], @XMM[8] + veor @XMM[7], @XMM[7], @XMM[8] + veor @XMM[2], @XMM[2], @XMM[8] + veor @XMM[5], @XMM[5], @XMM[8] + veor @XMM[0], @XMM[0], @XMM[8] + veor @XMM[1], @XMM[1], @XMM[8] + bx lr +.size _bsaes_encrypt8,.-_bsaes_encrypt8 +___ +} +{ +my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6"); + +sub bitslice_key { +my @x=reverse(@_[0..7]); +my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; + + &swapmove (@x[0,1],1,$bs0,$t2,$t3); +$code.=<<___; + @ &swapmove(@x[2,3],1,$t0,$t2,$t3); + vmov @x[2], @x[0] + vmov @x[3], @x[1] +___ + #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); + + &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3); +$code.=<<___; + @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); + vmov @x[4], @x[0] + vmov @x[6], @x[2] + vmov @x[5], @x[1] + vmov @x[7], @x[3] +___ + &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3); + &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3); +} + +$code.=<<___; +.type _bsaes_key_convert,%function +.align 4 +_bsaes_key_convert: + adr $const,_bsaes_key_convert + vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key + sub $const,$const,#_bsaes_key_convert-.LM0 + vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key + + vmov.i8 @XMM[8], #0x01 @ bit masks + vmov.i8 @XMM[9], #0x02 + vmov.i8 @XMM[10], #0x04 + vmov.i8 @XMM[11], #0x08 + vmov.i8 @XMM[12], #0x10 + vmov.i8 @XMM[13], #0x20 + vldmia $const, {@XMM[14]} @ .LM0 + +#ifdef __ARMEL__ + vrev32.8 @XMM[7], @XMM[7] + vrev32.8 @XMM[15], @XMM[15] +#endif + sub $rounds,$rounds,#1 + vstmia $out!, {@XMM[7]} @ save round 0 key + b .Lkey_loop + +.align 4 +.Lkey_loop: + vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])` + vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])` + vmov.i8 @XMM[6], #0x40 + vmov.i8 @XMM[15], #0x80 + + vtst.8 @XMM[0], @XMM[7], @XMM[8] + vtst.8 @XMM[1], @XMM[7], @XMM[9] + vtst.8 @XMM[2], @XMM[7], @XMM[10] + vtst.8 @XMM[3], @XMM[7], @XMM[11] + vtst.8 @XMM[4], @XMM[7], @XMM[12] + vtst.8 @XMM[5], @XMM[7], @XMM[13] + vtst.8 @XMM[6], @XMM[7], @XMM[6] + vtst.8 @XMM[7], @XMM[7], @XMM[15] + vld1.8 {@XMM[15]}, [$inp]! @ load next round key + vmvn @XMM[0], @XMM[0] @ "pnot" + vmvn @XMM[1], @XMM[1] + vmvn @XMM[5], @XMM[5] + vmvn @XMM[6], @XMM[6] +#ifdef __ARMEL__ + vrev32.8 @XMM[15], @XMM[15] +#endif + subs $rounds,$rounds,#1 + vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key + bne .Lkey_loop + + vmov.i8 @XMM[7],#0x63 @ compose .L63 + @ don't save last round key + bx lr +.size _bsaes_key_convert,.-_bsaes_key_convert +___ +} + +if (0) { # following four functions are unsupported interface + # used for benchmarking... +$code.=<<___; +.globl bsaes_enc_key_convert +.type bsaes_enc_key_convert,%function +.align 4 +bsaes_enc_key_convert: + stmdb sp!,{r4-r6,lr} + vstmdb sp!,{d8-d15} @ ABI specification says so + + ldr r5,[$inp,#240] @ pass rounds + mov r4,$inp @ pass key + mov r12,$out @ pass key schedule + bl _bsaes_key_convert + veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key + vstmia r12, {@XMM[7]} @ save last round key + + vldmia sp!,{d8-d15} + ldmia sp!,{r4-r6,pc} +.size bsaes_enc_key_convert,.-bsaes_enc_key_convert + +.globl bsaes_encrypt_128 +.type bsaes_encrypt_128,%function +.align 4 +bsaes_encrypt_128: + stmdb sp!,{r4-r6,lr} + vstmdb sp!,{d8-d15} @ ABI specification says so +.Lenc128_loop: + vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input + vld1.8 {@XMM[2]-@XMM[3]}, [$inp]! + mov r4,$key @ pass the key + vld1.8 {@XMM[4]-@XMM[5]}, [$inp]! + mov r5,#10 @ pass rounds + vld1.8 {@XMM[6]-@XMM[7]}, [$inp]! + + bl _bsaes_encrypt8 + + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output + vst1.8 {@XMM[4]}, [$out]! + vst1.8 {@XMM[6]}, [$out]! + vst1.8 {@XMM[3]}, [$out]! + vst1.8 {@XMM[7]}, [$out]! + vst1.8 {@XMM[2]}, [$out]! + subs $len,$len,#0x80 + vst1.8 {@XMM[5]}, [$out]! + bhi .Lenc128_loop + + vldmia sp!,{d8-d15} + ldmia sp!,{r4-r6,pc} +.size bsaes_encrypt_128,.-bsaes_encrypt_128 + +.globl bsaes_dec_key_convert +.type bsaes_dec_key_convert,%function +.align 4 +bsaes_dec_key_convert: + stmdb sp!,{r4-r6,lr} + vstmdb sp!,{d8-d15} @ ABI specification says so + + ldr r5,[$inp,#240] @ pass rounds + mov r4,$inp @ pass key + mov r12,$out @ pass key schedule + bl _bsaes_key_convert + vldmia $out, {@XMM[6]} + vstmia r12, {@XMM[15]} @ save last round key + veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key + vstmia $out, {@XMM[7]} + + vldmia sp!,{d8-d15} + ldmia sp!,{r4-r6,pc} +.size bsaes_dec_key_convert,.-bsaes_dec_key_convert + +.globl bsaes_decrypt_128 +.type bsaes_decrypt_128,%function +.align 4 +bsaes_decrypt_128: + stmdb sp!,{r4-r6,lr} + vstmdb sp!,{d8-d15} @ ABI specification says so +.Ldec128_loop: + vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input + vld1.8 {@XMM[2]-@XMM[3]}, [$inp]! + mov r4,$key @ pass the key + vld1.8 {@XMM[4]-@XMM[5]}, [$inp]! + mov r5,#10 @ pass rounds + vld1.8 {@XMM[6]-@XMM[7]}, [$inp]! + + bl _bsaes_decrypt8 + + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output + vst1.8 {@XMM[6]}, [$out]! + vst1.8 {@XMM[4]}, [$out]! + vst1.8 {@XMM[2]}, [$out]! + vst1.8 {@XMM[7]}, [$out]! + vst1.8 {@XMM[3]}, [$out]! + subs $len,$len,#0x80 + vst1.8 {@XMM[5]}, [$out]! + bhi .Ldec128_loop + + vldmia sp!,{d8-d15} + ldmia sp!,{r4-r6,pc} +.size bsaes_decrypt_128,.-bsaes_decrypt_128 +___ +} +{ +my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10)); +my ($keysched)=("sp"); + +$code.=<<___; +.extern AES_cbc_encrypt +.extern AES_decrypt + +.global bsaes_cbc_encrypt +.type bsaes_cbc_encrypt,%function +.align 5 +bsaes_cbc_encrypt: +#ifndef __KERNEL__ + cmp $len, #128 +#ifndef __thumb__ + blo AES_cbc_encrypt +#else + bhs 1f + b AES_cbc_encrypt +1: +#endif +#endif + + @ it is up to the caller to make sure we are called with enc == 0 + + mov ip, sp + stmdb sp!, {r4-r10, lr} + VFP_ABI_PUSH + ldr $ivp, [ip] @ IV is 1st arg on the stack + mov $len, $len, lsr#4 @ len in 16 byte blocks + sub sp, #0x10 @ scratch space to carry over the IV + mov $fp, sp @ save sp + + ldr $rounds, [$key, #240] @ get # of rounds +#ifndef BSAES_ASM_EXTENDED_KEY + @ allocate the key schedule on the stack + sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key + add r12, #`128-32` @ sifze of bit-slices key schedule + + @ populate the key schedule + mov r4, $key @ pass key + mov r5, $rounds @ pass # of rounds + mov sp, r12 @ sp is $keysched + bl _bsaes_key_convert + vldmia $keysched, {@XMM[6]} + vstmia r12, {@XMM[15]} @ save last round key + veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key + vstmia $keysched, {@XMM[7]} +#else + ldr r12, [$key, #244] + eors r12, #1 + beq 0f + + @ populate the key schedule + str r12, [$key, #244] + mov r4, $key @ pass key + mov r5, $rounds @ pass # of rounds + add r12, $key, #248 @ pass key schedule + bl _bsaes_key_convert + add r4, $key, #248 + vldmia r4, {@XMM[6]} + vstmia r12, {@XMM[15]} @ save last round key + veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key + vstmia r4, {@XMM[7]} + +.align 2 +0: +#endif + + vld1.8 {@XMM[15]}, [$ivp] @ load IV + b .Lcbc_dec_loop + +.align 4 +.Lcbc_dec_loop: + subs $len, $len, #0x8 + bmi .Lcbc_dec_loop_finish + + vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input + vld1.8 {@XMM[2]-@XMM[3]}, [$inp]! +#ifndef BSAES_ASM_EXTENDED_KEY + mov r4, $keysched @ pass the key +#else + add r4, $key, #248 +#endif + vld1.8 {@XMM[4]-@XMM[5]}, [$inp]! + mov r5, $rounds + vld1.8 {@XMM[6]-@XMM[7]}, [$inp] + sub $inp, $inp, #0x60 + vstmia $fp, {@XMM[15]} @ put aside IV + + bl _bsaes_decrypt8 + + vldmia $fp, {@XMM[14]} @ reload IV + vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input + veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV + vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! + veor @XMM[1], @XMM[1], @XMM[8] + veor @XMM[6], @XMM[6], @XMM[9] + vld1.8 {@XMM[12]-@XMM[13]}, [$inp]! + veor @XMM[4], @XMM[4], @XMM[10] + veor @XMM[2], @XMM[2], @XMM[11] + vld1.8 {@XMM[14]-@XMM[15]}, [$inp]! + veor @XMM[7], @XMM[7], @XMM[12] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output + veor @XMM[3], @XMM[3], @XMM[13] + vst1.8 {@XMM[6]}, [$out]! + veor @XMM[5], @XMM[5], @XMM[14] + vst1.8 {@XMM[4]}, [$out]! + vst1.8 {@XMM[2]}, [$out]! + vst1.8 {@XMM[7]}, [$out]! + vst1.8 {@XMM[3]}, [$out]! + vst1.8 {@XMM[5]}, [$out]! + + b .Lcbc_dec_loop + +.Lcbc_dec_loop_finish: + adds $len, $len, #8 + beq .Lcbc_dec_done + + vld1.8 {@XMM[0]}, [$inp]! @ load input + cmp $len, #2 + blo .Lcbc_dec_one + vld1.8 {@XMM[1]}, [$inp]! +#ifndef BSAES_ASM_EXTENDED_KEY + mov r4, $keysched @ pass the key +#else + add r4, $key, #248 +#endif + mov r5, $rounds + vstmia $fp, {@XMM[15]} @ put aside IV + beq .Lcbc_dec_two + vld1.8 {@XMM[2]}, [$inp]! + cmp $len, #4 + blo .Lcbc_dec_three + vld1.8 {@XMM[3]}, [$inp]! + beq .Lcbc_dec_four + vld1.8 {@XMM[4]}, [$inp]! + cmp $len, #6 + blo .Lcbc_dec_five + vld1.8 {@XMM[5]}, [$inp]! + beq .Lcbc_dec_six + vld1.8 {@XMM[6]}, [$inp]! + sub $inp, $inp, #0x70 + + bl _bsaes_decrypt8 + + vldmia $fp, {@XMM[14]} @ reload IV + vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input + veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV + vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! + veor @XMM[1], @XMM[1], @XMM[8] + veor @XMM[6], @XMM[6], @XMM[9] + vld1.8 {@XMM[12]-@XMM[13]}, [$inp]! + veor @XMM[4], @XMM[4], @XMM[10] + veor @XMM[2], @XMM[2], @XMM[11] + vld1.8 {@XMM[15]}, [$inp]! + veor @XMM[7], @XMM[7], @XMM[12] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output + veor @XMM[3], @XMM[3], @XMM[13] + vst1.8 {@XMM[6]}, [$out]! + vst1.8 {@XMM[4]}, [$out]! + vst1.8 {@XMM[2]}, [$out]! + vst1.8 {@XMM[7]}, [$out]! + vst1.8 {@XMM[3]}, [$out]! + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_six: + sub $inp, $inp, #0x60 + bl _bsaes_decrypt8 + vldmia $fp,{@XMM[14]} @ reload IV + vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input + veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV + vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! + veor @XMM[1], @XMM[1], @XMM[8] + veor @XMM[6], @XMM[6], @XMM[9] + vld1.8 {@XMM[12]}, [$inp]! + veor @XMM[4], @XMM[4], @XMM[10] + veor @XMM[2], @XMM[2], @XMM[11] + vld1.8 {@XMM[15]}, [$inp]! + veor @XMM[7], @XMM[7], @XMM[12] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output + vst1.8 {@XMM[6]}, [$out]! + vst1.8 {@XMM[4]}, [$out]! + vst1.8 {@XMM[2]}, [$out]! + vst1.8 {@XMM[7]}, [$out]! + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_five: + sub $inp, $inp, #0x50 + bl _bsaes_decrypt8 + vldmia $fp, {@XMM[14]} @ reload IV + vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input + veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV + vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! + veor @XMM[1], @XMM[1], @XMM[8] + veor @XMM[6], @XMM[6], @XMM[9] + vld1.8 {@XMM[15]}, [$inp]! + veor @XMM[4], @XMM[4], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output + veor @XMM[2], @XMM[2], @XMM[11] + vst1.8 {@XMM[6]}, [$out]! + vst1.8 {@XMM[4]}, [$out]! + vst1.8 {@XMM[2]}, [$out]! + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_four: + sub $inp, $inp, #0x40 + bl _bsaes_decrypt8 + vldmia $fp, {@XMM[14]} @ reload IV + vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input + veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV + vld1.8 {@XMM[10]}, [$inp]! + veor @XMM[1], @XMM[1], @XMM[8] + veor @XMM[6], @XMM[6], @XMM[9] + vld1.8 {@XMM[15]}, [$inp]! + veor @XMM[4], @XMM[4], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output + vst1.8 {@XMM[6]}, [$out]! + vst1.8 {@XMM[4]}, [$out]! + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_three: + sub $inp, $inp, #0x30 + bl _bsaes_decrypt8 + vldmia $fp, {@XMM[14]} @ reload IV + vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ reload input + veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV + vld1.8 {@XMM[15]}, [$inp]! + veor @XMM[1], @XMM[1], @XMM[8] + veor @XMM[6], @XMM[6], @XMM[9] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output + vst1.8 {@XMM[6]}, [$out]! + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_two: + sub $inp, $inp, #0x20 + bl _bsaes_decrypt8 + vldmia $fp, {@XMM[14]} @ reload IV + vld1.8 {@XMM[8]}, [$inp]! @ reload input + veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV + vld1.8 {@XMM[15]}, [$inp]! @ reload input + veor @XMM[1], @XMM[1], @XMM[8] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_one: + sub $inp, $inp, #0x10 + mov $rounds, $out @ save original out pointer + mov $out, $fp @ use the iv scratch space as out buffer + mov r2, $key + vmov @XMM[4],@XMM[15] @ just in case ensure that IV + vmov @XMM[5],@XMM[0] @ and input are preserved + bl AES_decrypt + vld1.8 {@XMM[0]}, [$fp,:64] @ load result + veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV + vmov @XMM[15], @XMM[5] @ @XMM[5] holds input + vst1.8 {@XMM[0]}, [$rounds] @ write output + +.Lcbc_dec_done: +#ifndef BSAES_ASM_EXTENDED_KEY + vmov.i32 q0, #0 + vmov.i32 q1, #0 +.Lcbc_dec_bzero: @ wipe key schedule [if any] + vstmia $keysched!, {q0-q1} + cmp $keysched, $fp + bne .Lcbc_dec_bzero +#endif + + mov sp, $fp + add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb + vst1.8 {@XMM[15]}, [$ivp] @ return IV + VFP_ABI_POP + ldmia sp!, {r4-r10, pc} +.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt +___ +} +{ +my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10))); +my $const = "r6"; # shared with _bsaes_encrypt8_alt +my $keysched = "sp"; + +$code.=<<___; +.extern AES_encrypt +.global bsaes_ctr32_encrypt_blocks +.type bsaes_ctr32_encrypt_blocks,%function +.align 5 +bsaes_ctr32_encrypt_blocks: + cmp $len, #8 @ use plain AES for + blo .Lctr_enc_short @ small sizes + + mov ip, sp + stmdb sp!, {r4-r10, lr} + VFP_ABI_PUSH + ldr $ctr, [ip] @ ctr is 1st arg on the stack + sub sp, sp, #0x10 @ scratch space to carry over the ctr + mov $fp, sp @ save sp + + ldr $rounds, [$key, #240] @ get # of rounds +#ifndef BSAES_ASM_EXTENDED_KEY + @ allocate the key schedule on the stack + sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key + add r12, #`128-32` @ size of bit-sliced key schedule + + @ populate the key schedule + mov r4, $key @ pass key + mov r5, $rounds @ pass # of rounds + mov sp, r12 @ sp is $keysched + bl _bsaes_key_convert + veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key + vstmia r12, {@XMM[7]} @ save last round key + + vld1.8 {@XMM[0]}, [$ctr] @ load counter + add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr + vldmia $keysched, {@XMM[4]} @ load round0 key +#else + ldr r12, [$key, #244] + eors r12, #1 + beq 0f + + @ populate the key schedule + str r12, [$key, #244] + mov r4, $key @ pass key + mov r5, $rounds @ pass # of rounds + add r12, $key, #248 @ pass key schedule + bl _bsaes_key_convert + veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key + vstmia r12, {@XMM[7]} @ save last round key + +.align 2 +0: add r12, $key, #248 + vld1.8 {@XMM[0]}, [$ctr] @ load counter + adrl $ctr, .LREVM0SR @ borrow $ctr + vldmia r12, {@XMM[4]} @ load round0 key + sub sp, #0x10 @ place for adjusted round0 key +#endif + + vmov.i32 @XMM[8],#1 @ compose 1<<96 + veor @XMM[9],@XMM[9],@XMM[9] + vrev32.8 @XMM[0],@XMM[0] + vext.8 @XMM[8],@XMM[9],@XMM[8],#4 + vrev32.8 @XMM[4],@XMM[4] + vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96 + vstmia $keysched, {@XMM[4]} @ save adjusted round0 key + b .Lctr_enc_loop + +.align 4 +.Lctr_enc_loop: + vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ compose 3<<96 + vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1 + vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2 + vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3 + vadd.u32 @XMM[4], @XMM[1], @XMM[10] + vadd.u32 @XMM[5], @XMM[2], @XMM[10] + vadd.u32 @XMM[6], @XMM[3], @XMM[10] + vadd.u32 @XMM[7], @XMM[4], @XMM[10] + vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter + + @ Borrow prologue from _bsaes_encrypt8 to use the opportunity + @ to flip byte order in 32-bit counter + + vldmia $keysched, {@XMM[9]} @ load round0 key +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, $keysched, #0x10 @ pass next round key +#else + add r4, $key, #`248+16` +#endif + vldmia $ctr, {@XMM[8]} @ .LREVM0SR + mov r5, $rounds @ pass rounds + vstmia $fp, {@XMM[10]} @ save next counter + sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants + + bl _bsaes_encrypt8_alt + + subs $len, $len, #8 + blo .Lctr_enc_loop_done + + vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input + vld1.8 {@XMM[10]-@XMM[11]}, [$inp]! + veor @XMM[0], @XMM[8] + veor @XMM[1], @XMM[9] + vld1.8 {@XMM[12]-@XMM[13]}, [$inp]! + veor @XMM[4], @XMM[10] + veor @XMM[6], @XMM[11] + vld1.8 {@XMM[14]-@XMM[15]}, [$inp]! + veor @XMM[3], @XMM[12] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output + veor @XMM[7], @XMM[13] + veor @XMM[2], @XMM[14] + vst1.8 {@XMM[4]}, [$out]! + veor @XMM[5], @XMM[15] + vst1.8 {@XMM[6]}, [$out]! + vmov.i32 @XMM[8], #1 @ compose 1<<96 + vst1.8 {@XMM[3]}, [$out]! + veor @XMM[9], @XMM[9], @XMM[9] + vst1.8 {@XMM[7]}, [$out]! + vext.8 @XMM[8], @XMM[9], @XMM[8], #4 + vst1.8 {@XMM[2]}, [$out]! + vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96 + vst1.8 {@XMM[5]}, [$out]! + vldmia $fp, {@XMM[0]} @ load counter + + bne .Lctr_enc_loop + b .Lctr_enc_done + +.align 4 +.Lctr_enc_loop_done: + add $len, $len, #8 + vld1.8 {@XMM[8]}, [$inp]! @ load input + veor @XMM[0], @XMM[8] + vst1.8 {@XMM[0]}, [$out]! @ write output + cmp $len, #2 + blo .Lctr_enc_done + vld1.8 {@XMM[9]}, [$inp]! + veor @XMM[1], @XMM[9] + vst1.8 {@XMM[1]}, [$out]! + beq .Lctr_enc_done + vld1.8 {@XMM[10]}, [$inp]! + veor @XMM[4], @XMM[10] + vst1.8 {@XMM[4]}, [$out]! + cmp $len, #4 + blo .Lctr_enc_done + vld1.8 {@XMM[11]}, [$inp]! + veor @XMM[6], @XMM[11] + vst1.8 {@XMM[6]}, [$out]! + beq .Lctr_enc_done + vld1.8 {@XMM[12]}, [$inp]! + veor @XMM[3], @XMM[12] + vst1.8 {@XMM[3]}, [$out]! + cmp $len, #6 + blo .Lctr_enc_done + vld1.8 {@XMM[13]}, [$inp]! + veor @XMM[7], @XMM[13] + vst1.8 {@XMM[7]}, [$out]! + beq .Lctr_enc_done + vld1.8 {@XMM[14]}, [$inp] + veor @XMM[2], @XMM[14] + vst1.8 {@XMM[2]}, [$out]! + +.Lctr_enc_done: + vmov.i32 q0, #0 + vmov.i32 q1, #0 +#ifndef BSAES_ASM_EXTENDED_KEY +.Lctr_enc_bzero: @ wipe key schedule [if any] + vstmia $keysched!, {q0-q1} + cmp $keysched, $fp + bne .Lctr_enc_bzero +#else + vstmia $keysched, {q0-q1} +#endif + + mov sp, $fp + add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb + VFP_ABI_POP + ldmia sp!, {r4-r10, pc} @ return + +.align 4 +.Lctr_enc_short: + ldr ip, [sp] @ ctr pointer is passed on stack + stmdb sp!, {r4-r8, lr} + + mov r4, $inp @ copy arguments + mov r5, $out + mov r6, $len + mov r7, $key + ldr r8, [ip, #12] @ load counter LSW + vld1.8 {@XMM[1]}, [ip] @ load whole counter value +#ifdef __ARMEL__ + rev r8, r8 +#endif + sub sp, sp, #0x10 + vst1.8 {@XMM[1]}, [sp,:64] @ copy counter value + sub sp, sp, #0x10 + +.Lctr_enc_short_loop: + add r0, sp, #0x10 @ input counter value + mov r1, sp @ output on the stack + mov r2, r7 @ key + + bl AES_encrypt + + vld1.8 {@XMM[0]}, [r4]! @ load input + vld1.8 {@XMM[1]}, [sp,:64] @ load encrypted counter + add r8, r8, #1 +#ifdef __ARMEL__ + rev r0, r8 + str r0, [sp, #0x1c] @ next counter value +#else + str r8, [sp, #0x1c] @ next counter value +#endif + veor @XMM[0],@XMM[0],@XMM[1] + vst1.8 {@XMM[0]}, [r5]! @ store output + subs r6, r6, #1 + bne .Lctr_enc_short_loop + + vmov.i32 q0, #0 + vmov.i32 q1, #0 + vstmia sp!, {q0-q1} + + ldmia sp!, {r4-r8, pc} +.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks +___ +} +{ +###################################################################### +# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len, +# const AES_KEY *key1, const AES_KEY *key2, +# const unsigned char iv[16]); +# +my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3))); +my $const="r6"; # returned by _bsaes_key_convert +my $twmask=@XMM[5]; +my @T=@XMM[6..7]; + +$code.=<<___; +.globl bsaes_xts_encrypt +.type bsaes_xts_encrypt,%function +.align 4 +bsaes_xts_encrypt: + mov ip, sp + stmdb sp!, {r4-r10, lr} @ 0x20 + VFP_ABI_PUSH + mov r6, sp @ future $fp + + mov $inp, r0 + mov $out, r1 + mov $len, r2 + mov $key, r3 + + sub r0, sp, #0x10 @ 0x10 + bic r0, #0xf @ align at 16 bytes + mov sp, r0 + +#ifdef XTS_CHAIN_TWEAK + ldr r0, [ip] @ pointer to input tweak +#else + @ generate initial tweak + ldr r0, [ip, #4] @ iv[] + mov r1, sp + ldr r2, [ip, #0] @ key2 + bl AES_encrypt + mov r0,sp @ pointer to initial tweak +#endif + + ldr $rounds, [$key, #240] @ get # of rounds + mov $fp, r6 +#ifndef BSAES_ASM_EXTENDED_KEY + @ allocate the key schedule on the stack + sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key + @ add r12, #`128-32` @ size of bit-sliced key schedule + sub r12, #`32+16` @ place for tweak[9] + + @ populate the key schedule + mov r4, $key @ pass key + mov r5, $rounds @ pass # of rounds + mov sp, r12 + add r12, #0x90 @ pass key schedule + bl _bsaes_key_convert + veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key + vstmia r12, {@XMM[7]} @ save last round key +#else + ldr r12, [$key, #244] + eors r12, #1 + beq 0f + + str r12, [$key, #244] + mov r4, $key @ pass key + mov r5, $rounds @ pass # of rounds + add r12, $key, #248 @ pass key schedule + bl _bsaes_key_convert + veor @XMM[7], @XMM[7], @XMM[15] @ fix up last round key + vstmia r12, {@XMM[7]} + +.align 2 +0: sub sp, #0x90 @ place for tweak[9] +#endif + + vld1.8 {@XMM[8]}, [r0] @ initial tweak + adr $magic, .Lxts_magic + + subs $len, #0x80 + blo .Lxts_enc_short + b .Lxts_enc_loop + +.align 4 +.Lxts_enc_loop: + vldmia $magic, {$twmask} @ load XTS magic + vshr.s64 @T[0], @XMM[8], #63 + mov r0, sp + vand @T[0], @T[0], $twmask +___ +for($i=9;$i<16;$i++) { +$code.=<<___; + vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1] + vst1.64 {@XMM[$i-1]}, [r0,:128]! + vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")` + vshr.s64 @T[1], @XMM[$i], #63 + veor @XMM[$i], @XMM[$i], @T[0] + vand @T[1], @T[1], $twmask +___ + @T=reverse(@T); + +$code.=<<___ if ($i>=10); + vld1.8 {@XMM[$i-10]}, [$inp]! +___ +$code.=<<___ if ($i>=11); + veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3] +___ +} +$code.=<<___; + vadd.u64 @XMM[8], @XMM[15], @XMM[15] + vst1.64 {@XMM[15]}, [r0,:128]! + vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")` + veor @XMM[8], @XMM[8], @T[0] + vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak + + vld1.8 {@XMM[6]-@XMM[7]}, [$inp]! + veor @XMM[5], @XMM[5], @XMM[13] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[6], @XMM[6], @XMM[14] + mov r5, $rounds @ pass rounds + veor @XMM[7], @XMM[7], @XMM[15] + mov r0, sp + + bl _bsaes_encrypt8 + + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]! + veor @XMM[1], @XMM[1], @XMM[ 9] + veor @XMM[8], @XMM[4], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + veor @XMM[9], @XMM[6], @XMM[11] + vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]! + veor @XMM[10], @XMM[3], @XMM[12] + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! + veor @XMM[11], @XMM[7], @XMM[13] + veor @XMM[12], @XMM[2], @XMM[14] + vst1.8 {@XMM[10]-@XMM[11]}, [$out]! + veor @XMM[13], @XMM[5], @XMM[15] + vst1.8 {@XMM[12]-@XMM[13]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + + subs $len, #0x80 + bpl .Lxts_enc_loop + +.Lxts_enc_short: + adds $len, #0x70 + bmi .Lxts_enc_done + + vldmia $magic, {$twmask} @ load XTS magic + vshr.s64 @T[0], @XMM[8], #63 + mov r0, sp + vand @T[0], @T[0], $twmask +___ +for($i=9;$i<16;$i++) { +$code.=<<___; + vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1] + vst1.64 {@XMM[$i-1]}, [r0,:128]! + vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")` + vshr.s64 @T[1], @XMM[$i], #63 + veor @XMM[$i], @XMM[$i], @T[0] + vand @T[1], @T[1], $twmask +___ + @T=reverse(@T); + +$code.=<<___ if ($i>=10); + vld1.8 {@XMM[$i-10]}, [$inp]! + subs $len, #0x10 + bmi .Lxts_enc_`$i-9` +___ +$code.=<<___ if ($i>=11); + veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3] +___ +} +$code.=<<___; + sub $len, #0x10 + vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak + + vld1.8 {@XMM[6]}, [$inp]! + veor @XMM[5], @XMM[5], @XMM[13] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[6], @XMM[6], @XMM[14] + mov r5, $rounds @ pass rounds + mov r0, sp + + bl _bsaes_encrypt8 + + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]! + veor @XMM[1], @XMM[1], @XMM[ 9] + veor @XMM[8], @XMM[4], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + veor @XMM[9], @XMM[6], @XMM[11] + vld1.64 {@XMM[14]}, [r0,:128]! + veor @XMM[10], @XMM[3], @XMM[12] + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! + veor @XMM[11], @XMM[7], @XMM[13] + veor @XMM[12], @XMM[2], @XMM[14] + vst1.8 {@XMM[10]-@XMM[11]}, [$out]! + vst1.8 {@XMM[12]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + b .Lxts_enc_done +.align 4 +.Lxts_enc_6: + vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak + + veor @XMM[4], @XMM[4], @XMM[12] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[5], @XMM[5], @XMM[13] + mov r5, $rounds @ pass rounds + mov r0, sp + + bl _bsaes_encrypt8 + + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]! + veor @XMM[1], @XMM[1], @XMM[ 9] + veor @XMM[8], @XMM[4], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + veor @XMM[9], @XMM[6], @XMM[11] + veor @XMM[10], @XMM[3], @XMM[12] + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! + veor @XMM[11], @XMM[7], @XMM[13] + vst1.8 {@XMM[10]-@XMM[11]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + b .Lxts_enc_done + +@ put this in range for both ARM and Thumb mode adr instructions +.align 5 +.Lxts_magic: + .quad 1, 0x87 + +.align 5 +.Lxts_enc_5: + vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak + + veor @XMM[3], @XMM[3], @XMM[11] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[4], @XMM[4], @XMM[12] + mov r5, $rounds @ pass rounds + mov r0, sp + + bl _bsaes_encrypt8 + + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + vld1.64 {@XMM[12]}, [r0,:128]! + veor @XMM[1], @XMM[1], @XMM[ 9] + veor @XMM[8], @XMM[4], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + veor @XMM[9], @XMM[6], @XMM[11] + veor @XMM[10], @XMM[3], @XMM[12] + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! + vst1.8 {@XMM[10]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + b .Lxts_enc_done +.align 4 +.Lxts_enc_4: + vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak + + veor @XMM[2], @XMM[2], @XMM[10] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[3], @XMM[3], @XMM[11] + mov r5, $rounds @ pass rounds + mov r0, sp + + bl _bsaes_encrypt8 + + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + veor @XMM[1], @XMM[1], @XMM[ 9] + veor @XMM[8], @XMM[4], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + veor @XMM[9], @XMM[6], @XMM[11] + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + b .Lxts_enc_done +.align 4 +.Lxts_enc_3: + vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak + + veor @XMM[1], @XMM[1], @XMM[9] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[2], @XMM[2], @XMM[10] + mov r5, $rounds @ pass rounds + mov r0, sp + + bl _bsaes_encrypt8 + + vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]! + vld1.64 {@XMM[10]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + veor @XMM[1], @XMM[1], @XMM[ 9] + veor @XMM[8], @XMM[4], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + vst1.8 {@XMM[8]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + b .Lxts_enc_done +.align 4 +.Lxts_enc_2: + vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak + + veor @XMM[0], @XMM[0], @XMM[8] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[1], @XMM[1], @XMM[9] + mov r5, $rounds @ pass rounds + mov r0, sp + + bl _bsaes_encrypt8 + + vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + veor @XMM[1], @XMM[1], @XMM[ 9] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + b .Lxts_enc_done +.align 4 +.Lxts_enc_1: + mov r0, sp + veor @XMM[0], @XMM[8] + mov r1, sp + vst1.8 {@XMM[0]}, [sp,:128] + mov r2, $key + mov r4, $fp @ preserve fp + + bl AES_encrypt + + vld1.8 {@XMM[0]}, [sp,:128] + veor @XMM[0], @XMM[0], @XMM[8] + vst1.8 {@XMM[0]}, [$out]! + mov $fp, r4 + + vmov @XMM[8], @XMM[9] @ next round tweak + +.Lxts_enc_done: +#ifndef XTS_CHAIN_TWEAK + adds $len, #0x10 + beq .Lxts_enc_ret + sub r6, $out, #0x10 + +.Lxts_enc_steal: + ldrb r0, [$inp], #1 + ldrb r1, [$out, #-0x10] + strb r0, [$out, #-0x10] + strb r1, [$out], #1 + + subs $len, #1 + bhi .Lxts_enc_steal + + vld1.8 {@XMM[0]}, [r6] + mov r0, sp + veor @XMM[0], @XMM[0], @XMM[8] + mov r1, sp + vst1.8 {@XMM[0]}, [sp,:128] + mov r2, $key + mov r4, $fp @ preserve fp + + bl AES_encrypt + + vld1.8 {@XMM[0]}, [sp,:128] + veor @XMM[0], @XMM[0], @XMM[8] + vst1.8 {@XMM[0]}, [r6] + mov $fp, r4 +#endif + +.Lxts_enc_ret: + bic r0, $fp, #0xf + vmov.i32 q0, #0 + vmov.i32 q1, #0 +#ifdef XTS_CHAIN_TWEAK + ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak +#endif +.Lxts_enc_bzero: @ wipe key schedule [if any] + vstmia sp!, {q0-q1} + cmp sp, r0 + bne .Lxts_enc_bzero + + mov sp, $fp +#ifdef XTS_CHAIN_TWEAK + vst1.8 {@XMM[8]}, [r1] +#endif + VFP_ABI_POP + ldmia sp!, {r4-r10, pc} @ return + +.size bsaes_xts_encrypt,.-bsaes_xts_encrypt + +.globl bsaes_xts_decrypt +.type bsaes_xts_decrypt,%function +.align 4 +bsaes_xts_decrypt: + mov ip, sp + stmdb sp!, {r4-r10, lr} @ 0x20 + VFP_ABI_PUSH + mov r6, sp @ future $fp + + mov $inp, r0 + mov $out, r1 + mov $len, r2 + mov $key, r3 + + sub r0, sp, #0x10 @ 0x10 + bic r0, #0xf @ align at 16 bytes + mov sp, r0 + +#ifdef XTS_CHAIN_TWEAK + ldr r0, [ip] @ pointer to input tweak +#else + @ generate initial tweak + ldr r0, [ip, #4] @ iv[] + mov r1, sp + ldr r2, [ip, #0] @ key2 + bl AES_encrypt + mov r0, sp @ pointer to initial tweak +#endif + + ldr $rounds, [$key, #240] @ get # of rounds + mov $fp, r6 +#ifndef BSAES_ASM_EXTENDED_KEY + @ allocate the key schedule on the stack + sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key + @ add r12, #`128-32` @ size of bit-sliced key schedule + sub r12, #`32+16` @ place for tweak[9] + + @ populate the key schedule + mov r4, $key @ pass key + mov r5, $rounds @ pass # of rounds + mov sp, r12 + add r12, #0x90 @ pass key schedule + bl _bsaes_key_convert + add r4, sp, #0x90 + vldmia r4, {@XMM[6]} + vstmia r12, {@XMM[15]} @ save last round key + veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key + vstmia r4, {@XMM[7]} +#else + ldr r12, [$key, #244] + eors r12, #1 + beq 0f + + str r12, [$key, #244] + mov r4, $key @ pass key + mov r5, $rounds @ pass # of rounds + add r12, $key, #248 @ pass key schedule + bl _bsaes_key_convert + add r4, $key, #248 + vldmia r4, {@XMM[6]} + vstmia r12, {@XMM[15]} @ save last round key + veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key + vstmia r4, {@XMM[7]} + +.align 2 +0: sub sp, #0x90 @ place for tweak[9] +#endif + vld1.8 {@XMM[8]}, [r0] @ initial tweak + adr $magic, .Lxts_magic + + tst $len, #0xf @ if not multiple of 16 + it ne @ Thumb2 thing, sanity check in ARM + subne $len, #0x10 @ subtract another 16 bytes + subs $len, #0x80 + + blo .Lxts_dec_short + b .Lxts_dec_loop + +.align 4 +.Lxts_dec_loop: + vldmia $magic, {$twmask} @ load XTS magic + vshr.s64 @T[0], @XMM[8], #63 + mov r0, sp + vand @T[0], @T[0], $twmask +___ +for($i=9;$i<16;$i++) { +$code.=<<___; + vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1] + vst1.64 {@XMM[$i-1]}, [r0,:128]! + vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")` + vshr.s64 @T[1], @XMM[$i], #63 + veor @XMM[$i], @XMM[$i], @T[0] + vand @T[1], @T[1], $twmask +___ + @T=reverse(@T); + +$code.=<<___ if ($i>=10); + vld1.8 {@XMM[$i-10]}, [$inp]! +___ +$code.=<<___ if ($i>=11); + veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3] +___ +} +$code.=<<___; + vadd.u64 @XMM[8], @XMM[15], @XMM[15] + vst1.64 {@XMM[15]}, [r0,:128]! + vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")` + veor @XMM[8], @XMM[8], @T[0] + vst1.64 {@XMM[8]}, [r0,:128] @ next round tweak + + vld1.8 {@XMM[6]-@XMM[7]}, [$inp]! + veor @XMM[5], @XMM[5], @XMM[13] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[6], @XMM[6], @XMM[14] + mov r5, $rounds @ pass rounds + veor @XMM[7], @XMM[7], @XMM[15] + mov r0, sp + + bl _bsaes_decrypt8 + + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]! + veor @XMM[1], @XMM[1], @XMM[ 9] + veor @XMM[8], @XMM[6], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + veor @XMM[9], @XMM[4], @XMM[11] + vld1.64 {@XMM[14]-@XMM[15]}, [r0,:128]! + veor @XMM[10], @XMM[2], @XMM[12] + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! + veor @XMM[11], @XMM[7], @XMM[13] + veor @XMM[12], @XMM[3], @XMM[14] + vst1.8 {@XMM[10]-@XMM[11]}, [$out]! + veor @XMM[13], @XMM[5], @XMM[15] + vst1.8 {@XMM[12]-@XMM[13]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + + subs $len, #0x80 + bpl .Lxts_dec_loop + +.Lxts_dec_short: + adds $len, #0x70 + bmi .Lxts_dec_done + + vldmia $magic, {$twmask} @ load XTS magic + vshr.s64 @T[0], @XMM[8], #63 + mov r0, sp + vand @T[0], @T[0], $twmask +___ +for($i=9;$i<16;$i++) { +$code.=<<___; + vadd.u64 @XMM[$i], @XMM[$i-1], @XMM[$i-1] + vst1.64 {@XMM[$i-1]}, [r0,:128]! + vswp `&Dhi("@T[0]")`,`&Dlo("@T[0]")` + vshr.s64 @T[1], @XMM[$i], #63 + veor @XMM[$i], @XMM[$i], @T[0] + vand @T[1], @T[1], $twmask +___ + @T=reverse(@T); + +$code.=<<___ if ($i>=10); + vld1.8 {@XMM[$i-10]}, [$inp]! + subs $len, #0x10 + bmi .Lxts_dec_`$i-9` +___ +$code.=<<___ if ($i>=11); + veor @XMM[$i-11], @XMM[$i-11], @XMM[$i-3] +___ +} +$code.=<<___; + sub $len, #0x10 + vst1.64 {@XMM[15]}, [r0,:128] @ next round tweak + + vld1.8 {@XMM[6]}, [$inp]! + veor @XMM[5], @XMM[5], @XMM[13] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[6], @XMM[6], @XMM[14] + mov r5, $rounds @ pass rounds + mov r0, sp + + bl _bsaes_decrypt8 + + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]! + veor @XMM[1], @XMM[1], @XMM[ 9] + veor @XMM[8], @XMM[6], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + veor @XMM[9], @XMM[4], @XMM[11] + vld1.64 {@XMM[14]}, [r0,:128]! + veor @XMM[10], @XMM[2], @XMM[12] + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! + veor @XMM[11], @XMM[7], @XMM[13] + veor @XMM[12], @XMM[3], @XMM[14] + vst1.8 {@XMM[10]-@XMM[11]}, [$out]! + vst1.8 {@XMM[12]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + b .Lxts_dec_done +.align 4 +.Lxts_dec_6: + vst1.64 {@XMM[14]}, [r0,:128] @ next round tweak + + veor @XMM[4], @XMM[4], @XMM[12] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[5], @XMM[5], @XMM[13] + mov r5, $rounds @ pass rounds + mov r0, sp + + bl _bsaes_decrypt8 + + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + vld1.64 {@XMM[12]-@XMM[13]}, [r0,:128]! + veor @XMM[1], @XMM[1], @XMM[ 9] + veor @XMM[8], @XMM[6], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + veor @XMM[9], @XMM[4], @XMM[11] + veor @XMM[10], @XMM[2], @XMM[12] + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! + veor @XMM[11], @XMM[7], @XMM[13] + vst1.8 {@XMM[10]-@XMM[11]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + b .Lxts_dec_done +.align 4 +.Lxts_dec_5: + vst1.64 {@XMM[13]}, [r0,:128] @ next round tweak + + veor @XMM[3], @XMM[3], @XMM[11] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[4], @XMM[4], @XMM[12] + mov r5, $rounds @ pass rounds + mov r0, sp + + bl _bsaes_decrypt8 + + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + vld1.64 {@XMM[12]}, [r0,:128]! + veor @XMM[1], @XMM[1], @XMM[ 9] + veor @XMM[8], @XMM[6], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + veor @XMM[9], @XMM[4], @XMM[11] + veor @XMM[10], @XMM[2], @XMM[12] + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! + vst1.8 {@XMM[10]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + b .Lxts_dec_done +.align 4 +.Lxts_dec_4: + vst1.64 {@XMM[12]}, [r0,:128] @ next round tweak + + veor @XMM[2], @XMM[2], @XMM[10] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[3], @XMM[3], @XMM[11] + mov r5, $rounds @ pass rounds + mov r0, sp + + bl _bsaes_decrypt8 + + vld1.64 {@XMM[ 8]-@XMM[ 9]}, [r0,:128]! + vld1.64 {@XMM[10]-@XMM[11]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + veor @XMM[1], @XMM[1], @XMM[ 9] + veor @XMM[8], @XMM[6], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + veor @XMM[9], @XMM[4], @XMM[11] + vst1.8 {@XMM[8]-@XMM[9]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + b .Lxts_dec_done +.align 4 +.Lxts_dec_3: + vst1.64 {@XMM[11]}, [r0,:128] @ next round tweak + + veor @XMM[1], @XMM[1], @XMM[9] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[2], @XMM[2], @XMM[10] + mov r5, $rounds @ pass rounds + mov r0, sp + + bl _bsaes_decrypt8 + + vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]! + vld1.64 {@XMM[10]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + veor @XMM[1], @XMM[1], @XMM[ 9] + veor @XMM[8], @XMM[6], @XMM[10] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + vst1.8 {@XMM[8]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + b .Lxts_dec_done +.align 4 +.Lxts_dec_2: + vst1.64 {@XMM[10]}, [r0,:128] @ next round tweak + + veor @XMM[0], @XMM[0], @XMM[8] +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x90 @ pass key schedule +#else + add r4, $key, #248 @ pass key schedule +#endif + veor @XMM[1], @XMM[1], @XMM[9] + mov r5, $rounds @ pass rounds + mov r0, sp + + bl _bsaes_decrypt8 + + vld1.64 {@XMM[8]-@XMM[9]}, [r0,:128]! + veor @XMM[0], @XMM[0], @XMM[ 8] + veor @XMM[1], @XMM[1], @XMM[ 9] + vst1.8 {@XMM[0]-@XMM[1]}, [$out]! + + vld1.64 {@XMM[8]}, [r0,:128] @ next round tweak + b .Lxts_dec_done +.align 4 +.Lxts_dec_1: + mov r0, sp + veor @XMM[0], @XMM[8] + mov r1, sp + vst1.8 {@XMM[0]}, [sp,:128] + mov r2, $key + mov r4, $fp @ preserve fp + mov r5, $magic @ preserve magic + + bl AES_decrypt + + vld1.8 {@XMM[0]}, [sp,:128] + veor @XMM[0], @XMM[0], @XMM[8] + vst1.8 {@XMM[0]}, [$out]! + mov $fp, r4 + mov $magic, r5 + + vmov @XMM[8], @XMM[9] @ next round tweak + +.Lxts_dec_done: +#ifndef XTS_CHAIN_TWEAK + adds $len, #0x10 + beq .Lxts_dec_ret + + @ calculate one round of extra tweak for the stolen ciphertext + vldmia $magic, {$twmask} + vshr.s64 @XMM[6], @XMM[8], #63 + vand @XMM[6], @XMM[6], $twmask + vadd.u64 @XMM[9], @XMM[8], @XMM[8] + vswp `&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")` + veor @XMM[9], @XMM[9], @XMM[6] + + @ perform the final decryption with the last tweak value + vld1.8 {@XMM[0]}, [$inp]! + mov r0, sp + veor @XMM[0], @XMM[0], @XMM[9] + mov r1, sp + vst1.8 {@XMM[0]}, [sp,:128] + mov r2, $key + mov r4, $fp @ preserve fp + + bl AES_decrypt + + vld1.8 {@XMM[0]}, [sp,:128] + veor @XMM[0], @XMM[0], @XMM[9] + vst1.8 {@XMM[0]}, [$out] + + mov r6, $out +.Lxts_dec_steal: + ldrb r1, [$out] + ldrb r0, [$inp], #1 + strb r1, [$out, #0x10] + strb r0, [$out], #1 + + subs $len, #1 + bhi .Lxts_dec_steal + + vld1.8 {@XMM[0]}, [r6] + mov r0, sp + veor @XMM[0], @XMM[8] + mov r1, sp + vst1.8 {@XMM[0]}, [sp,:128] + mov r2, $key + + bl AES_decrypt + + vld1.8 {@XMM[0]}, [sp,:128] + veor @XMM[0], @XMM[0], @XMM[8] + vst1.8 {@XMM[0]}, [r6] + mov $fp, r4 +#endif + +.Lxts_dec_ret: + bic r0, $fp, #0xf + vmov.i32 q0, #0 + vmov.i32 q1, #0 +#ifdef XTS_CHAIN_TWEAK + ldr r1, [$fp, #0x20+VFP_ABI_FRAME] @ chain tweak +#endif +.Lxts_dec_bzero: @ wipe key schedule [if any] + vstmia sp!, {q0-q1} + cmp sp, r0 + bne .Lxts_dec_bzero + + mov sp, $fp +#ifdef XTS_CHAIN_TWEAK + vst1.8 {@XMM[8]}, [r1] +#endif + VFP_ABI_POP + ldmia sp!, {r4-r10, pc} @ return + +.size bsaes_xts_decrypt,.-bsaes_xts_decrypt +___ +} +$code.=<<___; +#endif +___ + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + +print $code; + +close STDOUT; diff --git a/crypto/Kconfig b/crypto/Kconfig index 69ce573..71f337a 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -776,6 +776,22 @@ config CRYPTO_AES_ARM See for more information. +config CRYPTO_AES_ARM_BS + tristate "Bit sliced AES using NEON instructions" + depends on ARM && KERNEL_MODE_NEON + select CRYPTO_ALGAPI + select CRYPTO_AES_ARM + select CRYPTO_ABLK_HELPER + help + Use a faster and more secure NEON based implementation of AES in CBC, + CTR and XTS modes + + Bit sliced AES gives around 45% speedup on Cortex-A15 for CTR mode + and for XTS mode encryption, CBC and XTS mode decryption speedup is + around 25%. (CBC encryption speed is not affected by this driver.) + This implementation does not rely on any lookup tables so it is + believed to be invulnerable to cache timing attacks. + config CRYPTO_ANUBIS tristate "Anubis cipher algorithm" select CRYPTO_ALGAPI -- cgit v0.10.2 From 75c912a367fbdc07f0ef6f5384acd5028beb225e Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 7 Oct 2013 15:43:53 +0100 Subject: ARM: add .gitignore entry for aesbs-core.S This avoids this file being incorrectly added to git. Signed-off-by: Russell King diff --git a/arch/arm/crypto/.gitignore b/arch/arm/crypto/.gitignore new file mode 100644 index 0000000..6231d36 --- /dev/null +++ b/arch/arm/crypto/.gitignore @@ -0,0 +1 @@ +aesbs-core.S -- cgit v0.10.2 From 2dfcb802d6bd54a2353678c6434846d94b058f2c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 9 Oct 2013 13:51:29 +0100 Subject: ARM: perf: fix group validation for mixed software and hardware groups Since software events can always be scheduled, perf allows software and hardware events to be mixed together in the same event group. There are two ways in which this can come about: (1) A SW event is added to a HW group. This validates using the HW PMU of the group leader. (2) A HW event is added to a SW group. This inserts the SW events and the new HW event into a HW context, but the SW event remains the group leader. When validating the latter case, we would ideally compare the PMU of each event in the group with the relevant HW PMU. The problem is, in the face of potentially multiple HW PMUs, we don't have a handle on the relevant structure. Commit 7b9f72c62ed0 ("ARM: perf: clean up event group validation") attempting to resolve this issue, but actually made things *worse* by comparing with the leader PMU. If the leader is a SW event, then we automatically `pass' all the HW events during validation! This patch removes the check against the leader PMU. Whilst this will allow events from multiple HW PMUs to be grouped together, that should probably be dealt with in perf core as the result of a later patch. Acked-by: Mark Rutland Signed-off-by: Will Deacon diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c index e186ee1..bc3f2ef 100644 --- a/arch/arm/kernel/perf_event.c +++ b/arch/arm/kernel/perf_event.c @@ -256,12 +256,11 @@ validate_event(struct pmu_hw_events *hw_events, struct perf_event *event) { struct arm_pmu *armpmu = to_arm_pmu(event->pmu); - struct pmu *leader_pmu = event->group_leader->pmu; if (is_software_event(event)) return 1; - if (event->pmu != leader_pmu || event->state < PERF_EVENT_STATE_OFF) + if (event->state < PERF_EVENT_STATE_OFF) return 1; if (event->state == PERF_EVENT_STATE_OFF && !event->attr.enable_on_exec) -- cgit v0.10.2 From ca5a45c06cd4764fb8510740f7fc550d9a0208d4 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Wed, 31 Jul 2013 12:44:41 -0400 Subject: ARM: mm: use phys_addr_t appropriately in p2v and v2p conversions Fix remainder types used when converting back and forth between physical and virtual addresses. Cc: Russell King Acked-by: Nicolas Pitre Signed-off-by: Santosh Shilimkar diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index e750a93..c133bd9 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -185,22 +185,32 @@ extern unsigned long __pv_phys_offset; : "=r" (to) \ : "r" (from), "I" (type)) -static inline unsigned long __virt_to_phys(unsigned long x) +static inline phys_addr_t __virt_to_phys(unsigned long x) { unsigned long t; __pv_stub(x, t, "add", __PV_BITS_31_24); return t; } -static inline unsigned long __phys_to_virt(unsigned long x) +static inline unsigned long __phys_to_virt(phys_addr_t x) { unsigned long t; __pv_stub(x, t, "sub", __PV_BITS_31_24); return t; } + #else -#define __virt_to_phys(x) ((x) - PAGE_OFFSET + PHYS_OFFSET) -#define __phys_to_virt(x) ((x) - PHYS_OFFSET + PAGE_OFFSET) + +static inline phys_addr_t __virt_to_phys(unsigned long x) +{ + return (phys_addr_t)x - PAGE_OFFSET + PHYS_OFFSET; +} + +static inline unsigned long __phys_to_virt(phys_addr_t x) +{ + return x - PHYS_OFFSET + PAGE_OFFSET; +} + #endif #endif #endif /* __ASSEMBLY__ */ @@ -238,14 +248,14 @@ static inline phys_addr_t virt_to_phys(const volatile void *x) static inline void *phys_to_virt(phys_addr_t x) { - return (void *)(__phys_to_virt((unsigned long)(x))); + return (void *)__phys_to_virt(x); } /* * Drivers should NOT use these either. */ #define __pa(x) __virt_to_phys((unsigned long)(x)) -#define __va(x) ((void *)__phys_to_virt((unsigned long)(x))) +#define __va(x) ((void *)__phys_to_virt((phys_addr_t)(x))) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) /* -- cgit v0.10.2 From 4dc9a81715973cb137a14399420bb35b0ed7d6ef Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Wed, 31 Jul 2013 12:44:42 -0400 Subject: ARM: mm: Introduce virt_to_idmap() with an arch hook On some PAE systems (e.g. TI Keystone), memory is above the 32-bit addressable limit, and the interconnect provides an aliased view of parts of physical memory in the 32-bit addressable space. This alias is strictly for boot time usage, and is not otherwise usable because of coherency limitations. On such systems, the idmap mechanism needs to take this aliased mapping into account. This patch introduces virt_to_idmap() and a arch function pointer which can be populated by platform which needs it. Also populate necessary idmap spots with now available virt_to_idmap(). Avoided #ifdef approach to be compatible with multi-platform builds. Most architecture won't touch it and in that case virt_to_idmap() fall-back to existing virt_to_phys() macro. Cc: Russell King Acked-by: Nicolas Pitre Signed-off-by: Santosh Shilimkar diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index c133bd9..d9b96c65 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -173,6 +173,7 @@ */ #define __PV_BITS_31_24 0x81000000 +extern phys_addr_t (*arch_virt_to_idmap) (unsigned long x); extern unsigned long __pv_phys_offset; #define PHYS_OFFSET __pv_phys_offset @@ -259,6 +260,21 @@ static inline void *phys_to_virt(phys_addr_t x) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) /* + * These are for systems that have a hardware interconnect supported alias of + * physical memory for idmap purposes. Most cases should leave these + * untouched. + */ +static inline phys_addr_t __virt_to_idmap(unsigned long x) +{ + if (arch_virt_to_idmap) + return arch_virt_to_idmap(x); + else + return __virt_to_phys(x); +} + +#define virt_to_idmap(x) __virt_to_idmap((unsigned long)(x)) + +/* * Virtual <-> DMA view memory address translations * Again, these are *only* valid on the kernel direct mapped RAM * memory. Use of these is *deprecated* (and that doesn't mean diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 72024ea..a0eb830 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -80,7 +80,7 @@ void __init smp_set_ops(struct smp_operations *ops) static unsigned long get_arch_pgd(pgd_t *pgd) { - phys_addr_t pgdir = virt_to_phys(pgd); + phys_addr_t pgdir = virt_to_idmap(pgd); BUG_ON(pgdir & ARCH_PGD_MASK); return pgdir >> ARCH_PGD_SHIFT; } diff --git a/arch/arm/mm/idmap.c b/arch/arm/mm/idmap.c index 83cb3ac..c0a1e48 100644 --- a/arch/arm/mm/idmap.c +++ b/arch/arm/mm/idmap.c @@ -10,6 +10,7 @@ #include pgd_t *idmap_pgd; +phys_addr_t (*arch_virt_to_idmap) (unsigned long x); #ifdef CONFIG_ARM_LPAE static void idmap_add_pmd(pud_t *pud, unsigned long addr, unsigned long end, @@ -67,8 +68,8 @@ static void identity_mapping_add(pgd_t *pgd, const char *text_start, unsigned long addr, end; unsigned long next; - addr = virt_to_phys(text_start); - end = virt_to_phys(text_end); + addr = virt_to_idmap(text_start); + end = virt_to_idmap(text_end); prot |= PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AF; -- cgit v0.10.2 From c1a5f4f6733e39c3b780ff14ad6a68252475e7a9 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Wed, 31 Jul 2013 12:44:43 -0400 Subject: ARM: mm: Move the idmap print to appropriate place in the code Commit 9e9a367c29cebd2 {ARM: Section based HYP idmap} moved the address conversion inside identity_mapping_add() without respective print which carries useful idmap information. Move the print as well inside identity_mapping_add() to fix the same. Cc: Will Deacon Cc: Nicolas Pitre Cc: Russell King Signed-off-by: Santosh Shilimkar diff --git a/arch/arm/mm/idmap.c b/arch/arm/mm/idmap.c index c0a1e48..8e0e52e 100644 --- a/arch/arm/mm/idmap.c +++ b/arch/arm/mm/idmap.c @@ -70,6 +70,7 @@ static void identity_mapping_add(pgd_t *pgd, const char *text_start, addr = virt_to_idmap(text_start); end = virt_to_idmap(text_end); + pr_info("Setting up static identity map for 0x%lx - 0x%lx\n", addr, end); prot |= PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AF; @@ -91,8 +92,6 @@ static int __init init_static_idmap(void) if (!idmap_pgd) return -ENOMEM; - pr_info("Setting up static identity map for 0x%p - 0x%p\n", - __idmap_text_start, __idmap_text_end); identity_mapping_add(idmap_pgd, __idmap_text_start, __idmap_text_end, 0); -- cgit v0.10.2 From f52bb722547f43caeaecbcc62db9f3c3b80ead9b Mon Sep 17 00:00:00 2001 From: Sricharan R Date: Mon, 29 Jul 2013 20:26:22 +0530 Subject: ARM: mm: Correct virt_to_phys patching for 64 bit physical addresses The current phys_to_virt patching mechanism works only for 32 bit physical addresses and this patch extends the idea for 64bit physical addresses. The 64bit v2p patching mechanism patches the higher 8 bits of physical address with a constant using 'mov' instruction and lower 32bits are patched using 'add'. While this is correct, in those platforms where the lowmem addressable physical memory spawns across 4GB boundary, a carry bit can be produced as a result of addition of lower 32bits. This has to be taken in to account and added in to the upper. The patched __pv_offset and va are added in lower 32bits, where __pv_offset can be in two's complement form when PA_START < VA_START and that can result in a false carry bit. e.g 1) PA = 0x80000000; VA = 0xC0000000 __pv_offset = PA - VA = 0xC0000000 (2's complement) 2) PA = 0x2 80000000; VA = 0xC000000 __pv_offset = PA - VA = 0x1 C0000000 So adding __pv_offset + VA should never result in a true overflow for (1). So in order to differentiate between a true carry, a __pv_offset is extended to 64bit and the upper 32bits will have 0xffffffff if __pv_offset is 2's complement. So 'mvn #0' is inserted instead of 'mov' while patching for the same reason. Since mov, add, sub instruction are to patched with different constants inside the same stub, the rotation field of the opcode is using to differentiate between them. So the above examples for v2p translation becomes for VA=0xC0000000, 1) PA[63:32] = 0xffffffff PA[31:0] = VA + 0xC0000000 --> results in a carry PA[63:32] = PA[63:32] + carry PA[63:0] = 0x0 80000000 2) PA[63:32] = 0x1 PA[31:0] = VA + 0xC0000000 --> results in a carry PA[63:32] = PA[63:32] + carry PA[63:0] = 0x2 80000000 The above ideas were suggested by Nicolas Pitre as part of the review of first and second versions of the subject patch. There is no corresponding change on the phys_to_virt() side, because computations on the upper 32-bits would be discarded anyway. Cc: Russell King Reviewed-by: Nicolas Pitre Signed-off-by: Sricharan R Signed-off-by: Santosh Shilimkar diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index d9b96c65..6748d62 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -172,9 +172,14 @@ * so that all we need to do is modify the 8-bit constant field. */ #define __PV_BITS_31_24 0x81000000 +#define __PV_BITS_7_0 0x81 extern phys_addr_t (*arch_virt_to_idmap) (unsigned long x); -extern unsigned long __pv_phys_offset; +extern u64 __pv_phys_offset; +extern u64 __pv_offset; +extern void fixup_pv_table(const void *, unsigned long); +extern const void *__pv_table_begin, *__pv_table_end; + #define PHYS_OFFSET __pv_phys_offset #define __pv_stub(from,to,instr,type) \ @@ -186,10 +191,36 @@ extern unsigned long __pv_phys_offset; : "=r" (to) \ : "r" (from), "I" (type)) +#define __pv_stub_mov_hi(t) \ + __asm__ volatile("@ __pv_stub_mov\n" \ + "1: mov %R0, %1\n" \ + " .pushsection .pv_table,\"a\"\n" \ + " .long 1b\n" \ + " .popsection\n" \ + : "=r" (t) \ + : "I" (__PV_BITS_7_0)) + +#define __pv_add_carry_stub(x, y) \ + __asm__ volatile("@ __pv_add_carry_stub\n" \ + "1: adds %Q0, %1, %2\n" \ + " adc %R0, %R0, #0\n" \ + " .pushsection .pv_table,\"a\"\n" \ + " .long 1b\n" \ + " .popsection\n" \ + : "+r" (y) \ + : "r" (x), "I" (__PV_BITS_31_24) \ + : "cc") + static inline phys_addr_t __virt_to_phys(unsigned long x) { - unsigned long t; - __pv_stub(x, t, "add", __PV_BITS_31_24); + phys_addr_t t; + + if (sizeof(phys_addr_t) == 4) { + __pv_stub(x, t, "add", __PV_BITS_31_24); + } else { + __pv_stub_mov_hi(t); + __pv_add_carry_stub(x, t); + } return t; } diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c index 60d3b73..1f031dd 100644 --- a/arch/arm/kernel/armksyms.c +++ b/arch/arm/kernel/armksyms.c @@ -155,4 +155,5 @@ EXPORT_SYMBOL(__gnu_mcount_nc); #ifdef CONFIG_ARM_PATCH_PHYS_VIRT EXPORT_SYMBOL(__pv_phys_offset); +EXPORT_SYMBOL(__pv_offset); #endif diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S index 2c7cc1e..5454794 100644 --- a/arch/arm/kernel/head.S +++ b/arch/arm/kernel/head.S @@ -536,6 +536,14 @@ ENTRY(fixup_smp) ldmfd sp!, {r4 - r6, pc} ENDPROC(fixup_smp) +#ifdef __ARMEB_ +#define LOW_OFFSET 0x4 +#define HIGH_OFFSET 0x0 +#else +#define LOW_OFFSET 0x0 +#define HIGH_OFFSET 0x4 +#endif + #ifdef CONFIG_ARM_PATCH_PHYS_VIRT /* __fixup_pv_table - patch the stub instructions with the delta between @@ -546,17 +554,20 @@ ENDPROC(fixup_smp) __HEAD __fixup_pv_table: adr r0, 1f - ldmia r0, {r3-r5, r7} - sub r3, r0, r3 @ PHYS_OFFSET - PAGE_OFFSET + ldmia r0, {r3-r7} + mvn ip, #0 + subs r3, r0, r3 @ PHYS_OFFSET - PAGE_OFFSET add r4, r4, r3 @ adjust table start address add r5, r5, r3 @ adjust table end address - add r7, r7, r3 @ adjust __pv_phys_offset address - str r8, [r7] @ save computed PHYS_OFFSET to __pv_phys_offset + add r6, r6, r3 @ adjust __pv_phys_offset address + add r7, r7, r3 @ adjust __pv_offset address + str r8, [r6, #LOW_OFFSET] @ save computed PHYS_OFFSET to __pv_phys_offset + strcc ip, [r7, #HIGH_OFFSET] @ save to __pv_offset high bits mov r6, r3, lsr #24 @ constant for add/sub instructions teq r3, r6, lsl #24 @ must be 16MiB aligned THUMB( it ne @ cross section branch ) bne __error - str r6, [r7, #4] @ save to __pv_offset + str r3, [r7, #LOW_OFFSET] @ save to __pv_offset low bits b __fixup_a_pv_table ENDPROC(__fixup_pv_table) @@ -565,10 +576,19 @@ ENDPROC(__fixup_pv_table) .long __pv_table_begin .long __pv_table_end 2: .long __pv_phys_offset + .long __pv_offset .text __fixup_a_pv_table: + adr r0, 3f + ldr r6, [r0] + add r6, r6, r3 + ldr r0, [r6, #HIGH_OFFSET] @ pv_offset high word + ldr r6, [r6, #LOW_OFFSET] @ pv_offset low word + mov r6, r6, lsr #24 + cmn r0, #1 #ifdef CONFIG_THUMB2_KERNEL + moveq r0, #0x200000 @ set bit 21, mov to mvn instruction lsls r6, #24 beq 2f clz r7, r6 @@ -582,18 +602,28 @@ __fixup_a_pv_table: b 2f 1: add r7, r3 ldrh ip, [r7, #2] - and ip, 0x8f00 - orr ip, r6 @ mask in offset bits 31-24 + tst ip, #0x4000 + and ip, #0x8f00 + orrne ip, r6 @ mask in offset bits 31-24 + orreq ip, r0 @ mask in offset bits 7-0 strh ip, [r7, #2] + ldrheq ip, [r7] + biceq ip, #0x20 + orreq ip, ip, r0, lsr #16 + strheq ip, [r7] 2: cmp r4, r5 ldrcc r7, [r4], #4 @ use branch for delay slot bcc 1b bx lr #else + moveq r0, #0x400000 @ set bit 22, mov to mvn instruction b 2f 1: ldr ip, [r7, r3] bic ip, ip, #0x000000ff - orr ip, ip, r6 @ mask in offset bits 31-24 + tst ip, #0xf00 @ check the rotation field + orrne ip, ip, r6 @ mask in offset bits 31-24 + biceq ip, ip, #0x400000 @ clear bit 22 + orreq ip, ip, r0 @ mask in offset bits 7-0 str ip, [r7, r3] 2: cmp r4, r5 ldrcc r7, [r4], #4 @ use branch for delay slot @@ -602,28 +632,29 @@ __fixup_a_pv_table: #endif ENDPROC(__fixup_a_pv_table) +3: .long __pv_offset + ENTRY(fixup_pv_table) stmfd sp!, {r4 - r7, lr} - ldr r2, 2f @ get address of __pv_phys_offset mov r3, #0 @ no offset mov r4, r0 @ r0 = table start add r5, r0, r1 @ r1 = table size - ldr r6, [r2, #4] @ get __pv_offset bl __fixup_a_pv_table ldmfd sp!, {r4 - r7, pc} ENDPROC(fixup_pv_table) - .align -2: .long __pv_phys_offset - .data .globl __pv_phys_offset .type __pv_phys_offset, %object __pv_phys_offset: - .long 0 - .size __pv_phys_offset, . - __pv_phys_offset + .quad 0 + .size __pv_phys_offset, . -__pv_phys_offset + + .globl __pv_offset + .type __pv_offset, %object __pv_offset: - .long 0 + .quad 0 + .size __pv_offset, . -__pv_offset #endif #include "head-common.S" -- cgit v0.10.2 From a77e0c7b2774fd52ce6bf25c2c3ffdccb7b110ff Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Wed, 31 Jul 2013 12:44:46 -0400 Subject: ARM: mm: Recreate kernel mappings in early_paging_init() This patch adds a step in the init sequence, in order to recreate the kernel code/data page table mappings prior to full paging initialization. This is necessary on LPAE systems that run out of a physical address space outside the 4G limit. On these systems, this implementation provides a machine descriptor hook that allows the PHYS_OFFSET to be overridden in a machine specific fashion. Cc: Russell King Acked-by: Nicolas Pitre Signed-off-by: R Sricharan Signed-off-by: Santosh Shilimkar diff --git a/arch/arm/include/asm/mach/arch.h b/arch/arm/include/asm/mach/arch.h index 402a2bc..17a3fa2 100644 --- a/arch/arm/include/asm/mach/arch.h +++ b/arch/arm/include/asm/mach/arch.h @@ -49,6 +49,7 @@ struct machine_desc { bool (*smp_init)(void); void (*fixup)(struct tag *, char **, struct meminfo *); + void (*init_meminfo)(void); void (*reserve)(void);/* reserve mem blocks */ void (*map_io)(void);/* IO mapping function */ void (*init_early)(void); diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 0e1e2b3..af7b7db 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -73,6 +73,8 @@ __setup("fpe=", fpe_setup); #endif extern void paging_init(const struct machine_desc *desc); +extern void early_paging_init(const struct machine_desc *, + struct proc_info_list *); extern void sanity_check_meminfo(void); extern enum reboot_mode reboot_mode; extern void setup_dma_zone(const struct machine_desc *desc); @@ -878,6 +880,8 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]), meminfo_cmp, NULL); + + early_paging_init(mdesc, lookup_processor_type(read_cpuid_id())); sanity_check_meminfo(); arm_memblock_init(&meminfo, mdesc); diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index b1d17ee..78eeeca 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include #include @@ -1315,6 +1317,86 @@ static void __init map_lowmem(void) } } +#ifdef CONFIG_ARM_LPAE +/* + * early_paging_init() recreates boot time page table setup, allowing machines + * to switch over to a high (>4G) address space on LPAE systems + */ +void __init early_paging_init(const struct machine_desc *mdesc, + struct proc_info_list *procinfo) +{ + pmdval_t pmdprot = procinfo->__cpu_mm_mmu_flags; + unsigned long map_start, map_end; + pgd_t *pgd0, *pgdk; + pud_t *pud0, *pudk, *pud_start; + pmd_t *pmd0, *pmdk; + phys_addr_t phys; + int i; + + if (!(mdesc->init_meminfo)) + return; + + /* remap kernel code and data */ + map_start = init_mm.start_code; + map_end = init_mm.brk; + + /* get a handle on things... */ + pgd0 = pgd_offset_k(0); + pud_start = pud0 = pud_offset(pgd0, 0); + pmd0 = pmd_offset(pud0, 0); + + pgdk = pgd_offset_k(map_start); + pudk = pud_offset(pgdk, map_start); + pmdk = pmd_offset(pudk, map_start); + + mdesc->init_meminfo(); + + /* Run the patch stub to update the constants */ + fixup_pv_table(&__pv_table_begin, + (&__pv_table_end - &__pv_table_begin) << 2); + + /* + * Cache cleaning operations for self-modifying code + * We should clean the entries by MVA but running a + * for loop over every pv_table entry pointer would + * just complicate the code. + */ + flush_cache_louis(); + dsb(); + isb(); + + /* remap level 1 table */ + for (i = 0; i < PTRS_PER_PGD; pud0++, i++) { + set_pud(pud0, + __pud(__pa(pmd0) | PMD_TYPE_TABLE | L_PGD_SWAPPER)); + pmd0 += PTRS_PER_PMD; + } + + /* remap pmds for kernel mapping */ + phys = __pa(map_start) & PMD_MASK; + do { + *pmdk++ = __pmd(phys | pmdprot); + phys += PMD_SIZE; + } while (phys < map_end); + + flush_cache_all(); + cpu_switch_mm(pgd0, &init_mm); + cpu_set_ttbr(1, __pa(pgd0) + TTBR1_OFFSET); + local_flush_bp_all(); + local_flush_tlb_all(); +} + +#else + +void __init early_paging_init(const struct machine_desc *mdesc, + struct proc_info_list *procinfo) +{ + if (mdesc->init_meminfo) + mdesc->init_meminfo(); +} + +#endif + /* * paging_init() sets up the page tables, initialises the zone memory * maps, and sets up the zero page, bad page and bad page tables. -- cgit v0.10.2 From d10d2d485497cdc62a7660cd981f8f1ae0dffe7d Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Fri, 1 Feb 2013 09:41:37 +0000 Subject: ARM: fix ARCH_IXP4xx usage of ARCH_SUPPORTS_BIG_ENDIAN The Kconfig for arch/arm/mach-ixp4xx has a local definition of ARCH_SUPPORTS_BIG_ENDIAN which could be used elsewhere. This means that if IXP4xx is selected and this symbol is selected eleswhere then an warning is produced. Clean the following error up by making the symbol be selected by the main ARCH_IXP4XX definition and have a common definition in arch/arm/mm/Kconfig warning: (ARCH_xxx) selects ARCH_SUPPORTS_BIG_ENDIAN which has unmet direct dependencies (ARCH_IXP4XX) warning: (ARCH_xxx) selects ARCH_SUPPORTS_BIG_ENDIAN which has unmet direct dependencies (ARCH_IXP4XX) Signed-off-by: Ben Dooks diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 1ad6fb6..f048e14 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -481,6 +481,7 @@ config ARCH_IXP4XX bool "IXP4xx-based" depends on MMU select ARCH_HAS_DMA_SET_COHERENT_MASK + select ARCH_SUPPORTS_BIG_ENDIAN select ARCH_REQUIRE_GPIOLIB select CLKSRC_MMIO select CPU_XSCALE diff --git a/arch/arm/mach-ixp4xx/Kconfig b/arch/arm/mach-ixp4xx/Kconfig index 30e1ebe..c342dc4 100644 --- a/arch/arm/mach-ixp4xx/Kconfig +++ b/arch/arm/mach-ixp4xx/Kconfig @@ -1,9 +1,5 @@ if ARCH_IXP4XX -config ARCH_SUPPORTS_BIG_ENDIAN - bool - default y - menu "Intel IXP4xx Implementation Options" comment "IXP4xx Platforms" diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index cd2c88e..1f8fed9 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -952,3 +952,9 @@ config ARCH_HAS_BARRIERS help This option allows the use of custom mandatory barriers included via the mach/barriers.h file. + +config ARCH_SUPPORTS_BIG_ENDIAN + bool + help + This option specifies the architecture can support big endian + operation. -- cgit v0.10.2 From 457c2403c513c74f60d5757fd11ae927e5554a38 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Tue, 12 Feb 2013 18:59:57 +0000 Subject: ARM: asm: Add ARM_BE8() assembly helper Add ARM_BE8() helper to wrap any code conditional on being compile when CONFIG_ARM_ENDIAN_BE8 is selected and convert existing places where this is to use it. Acked-by: Nicolas Pitre Reviewed-by: Will Deacon Signed-off-by: Ben Dooks diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S index 75189f1..c912c2a 100644 --- a/arch/arm/boot/compressed/head.S +++ b/arch/arm/boot/compressed/head.S @@ -699,9 +699,7 @@ __armv4_mmu_cache_on: mrc p15, 0, r0, c1, c0, 0 @ read control reg orr r0, r0, #0x5000 @ I-cache enable, RR cache replacement orr r0, r0, #0x0030 -#ifdef CONFIG_CPU_ENDIAN_BE8 - orr r0, r0, #1 << 25 @ big-endian page tables -#endif + ARM_BE8( orr r0, r0, #1 << 25 ) @ big-endian page tables bl __common_mmu_cache_on mov r0, #0 mcr p15, 0, r0, c8, c7, 0 @ flush I,D TLBs @@ -728,9 +726,7 @@ __armv7_mmu_cache_on: orr r0, r0, #1 << 22 @ U (v6 unaligned access model) @ (needed for ARM1176) #ifdef CONFIG_MMU -#ifdef CONFIG_CPU_ENDIAN_BE8 - orr r0, r0, #1 << 25 @ big-endian page tables -#endif + ARM_BE8( orr r0, r0, #1 << 25 ) @ big-endian page tables mrcne p15, 0, r6, c2, c0, 2 @ read ttb control reg orrne r0, r0, #1 @ MMU enabled movne r1, #0xfffffffd @ domain 0 = client diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h index fcc1b5b..5c22851 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h @@ -53,6 +53,13 @@ #define put_byte_3 lsl #0 #endif +/* Select code for any configuration running in BE8 mode */ +#ifdef CONFIG_CPU_ENDIAN_BE8 +#define ARM_BE8(code...) code +#else +#define ARM_BE8(code...) +#endif + /* * Data preload for architectures that support it */ diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 9cbe70c..55090fb 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -416,9 +416,8 @@ __und_usr: bne __und_usr_thumb sub r4, r2, #4 @ ARM instr at LR - 4 1: ldrt r0, [r4] -#ifdef CONFIG_CPU_ENDIAN_BE8 - rev r0, r0 @ little endian instruction -#endif + ARM_BE8(rev r0, r0) @ little endian instruction + @ r0 = 32-bit ARM instruction which caused the exception @ r2 = PC value for the following instruction (:= regs->ARM_pc) @ r4 = PC value for the faulting instruction diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index bc6bd96..a2dcafd 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -393,9 +393,7 @@ ENTRY(vector_swi) #else USER( ldr r10, [lr, #-4] ) @ get SWI instruction #endif -#ifdef CONFIG_CPU_ENDIAN_BE8 - rev r10, r10 @ little endian instruction -#endif + ARM_BE8(rev r10, r10) @ little endian instruction #elif defined(CONFIG_AEABI) diff --git a/arch/arm/mm/abort-ev6.S b/arch/arm/mm/abort-ev6.S index 8074199..3815a82 100644 --- a/arch/arm/mm/abort-ev6.S +++ b/arch/arm/mm/abort-ev6.S @@ -38,9 +38,8 @@ ENTRY(v6_early_abort) bne do_DataAbort bic r1, r1, #1 << 11 @ clear bit 11 of FSR ldr r3, [r4] @ read aborted ARM instruction -#ifdef CONFIG_CPU_ENDIAN_BE8 - rev r3, r3 -#endif + ARM_BE8(rev r3, r3) + do_ldrd_abort tmp=ip, insn=r3 tst r3, #1 << 20 @ L = 0 -> write orreq r1, r1, #1 << 11 @ yes. diff --git a/arch/arm/mm/proc-v6.S b/arch/arm/mm/proc-v6.S index 1128064..45dc29f 100644 --- a/arch/arm/mm/proc-v6.S +++ b/arch/arm/mm/proc-v6.S @@ -220,9 +220,7 @@ __v6_setup: #endif /* CONFIG_MMU */ adr r5, v6_crval ldmia r5, {r5, r6} -#ifdef CONFIG_CPU_ENDIAN_BE8 - orr r6, r6, #1 << 25 @ big-endian page tables -#endif + ARM_BE8(orr r6, r6, #1 << 25) @ big-endian page tables mrc p15, 0, r0, c1, c0, 0 @ read control register bic r0, r0, r5 @ clear bits them orr r0, r0, r6 @ set them diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S index c63d9bd..60920f6 100644 --- a/arch/arm/mm/proc-v7.S +++ b/arch/arm/mm/proc-v7.S @@ -367,9 +367,7 @@ __v7_setup: #endif adr r5, v7_crval ldmia r5, {r5, r6} -#ifdef CONFIG_CPU_ENDIAN_BE8 - orr r6, r6, #1 << 25 @ big-endian page tables -#endif + ARM_BE8(orr r6, r6, #1 << 25) @ big-endian page tables #ifdef CONFIG_SWP_EMULATE orr r5, r5, #(1 << 10) @ set SW bit in "clear" bic r6, r6, #(1 << 10) @ clear it in "mmuset" -- cgit v0.10.2 From 2f9bf9beddb1649485b47302a5aba9761cbc9084 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Fri, 1 Feb 2013 16:23:08 +0100 Subject: ARM: fixup_pv_table bug when CPU_ENDIAN_BE8 The fixup_pv_table assumes that the instructions are in the same endian configuration as the data, but when the CPU is running in BE8 the instructions stay in little-endian format. Make sure if CONFIG_CPU_ENDIAN_BE8 is set that we do all the alterations to the instructions taking in to account the LDR/STR will be swapping the data endian-ness. Since the code is only modifying a byte, we avoid dual-swapping the data, and just change the bits we clear and ORR in (in the case where the code is not thumb2). For thumb2, we add the necessary rev16 instructions to ensure that the instructions are processed in the correct format, as it was easier than re-writing the code to contain a mask and shift. Signed-off-by: Ben Dooks Reviewed-by: Dave Martin Tested-by: Thomas Petazzoni diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S index 2c7cc1e..9e5906c 100644 --- a/arch/arm/kernel/head.S +++ b/arch/arm/kernel/head.S @@ -582,8 +582,10 @@ __fixup_a_pv_table: b 2f 1: add r7, r3 ldrh ip, [r7, #2] +ARM_BE8(rev16 ip, ip) and ip, 0x8f00 orr ip, r6 @ mask in offset bits 31-24 +ARM_BE8(rev16 ip, ip) strh ip, [r7, #2] 2: cmp r4, r5 ldrcc r7, [r4], #4 @ use branch for delay slot @@ -592,8 +594,14 @@ __fixup_a_pv_table: #else b 2f 1: ldr ip, [r7, r3] +#ifdef CONFIG_CPU_ENDIAN_BE8 + @ in BE8, we load data in BE, but instructions still in LE + bic ip, ip, #0xff000000 + orr ip, ip, r6, lsl#24 +#else bic ip, ip, #0x000000ff orr ip, ip, r6 @ mask in offset bits 31-24 +#endif str ip, [r7, r3] 2: cmp r4, r5 ldrcc r7, [r4], #4 @ use branch for delay slot -- cgit v0.10.2 From 97bcb0fea590d3d704f985bec08f342d28992634 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Fri, 1 Feb 2013 09:40:42 +0000 Subject: ARM: set BE8 if LE in head code If we are booting in LE and compiled for BE8, then add code to set the state to bE8. Since the instruction stream is always LE, we do not need to do anything special to the instruction. Also ensure that the secondary processors are started in the same mode. Note, we do add about 20 bytes to the kernel image, but it seems easier to do this than adding another configuration to change. Signed-off-by: Ben Dooks Reviewed-by: Dave Martin Tested-by: Thomas Petazzoni diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S index c912c2a..066b034 100644 --- a/arch/arm/boot/compressed/head.S +++ b/arch/arm/boot/compressed/head.S @@ -135,6 +135,7 @@ start: .word _edata @ zImage end address THUMB( .thumb ) 1: + ARM_BE8( setend be ) @ go BE8 if compiled for BE8 mrs r9, cpsr #ifdef CONFIG_ARM_VIRT_EXT bl __hyp_stub_install @ get into SVC mode, reversibly diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S index 9e5906c..a047acf 100644 --- a/arch/arm/kernel/head.S +++ b/arch/arm/kernel/head.S @@ -77,6 +77,7 @@ __HEAD ENTRY(stext) + ARM_BE8(setend be ) @ ensure we are in BE8 mode THUMB( adr r9, BSYM(1f) ) @ Kernel is always entered in ARM. THUMB( bx r9 ) @ If this is a Thumb-2 kernel, @@ -352,6 +353,9 @@ ENTRY(secondary_startup) * the processor type - there is no need to check the machine type * as it has already been validated by the primary processor. */ + + ARM_BE8(setend be) @ ensure we are in BE8 mode + #ifdef CONFIG_ARM_VIRT_EXT bl __hyp_stub_install_secondary #endif diff --git a/arch/arm/kernel/sleep.S b/arch/arm/kernel/sleep.S index db1536b..716343ca 100644 --- a/arch/arm/kernel/sleep.S +++ b/arch/arm/kernel/sleep.S @@ -130,6 +130,7 @@ ENDPROC(cpu_resume_after_mmu) .data .align ENTRY(cpu_resume) +ARM_BE8(setend be) @ ensure we are in BE mode mov r1, #0 ALT_SMP(mrc p15, 0, r0, c0, c0, 5) ALT_UP_B(1f) -- cgit v0.10.2 From 76e3faf156fa95b6465e747d702b94faf67117fc Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Wed, 6 Feb 2013 18:25:36 +0000 Subject: ARM: pl01x debug code endian fix The PL01X debug code needs to take into account which endian mode the processor is running in. If it is big-endian, ensure the data is swapped appropriately. Note, we could do this slightly more efficiently if we have an macro to do the necessary swap for the bits used by test. Reviewed-by: Will Deacon Signed-off-by: Ben Dooks diff --git a/arch/arm/include/debug/pl01x.S b/arch/arm/include/debug/pl01x.S index 37c6895b..92ef808 100644 --- a/arch/arm/include/debug/pl01x.S +++ b/arch/arm/include/debug/pl01x.S @@ -25,12 +25,14 @@ .macro waituart,rd,rx 1001: ldr \rd, [\rx, #UART01x_FR] + ARM_BE8( rev \rd, \rd ) tst \rd, #UART01x_FR_TXFF bne 1001b .endm .macro busyuart,rd,rx 1001: ldr \rd, [\rx, #UART01x_FR] + ARM_BE8( rev \rd, \rd ) tst \rd, #UART01x_FR_BUSY bne 1001b .endm -- cgit v0.10.2 From 2e874ea342146130206f8b39f2103f33690a7547 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Wed, 6 Feb 2013 18:44:20 +0000 Subject: ARM: twd: data endian fix Ensure the twd driver uses the correct calls to access the hardware to ensure that we do not end up with data in the wrong endian format. Reviewed-by: Will Deacon Signed-off-by: Ben Dooks diff --git a/arch/arm/kernel/smp_twd.c b/arch/arm/kernel/smp_twd.c index 2985c9f..6591e26 100644 --- a/arch/arm/kernel/smp_twd.c +++ b/arch/arm/kernel/smp_twd.c @@ -45,7 +45,7 @@ static void twd_set_mode(enum clock_event_mode mode, case CLOCK_EVT_MODE_PERIODIC: ctrl = TWD_TIMER_CONTROL_ENABLE | TWD_TIMER_CONTROL_IT_ENABLE | TWD_TIMER_CONTROL_PERIODIC; - __raw_writel(DIV_ROUND_CLOSEST(twd_timer_rate, HZ), + writel_relaxed(DIV_ROUND_CLOSEST(twd_timer_rate, HZ), twd_base + TWD_TIMER_LOAD); break; case CLOCK_EVT_MODE_ONESHOT: @@ -58,18 +58,18 @@ static void twd_set_mode(enum clock_event_mode mode, ctrl = 0; } - __raw_writel(ctrl, twd_base + TWD_TIMER_CONTROL); + writel_relaxed(ctrl, twd_base + TWD_TIMER_CONTROL); } static int twd_set_next_event(unsigned long evt, struct clock_event_device *unused) { - unsigned long ctrl = __raw_readl(twd_base + TWD_TIMER_CONTROL); + unsigned long ctrl = readl_relaxed(twd_base + TWD_TIMER_CONTROL); ctrl |= TWD_TIMER_CONTROL_ENABLE; - __raw_writel(evt, twd_base + TWD_TIMER_COUNTER); - __raw_writel(ctrl, twd_base + TWD_TIMER_CONTROL); + writel_relaxed(evt, twd_base + TWD_TIMER_COUNTER); + writel_relaxed(ctrl, twd_base + TWD_TIMER_CONTROL); return 0; } @@ -82,8 +82,8 @@ static int twd_set_next_event(unsigned long evt, */ static int twd_timer_ack(void) { - if (__raw_readl(twd_base + TWD_TIMER_INTSTAT)) { - __raw_writel(1, twd_base + TWD_TIMER_INTSTAT); + if (readl_relaxed(twd_base + TWD_TIMER_INTSTAT)) { + writel_relaxed(1, twd_base + TWD_TIMER_INTSTAT); return 1; } @@ -211,15 +211,15 @@ static void twd_calibrate_rate(void) waitjiffies += 5; /* enable, no interrupt or reload */ - __raw_writel(0x1, twd_base + TWD_TIMER_CONTROL); + writel_relaxed(0x1, twd_base + TWD_TIMER_CONTROL); /* maximum value */ - __raw_writel(0xFFFFFFFFU, twd_base + TWD_TIMER_COUNTER); + writel_relaxed(0xFFFFFFFFU, twd_base + TWD_TIMER_COUNTER); while (get_jiffies_64() < waitjiffies) udelay(10); - count = __raw_readl(twd_base + TWD_TIMER_COUNTER); + count = readl_relaxed(twd_base + TWD_TIMER_COUNTER); twd_timer_rate = (0xFFFFFFFFU - count) * (HZ / 5); @@ -277,7 +277,7 @@ static void twd_timer_setup(void) * bother with the below. */ if (per_cpu(percpu_setup_called, cpu)) { - __raw_writel(0, twd_base + TWD_TIMER_CONTROL); + writel_relaxed(0, twd_base + TWD_TIMER_CONTROL); clockevents_register_device(clk); enable_percpu_irq(clk->irq, 0); return; @@ -290,7 +290,7 @@ static void twd_timer_setup(void) * The following is done once per CPU the first time .setup() is * called. */ - __raw_writel(0, twd_base + TWD_TIMER_CONTROL); + writel_relaxed(0, twd_base + TWD_TIMER_CONTROL); clk->name = "local_timer"; clk->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT | -- cgit v0.10.2 From 099a4809133dc6548d37cc143ab0cb9c2eba97bb Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Thu, 7 Feb 2013 11:14:21 +0000 Subject: ARM: smp_scu: data endian fixes The smp_scu driver needs to use the relaxed readl/write accessors to avoid any issues with the endian mode the processor core is in. Reviewed-by: Will Deacon Signed-off-by: Ben Dooks diff --git a/arch/arm/kernel/smp_scu.c b/arch/arm/kernel/smp_scu.c index 5bc1a63..1aafa0d 100644 --- a/arch/arm/kernel/smp_scu.c +++ b/arch/arm/kernel/smp_scu.c @@ -28,7 +28,7 @@ */ unsigned int __init scu_get_core_count(void __iomem *scu_base) { - unsigned int ncores = __raw_readl(scu_base + SCU_CONFIG); + unsigned int ncores = readl_relaxed(scu_base + SCU_CONFIG); return (ncores & 0x03) + 1; } @@ -42,19 +42,19 @@ void scu_enable(void __iomem *scu_base) #ifdef CONFIG_ARM_ERRATA_764369 /* Cortex-A9 only */ if ((read_cpuid_id() & 0xff0ffff0) == 0x410fc090) { - scu_ctrl = __raw_readl(scu_base + 0x30); + scu_ctrl = readl_relaxed(scu_base + 0x30); if (!(scu_ctrl & 1)) - __raw_writel(scu_ctrl | 0x1, scu_base + 0x30); + writel_relaxed(scu_ctrl | 0x1, scu_base + 0x30); } #endif - scu_ctrl = __raw_readl(scu_base + SCU_CTRL); + scu_ctrl = readl_relaxed(scu_base + SCU_CTRL); /* already enabled? */ if (scu_ctrl & 1) return; scu_ctrl |= 1; - __raw_writel(scu_ctrl, scu_base + SCU_CTRL); + writel_relaxed(scu_ctrl, scu_base + SCU_CTRL); /* * Ensure that the data accessed by CPU0 before the SCU was @@ -80,9 +80,9 @@ int scu_power_mode(void __iomem *scu_base, unsigned int mode) if (mode > 3 || mode == 1 || cpu > 3) return -EINVAL; - val = __raw_readb(scu_base + SCU_CPU_STATUS + cpu) & ~0x03; + val = readb_relaxed(scu_base + SCU_CPU_STATUS + cpu) & ~0x03; val |= mode; - __raw_writeb(val, scu_base + SCU_CPU_STATUS + cpu); + writeb_relaxed(val, scu_base + SCU_CPU_STATUS + cpu); return 0; } -- cgit v0.10.2 From 50eec2fce45ed48575f1c0582b748e409da08511 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Fri, 1 Feb 2013 09:31:34 +0000 Subject: ARM: highbank: enable big-endian Apart from a xgmac driver issue, the highbank seems to work correctly in big-endian mode. Allow the selection of big-endian in the system. Signed-off-by: Ben Dooks Acked-by: Rob Herring diff --git a/arch/arm/mach-highbank/Kconfig b/arch/arm/mach-highbank/Kconfig index 8e8437d..3c3bff7 100644 --- a/arch/arm/mach-highbank/Kconfig +++ b/arch/arm/mach-highbank/Kconfig @@ -4,6 +4,7 @@ config ARCH_HIGHBANK select ARCH_HAS_CPUFREQ select ARCH_HAS_HOLES_MEMORYMODEL select ARCH_HAS_OPP + select ARCH_SUPPORTS_BIG_ENDIAN select ARCH_WANT_OPTIONAL_GPIOLIB select ARM_AMBA select ARM_ERRATA_764369 -- cgit v0.10.2 From bca028e7c2537fea9f401c20dd7b2103358b5efe Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Fri, 1 Feb 2013 10:36:22 +0000 Subject: ARM: mvebu: support running big-endian Add indication we can run these cores in BE mode, and ensure that the secondary CPU is set to big-endian mode in the initialisation code as the initial code runs little-endian. Signed-off-by: Ben Dooks Tested-by: Thomas Petazzoni Acked-by: Jason Cooper diff --git a/arch/arm/mach-mvebu/Kconfig b/arch/arm/mach-mvebu/Kconfig index 9eb63d7..5e269d7 100644 --- a/arch/arm/mach-mvebu/Kconfig +++ b/arch/arm/mach-mvebu/Kconfig @@ -1,5 +1,6 @@ config ARCH_MVEBU bool "Marvell SOCs with Device Tree support" if ARCH_MULTI_V7 + select ARCH_SUPPORTS_BIG_ENDIAN select CLKSRC_MMIO select COMMON_CLK select GENERIC_CLOCKEVENTS diff --git a/arch/arm/mach-mvebu/coherency_ll.S b/arch/arm/mach-mvebu/coherency_ll.S index 5476669..ee7598f 100644 --- a/arch/arm/mach-mvebu/coherency_ll.S +++ b/arch/arm/mach-mvebu/coherency_ll.S @@ -20,6 +20,8 @@ #define ARMADA_XP_CFB_CTL_REG_OFFSET 0x0 #define ARMADA_XP_CFB_CFG_REG_OFFSET 0x4 +#include + .text /* * r0: Coherency fabric base register address @@ -29,6 +31,7 @@ ENTRY(ll_set_cpu_coherent) /* Create bit by cpu index */ mov r3, #(1 << 24) lsl r1, r3, r1 +ARM_BE8(rev r1, r1) /* Add CPU to SMP group - Atomic */ add r3, r0, #ARMADA_XP_CFB_CTL_REG_OFFSET diff --git a/arch/arm/mach-mvebu/headsmp.S b/arch/arm/mach-mvebu/headsmp.S index 8a1b0c9..3dd80df 100644 --- a/arch/arm/mach-mvebu/headsmp.S +++ b/arch/arm/mach-mvebu/headsmp.S @@ -21,12 +21,16 @@ #include #include +#include + /* * Armada XP specific entry point for secondary CPUs. * We add the CPU to the coherency fabric and then jump to secondary * startup */ ENTRY(armada_xp_secondary_startup) + ARM_BE8(setend be ) @ go BE8 if entered LE + /* Get coherency fabric base physical address */ adr r0, 1f ldr r1, [r0] -- cgit v0.10.2 From 98dec91fa36a4a74f7c44dd2dfb000203656f4f4 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Tue, 28 May 2013 21:34:50 +0100 Subject: ARM: vexpress: add big endian support Add support for the versatile express systems to boot big-endian. Signed-off-by: Ben Dooks diff --git a/arch/arm/mach-vexpress/Kconfig b/arch/arm/mach-vexpress/Kconfig index 3657954..4fe8ebe 100644 --- a/arch/arm/mach-vexpress/Kconfig +++ b/arch/arm/mach-vexpress/Kconfig @@ -1,6 +1,7 @@ config ARCH_VEXPRESS bool "ARM Ltd. Versatile Express family" if ARCH_MULTI_V7 select ARCH_REQUIRE_GPIOLIB + select ARCH_SUPPORTS_BIG_ENDIAN select ARM_AMBA select ARM_GIC select ARM_TIMER_SP804 diff --git a/arch/arm/plat-versatile/headsmp.S b/arch/arm/plat-versatile/headsmp.S index 2677bc3..40f27e5 100644 --- a/arch/arm/plat-versatile/headsmp.S +++ b/arch/arm/plat-versatile/headsmp.S @@ -10,6 +10,7 @@ */ #include #include +#include /* * Realview/Versatile Express specific entry point for secondary CPUs. @@ -17,6 +18,7 @@ * until we're ready for them to initialise. */ ENTRY(versatile_secondary_startup) + ARM_BE8(setend be) mrc p15, 0, r0, c0, c0, 5 bic r0, #0xff000000 adr r4, 1f -- cgit v0.10.2 From 8592edf0dec8159fde379eb7e056eaddbbd697f2 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Thu, 18 Jul 2013 21:10:56 +0100 Subject: ARM: alignment: correctly decode instructions in BE8 mode. If we are in BE8 mode, we must deal with the instruction stream being in LE order when data is being loaded in BE order. Ensure the data is swapped before processing to avoid thre following: Change to using to provide the necessary conversion functions to change the byte ordering. This stops the following warning messages from the kernel on a fault: Unhandled fault: alignment exception (0x001) at 0xbfa09567 Alignment trap: not handling instruction 030091e8 at [<80333e8c>] Signed-off-by: Ben Dooks Reviewed-by: Dave Martin Tested-by: Thomas Petazzoni diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c index 6f4585b..9240364 100644 --- a/arch/arm/mm/alignment.c +++ b/arch/arm/mm/alignment.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "fault.h" @@ -762,21 +763,25 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs) if (thumb_mode(regs)) { u16 *ptr = (u16 *)(instrptr & ~1); fault = probe_kernel_address(ptr, tinstr); + tinstr = __mem_to_opcode_thumb16(tinstr); if (!fault) { if (cpu_architecture() >= CPU_ARCH_ARMv7 && IS_T32(tinstr)) { /* Thumb-2 32-bit */ u16 tinst2 = 0; fault = probe_kernel_address(ptr + 1, tinst2); - instr = (tinstr << 16) | tinst2; + tinst2 = __mem_to_opcode_thumb16(tinst2); + instr = __opcode_thumb32_compose(tinstr, tinst2); thumb2_32b = 1; } else { isize = 2; instr = thumb2arm(tinstr); } } - } else + } else { fault = probe_kernel_address(instrptr, instr); + instr = __mem_to_opcode_arm(instr); + } if (fault) { type = TYPE_FAULT; -- cgit v0.10.2 From a79a0cb1d35ec422dcf493cef1bebf9fdfcfdb9a Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Fri, 19 Jul 2013 17:12:05 +0100 Subject: ARM: traps: use to get correct instruction order The trap handler needs to take into account the endian configuration of the system when loading instructions. Use to provide the necessary conversion functions. Signed-off-by: Ben Dooks Tested-by: Thomas Petazzoni diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 8fcda14..caf96da 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -34,6 +34,7 @@ #include #include #include +#include static const char *handler[]= { "prefetch abort", "data abort", "address exception", "interrupt" }; @@ -402,25 +403,28 @@ asmlinkage void __exception do_undefinstr(struct pt_regs *regs) if (processor_mode(regs) == SVC_MODE) { #ifdef CONFIG_THUMB2_KERNEL if (thumb_mode(regs)) { - instr = ((u16 *)pc)[0]; + instr = __mem_to_opcode_thumb16(((u16 *)pc)[0]); if (is_wide_instruction(instr)) { - instr <<= 16; - instr |= ((u16 *)pc)[1]; + u16 inst2; + inst2 = __mem_to_opcode_thumb16(((u16 *)pc)[1]); + instr = __opcode_thumb32_compose(instr, inst2); } } else #endif - instr = *(u32 *) pc; + instr = __mem_to_opcode_arm(*(u32 *) pc); } else if (thumb_mode(regs)) { if (get_user(instr, (u16 __user *)pc)) goto die_sig; + instr = __mem_to_opcode_thumb16(instr); if (is_wide_instruction(instr)) { unsigned int instr2; if (get_user(instr2, (u16 __user *)pc+1)) goto die_sig; - instr <<= 16; - instr |= instr2; + instr2 = __mem_to_opcode_thumb16(instr2); + instr = __opcode_thumb32_compose(instr, instr2); } } else if (get_user(instr, (u32 __user *)pc)) { + instr = __mem_to_opcode_arm(instr); goto die_sig; } -- cgit v0.10.2 From f592d323bc2353db871d1e840f05b27e0730fb10 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Fri, 19 Jul 2013 18:27:23 +0100 Subject: ARM: module: correctly relocate instructions in BE8 When in BE8 mode, our instructions are not in the same ordering as the data, so use to take this into account. Note, also requires modules to be built --be8 Signed-off-by: Ben Dooks Reviewed-by: Dave Martin diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index 084dc88..5fdb403 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -24,6 +24,7 @@ #include #include #include +#include #ifdef CONFIG_XIP_KERNEL /* @@ -60,6 +61,7 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, Elf32_Sym *sym; const char *symname; s32 offset; + u32 tmp; #ifdef CONFIG_THUMB2_KERNEL u32 upper, lower, sign, j1, j2; #endif @@ -95,7 +97,8 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, case R_ARM_PC24: case R_ARM_CALL: case R_ARM_JUMP24: - offset = (*(u32 *)loc & 0x00ffffff) << 2; + offset = __mem_to_opcode_arm(*(u32 *)loc); + offset = (offset & 0x00ffffff) << 2; if (offset & 0x02000000) offset -= 0x04000000; @@ -111,9 +114,10 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, } offset >>= 2; + offset &= 0x00ffffff; - *(u32 *)loc &= 0xff000000; - *(u32 *)loc |= offset & 0x00ffffff; + *(u32 *)loc &= __opcode_to_mem_arm(0xff000000); + *(u32 *)loc |= __opcode_to_mem_arm(offset); break; case R_ARM_V4BX: @@ -121,8 +125,8 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, * other bits to re-code instruction as * MOV PC,Rm. */ - *(u32 *)loc &= 0xf000000f; - *(u32 *)loc |= 0x01a0f000; + *(u32 *)loc &= __opcode_to_mem_arm(0xf000000f); + *(u32 *)loc |= __opcode_to_mem_arm(0x01a0f000); break; case R_ARM_PREL31: @@ -132,7 +136,7 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, case R_ARM_MOVW_ABS_NC: case R_ARM_MOVT_ABS: - offset = *(u32 *)loc; + offset = tmp = __mem_to_opcode_arm(*(u32 *)loc); offset = ((offset & 0xf0000) >> 4) | (offset & 0xfff); offset = (offset ^ 0x8000) - 0x8000; @@ -140,16 +144,18 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, if (ELF32_R_TYPE(rel->r_info) == R_ARM_MOVT_ABS) offset >>= 16; - *(u32 *)loc &= 0xfff0f000; - *(u32 *)loc |= ((offset & 0xf000) << 4) | - (offset & 0x0fff); + tmp &= 0xfff0f000; + tmp |= ((offset & 0xf000) << 4) | + (offset & 0x0fff); + + *(u32 *)loc = __opcode_to_mem_arm(tmp); break; #ifdef CONFIG_THUMB2_KERNEL case R_ARM_THM_CALL: case R_ARM_THM_JUMP24: - upper = *(u16 *)loc; - lower = *(u16 *)(loc + 2); + upper = __mem_to_opcode_thumb16(*(u16 *)loc); + lower = __mem_to_opcode_thumb16(*(u16 *)(loc + 2)); /* * 25 bit signed address range (Thumb-2 BL and B.W @@ -198,17 +204,20 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, sign = (offset >> 24) & 1; j1 = sign ^ (~(offset >> 23) & 1); j2 = sign ^ (~(offset >> 22) & 1); - *(u16 *)loc = (u16)((upper & 0xf800) | (sign << 10) | + upper = (u16)((upper & 0xf800) | (sign << 10) | ((offset >> 12) & 0x03ff)); - *(u16 *)(loc + 2) = (u16)((lower & 0xd000) | - (j1 << 13) | (j2 << 11) | - ((offset >> 1) & 0x07ff)); + lower = (u16)((lower & 0xd000) | + (j1 << 13) | (j2 << 11) | + ((offset >> 1) & 0x07ff)); + + *(u16 *)loc = __opcode_to_mem_thumb16(upper); + *(u16 *)(loc + 2) = __opcode_to_mem_thumb16(lower); break; case R_ARM_THM_MOVW_ABS_NC: case R_ARM_THM_MOVT_ABS: - upper = *(u16 *)loc; - lower = *(u16 *)(loc + 2); + upper = __mem_to_opcode_thumb16(*(u16 *)loc); + lower = __mem_to_opcode_thumb16(*(u16 *)(loc + 2)); /* * MOVT/MOVW instructions encoding in Thumb-2: @@ -229,12 +238,14 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, if (ELF32_R_TYPE(rel->r_info) == R_ARM_THM_MOVT_ABS) offset >>= 16; - *(u16 *)loc = (u16)((upper & 0xfbf0) | - ((offset & 0xf000) >> 12) | - ((offset & 0x0800) >> 1)); - *(u16 *)(loc + 2) = (u16)((lower & 0x8f00) | - ((offset & 0x0700) << 4) | - (offset & 0x00ff)); + upper = (u16)((upper & 0xfbf0) | + ((offset & 0xf000) >> 12) | + ((offset & 0x0800) >> 1)); + lower = (u16)((lower & 0x8f00) | + ((offset & 0x0700) << 4) | + (offset & 0x00ff)); + *(u16 *)loc = __opcode_to_mem_thumb16(upper); + *(u16 *)(loc + 2) = __opcode_to_mem_thumb16(lower); break; #endif -- cgit v0.10.2 From 0ab89d0bf8054c3146ec06df357946bb87f36729 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Mon, 22 Jul 2013 16:32:19 +0100 Subject: ARM: set --be8 when linking modules To avoid having to make every text section swap the instruction order of all instructions, make sure modules are built also built with --be8 (as is the current kernel final link). If we do not do this, we would end up having to swap all instructions when loading a module, instead of just the instructions that we are applying ELF relocations to. Signed-off-by: Ben Dooks Reviewed-by: Dave Martin diff --git a/arch/arm/Makefile b/arch/arm/Makefile index a37a50f..0069697 100644 --- a/arch/arm/Makefile +++ b/arch/arm/Makefile @@ -16,6 +16,7 @@ LDFLAGS := LDFLAGS_vmlinux :=-p --no-undefined -X ifeq ($(CONFIG_CPU_ENDIAN_BE8),y) LDFLAGS_vmlinux += --be8 +LDFLAGS_MODULE += --be8 endif OBJCOPYFLAGS :=-O binary -R .comment -S -- cgit v0.10.2 From bfdef3b32d2f36bf137c039de9a545cdfcfbafe2 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Wed, 24 Jul 2013 16:09:57 +0100 Subject: ARM: hardware: fix endian-ness in The needs to take into account the endian-ness of the processor when reading and writing data, so change to using the readl/writel relaxed variants from the raw ones. Signed-off-by: Ben Dooks diff --git a/arch/arm/include/asm/hardware/coresight.h b/arch/arm/include/asm/hardware/coresight.h index 0cf7a6b..ad774f3 100644 --- a/arch/arm/include/asm/hardware/coresight.h +++ b/arch/arm/include/asm/hardware/coresight.h @@ -24,8 +24,8 @@ #define TRACER_TIMEOUT 10000 #define etm_writel(t, v, x) \ - (__raw_writel((v), (t)->etm_regs + (x))) -#define etm_readl(t, x) (__raw_readl((t)->etm_regs + (x))) + (writel_relaxed((v), (t)->etm_regs + (x))) +#define etm_readl(t, x) (readl_relaxed((t)->etm_regs + (x))) /* CoreSight Management Registers */ #define CSMR_LOCKACCESS 0xfb0 @@ -142,8 +142,8 @@ #define ETBFF_TRIGFL BIT(10) #define etb_writel(t, v, x) \ - (__raw_writel((v), (t)->etb_regs + (x))) -#define etb_readl(t, x) (__raw_readl((t)->etb_regs + (x))) + (writel_relaxed((v), (t)->etb_regs + (x))) +#define etb_readl(t, x) (readl_relaxed((t)->etb_regs + (x))) #define etm_lock(t) do { etm_writel((t), 0, CSMR_LOCKACCESS); } while (0) #define etm_unlock(t) \ -- cgit v0.10.2 From 3460743e025addc1ecbd496db2231181a2431774 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Wed, 24 Jul 2013 15:44:56 +0100 Subject: ARM: net: fix arm instruction endian-ness in bpf_jit_32.c Use to correctly transform instruction byte ordering into in-memory ordering. Signed-off-by: Ben Dooks Reviewed-by: Dave Martin diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index f50d223..510d923 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "bpf_jit_32.h" @@ -113,8 +114,11 @@ static u32 jit_udiv(u32 dividend, u32 divisor) static inline void _emit(int cond, u32 inst, struct jit_ctx *ctx) { + inst |= (cond << 28); + inst = __opcode_to_mem_arm(inst); + if (ctx->target != NULL) - ctx->target[ctx->idx] = inst | (cond << 28); + ctx->target[ctx->idx] = inst; ctx->idx++; } -- cgit v0.10.2 From 63328070eff2f4fd730c86966a0dbc976147c39f Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Thu, 25 Jul 2013 14:38:03 +0100 Subject: ARM: Correct BUG() assembly to ensure it is endian-agnostic Currently BUG() uses .word or .hword to create the necessary illegal instructions. However if we are building BE8 then these get swapped by the linker into different illegal instructions in the text. This means that the BUG() macro does not get trapped properly. Change to using to provide the necessary ARM instruction building as we cannot rely on gcc/gas having the `.inst` instructions which where added to try and resolve this issue (reported by Dave Martin ). Signed-off-by: Ben Dooks Reviewed-by: Dave Martin diff --git a/arch/arm/include/asm/bug.h b/arch/arm/include/asm/bug.h index 7af5c6c..b274bde 100644 --- a/arch/arm/include/asm/bug.h +++ b/arch/arm/include/asm/bug.h @@ -2,6 +2,8 @@ #define _ASMARM_BUG_H #include +#include +#include #ifdef CONFIG_BUG @@ -12,10 +14,10 @@ */ #ifdef CONFIG_THUMB2_KERNEL #define BUG_INSTR_VALUE 0xde02 -#define BUG_INSTR_TYPE ".hword " +#define BUG_INSTR(__value) __inst_thumb16(__value) #else #define BUG_INSTR_VALUE 0xe7f001f2 -#define BUG_INSTR_TYPE ".word " +#define BUG_INSTR(__value) __inst_arm(__value) #endif @@ -33,7 +35,7 @@ #define __BUG(__file, __line, __value) \ do { \ - asm volatile("1:\t" BUG_INSTR_TYPE #__value "\n" \ + asm volatile("1:\t" BUG_INSTR(__value) "\n" \ ".pushsection .rodata.str, \"aMS\", %progbits, 1\n" \ "2:\t.asciz " #__file "\n" \ ".popsection\n" \ @@ -48,7 +50,7 @@ do { \ #define __BUG(__file, __line, __value) \ do { \ - asm volatile(BUG_INSTR_TYPE #__value); \ + asm volatile(BUG_INSTR(__value) "\n"); \ unreachable(); \ } while (0) #endif /* CONFIG_DEBUG_BUGVERBOSE */ diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index caf96da..6125f25 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -342,15 +342,17 @@ void arm_notify_die(const char *str, struct pt_regs *regs, int is_valid_bugaddr(unsigned long pc) { #ifdef CONFIG_THUMB2_KERNEL - unsigned short bkpt; + u16 bkpt; + u16 insn = __opcode_to_mem_thumb16(BUG_INSTR_VALUE); #else - unsigned long bkpt; + u32 bkpt; + u32 insn = __opcode_to_mem_arm(BUG_INSTR_VALUE); #endif if (probe_kernel_address((unsigned *)pc, bkpt)) return 0; - return bkpt == BUG_INSTR_VALUE; + return bkpt == insn; } #endif -- cgit v0.10.2 From 5a8b93fc9457be90adfa10d3df6497393c5e2dc2 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Thu, 25 Jul 2013 15:47:40 +0100 Subject: ARM: kdgb: use for data to be assembled as intruction The arch_kgdb_breakpoint() function uses an inline assembly directive to assemble a specific instruction using .word. This means the linker will not treat is as an instruction, and therefore incorrectly swap the endian-ness if running BE8. As noted, this code means that kgdb is really only usable on arm32 kernels, and should be made dependant on not being a thumb2 kernel until fixed. However this is not something to be added to this patch. Signed-off-by: Ben Dooks Reviewed-by: Dave Martin diff --git a/arch/arm/include/asm/kgdb.h b/arch/arm/include/asm/kgdb.h index 48066ce..0a9d5dd 100644 --- a/arch/arm/include/asm/kgdb.h +++ b/arch/arm/include/asm/kgdb.h @@ -11,6 +11,7 @@ #define __ARM_KGDB_H__ #include +#include /* * GDB assumes that we're a user process being debugged, so @@ -41,7 +42,7 @@ static inline void arch_kgdb_breakpoint(void) { - asm(".word 0xe7ffdeff"); + asm(__inst_arm(0xe7ffdeff)); } extern void kgdb_handle_bus_error(void); -- cgit v0.10.2 From 2245f92498b216b50e744423bde17626287409d8 Mon Sep 17 00:00:00 2001 From: Victor Kamensky Date: Fri, 26 Jul 2013 09:28:53 -0700 Subject: ARM: atomic64: fix endian-ness in atomic.h Fix inline asm for atomic64_xxx functions in arm atomic.h. Instead of %H operand specifiers code should use %Q for least significant part of the value, and %R for the most significant part of the value. %H always returns the higher of the two register numbers, and therefore it is not endian neutral. %H should be used with ldrexd and strexd instructions. Signed-off-by: Victor Kamensky Acked-by: Will Deacon Signed-off-by: Ben Dooks diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h index da1c77d..6447a0b 100644 --- a/arch/arm/include/asm/atomic.h +++ b/arch/arm/include/asm/atomic.h @@ -301,8 +301,8 @@ static inline void atomic64_add(u64 i, atomic64_t *v) __asm__ __volatile__("@ atomic64_add\n" "1: ldrexd %0, %H0, [%3]\n" -" adds %0, %0, %4\n" -" adc %H0, %H0, %H4\n" +" adds %Q0, %Q0, %Q4\n" +" adc %R0, %R0, %R4\n" " strexd %1, %0, %H0, [%3]\n" " teq %1, #0\n" " bne 1b" @@ -320,8 +320,8 @@ static inline u64 atomic64_add_return(u64 i, atomic64_t *v) __asm__ __volatile__("@ atomic64_add_return\n" "1: ldrexd %0, %H0, [%3]\n" -" adds %0, %0, %4\n" -" adc %H0, %H0, %H4\n" +" adds %Q0, %Q0, %Q4\n" +" adc %R0, %R0, %R4\n" " strexd %1, %0, %H0, [%3]\n" " teq %1, #0\n" " bne 1b" @@ -341,8 +341,8 @@ static inline void atomic64_sub(u64 i, atomic64_t *v) __asm__ __volatile__("@ atomic64_sub\n" "1: ldrexd %0, %H0, [%3]\n" -" subs %0, %0, %4\n" -" sbc %H0, %H0, %H4\n" +" subs %Q0, %Q0, %Q4\n" +" sbc %R0, %R0, %R4\n" " strexd %1, %0, %H0, [%3]\n" " teq %1, #0\n" " bne 1b" @@ -360,8 +360,8 @@ static inline u64 atomic64_sub_return(u64 i, atomic64_t *v) __asm__ __volatile__("@ atomic64_sub_return\n" "1: ldrexd %0, %H0, [%3]\n" -" subs %0, %0, %4\n" -" sbc %H0, %H0, %H4\n" +" subs %Q0, %Q0, %Q4\n" +" sbc %R0, %R0, %R4\n" " strexd %1, %0, %H0, [%3]\n" " teq %1, #0\n" " bne 1b" @@ -428,9 +428,9 @@ static inline u64 atomic64_dec_if_positive(atomic64_t *v) __asm__ __volatile__("@ atomic64_dec_if_positive\n" "1: ldrexd %0, %H0, [%3]\n" -" subs %0, %0, #1\n" -" sbc %H0, %H0, #0\n" -" teq %H0, #0\n" +" subs %Q0, %Q0, #1\n" +" sbc %R0, %R0, #0\n" +" teq %R0, #0\n" " bmi 2f\n" " strexd %1, %0, %H0, [%3]\n" " teq %1, #0\n" @@ -459,8 +459,8 @@ static inline int atomic64_add_unless(atomic64_t *v, u64 a, u64 u) " teqeq %H0, %H5\n" " moveq %1, #0\n" " beq 2f\n" -" adds %0, %0, %6\n" -" adc %H0, %H0, %H6\n" +" adds %Q0, %Q0, %Q6\n" +" adc %R0, %R0, %R6\n" " strexd %2, %0, %H0, [%4]\n" " teq %2, #0\n" " bne 1b\n" -- cgit v0.10.2 From 574e2b5111e13827da501771b27d92e6e3f2e3d7 Mon Sep 17 00:00:00 2001 From: Victor Kamensky Date: Tue, 27 Aug 2013 22:41:57 -0700 Subject: ARM: signal: sigreturn_codes should be endian neutral to work in BE8 In case of BE8 kernel data is in BE order whereas code stays in LE order. Move sigreturn_codes to separate .S file and use proper assembler mnemonics for these code snippets. In this case compiler will take care of proper instructions byteswaps for BE8 case. Change assumes that sufficiently Thumb-capable tools are used to build kernel. Problem was discovered during ltp testing of BE system: all rt_sig* tests failed. Tested against the same tests in both BE and LE modes. Signed-off-by: Victor Kamensky Reviewed-by: Dave Martin Signed-off-by: Ben Dooks diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index 5140df5f..39c9834 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile @@ -17,7 +17,8 @@ CFLAGS_REMOVE_return_address.o = -pg obj-y := elf.o entry-common.o irq.o opcodes.o \ process.o ptrace.o return_address.o \ - setup.o signal.o stacktrace.o sys_arm.o time.o traps.o + setup.o signal.o sigreturn_codes.o \ + stacktrace.o sys_arm.o time.o traps.o obj-$(CONFIG_ATAGS) += atags_parse.o obj-$(CONFIG_ATAGS_PROC) += atags_proc.o diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index ab33042..64845fc 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -21,29 +21,7 @@ #include #include -/* - * For ARM syscalls, we encode the syscall number into the instruction. - */ -#define SWI_SYS_SIGRETURN (0xef000000|(__NR_sigreturn)|(__NR_OABI_SYSCALL_BASE)) -#define SWI_SYS_RT_SIGRETURN (0xef000000|(__NR_rt_sigreturn)|(__NR_OABI_SYSCALL_BASE)) - -/* - * With EABI, the syscall number has to be loaded into r7. - */ -#define MOV_R7_NR_SIGRETURN (0xe3a07000 | (__NR_sigreturn - __NR_SYSCALL_BASE)) -#define MOV_R7_NR_RT_SIGRETURN (0xe3a07000 | (__NR_rt_sigreturn - __NR_SYSCALL_BASE)) - -/* - * For Thumb syscalls, we pass the syscall number via r7. We therefore - * need two 16-bit instructions. - */ -#define SWI_THUMB_SIGRETURN (0xdf00 << 16 | 0x2700 | (__NR_sigreturn - __NR_SYSCALL_BASE)) -#define SWI_THUMB_RT_SIGRETURN (0xdf00 << 16 | 0x2700 | (__NR_rt_sigreturn - __NR_SYSCALL_BASE)) - -static const unsigned long sigreturn_codes[7] = { - MOV_R7_NR_SIGRETURN, SWI_SYS_SIGRETURN, SWI_THUMB_SIGRETURN, - MOV_R7_NR_RT_SIGRETURN, SWI_SYS_RT_SIGRETURN, SWI_THUMB_RT_SIGRETURN, -}; +extern const unsigned long sigreturn_codes[7]; static unsigned long signal_return_offset; diff --git a/arch/arm/kernel/sigreturn_codes.S b/arch/arm/kernel/sigreturn_codes.S new file mode 100644 index 0000000..3c5d0f2 --- /dev/null +++ b/arch/arm/kernel/sigreturn_codes.S @@ -0,0 +1,80 @@ +/* + * sigreturn_codes.S - code sinpets for sigreturn syscalls + * + * Created by: Victor Kamensky, 2013-08-13 + * Copyright: (C) 2013 Linaro Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include + +/* + * For ARM syscalls, we encode the syscall number into the instruction. + * With EABI, the syscall number has to be loaded into r7. As result + * ARM syscall sequence snippet will have move and svc in .arm encoding + * + * For Thumb syscalls, we pass the syscall number via r7. We therefore + * need two 16-bit instructions in .thumb encoding + * + * Please note sigreturn_codes code are not executed in place. Instead + * they just copied by kernel into appropriate places. Code inside of + * arch/arm/kernel/signal.c is very sensitive to layout of these code + * snippets. + */ + +#if __LINUX_ARM_ARCH__ <= 4 + /* + * Note we manually set minimally required arch that supports + * required thumb opcodes for early arch versions. It is OK + * for this file to be used in combination with other + * lower arch variants, since these code snippets are only + * used as input data. + */ + .arch armv4t +#endif + + .section .rodata + .global sigreturn_codes + .type sigreturn_codes, #object + + .arm + +sigreturn_codes: + + /* ARM sigreturn syscall code snippet */ + mov r7, #(__NR_sigreturn - __NR_SYSCALL_BASE) + swi #(__NR_sigreturn)|(__NR_OABI_SYSCALL_BASE) + + /* Thumb sigreturn syscall code snippet */ + .thumb + movs r7, #(__NR_sigreturn - __NR_SYSCALL_BASE) + swi #0 + + /* ARM sigreturn_rt syscall code snippet */ + .arm + mov r7, #(__NR_rt_sigreturn - __NR_SYSCALL_BASE) + swi #(__NR_rt_sigreturn)|(__NR_OABI_SYSCALL_BASE) + + /* Thumb sigreturn_rt syscall code snippet */ + .thumb + movs r7, #(__NR_rt_sigreturn - __NR_SYSCALL_BASE) + swi #0 + + /* + * Note on addtional space: setup_return in signal.c + * algorithm uses two words copy regardless whether + * it is thumb case or not, so we need additional + * word after real last entry. + */ + .arm + .space 4 + + .size sigreturn_codes, . - sigreturn_codes -- cgit v0.10.2 From 519ceb9fd10cd7e836d0aa97b2068cc9e97f463b Mon Sep 17 00:00:00 2001 From: Victor Kamensky Date: Mon, 7 Oct 2013 21:37:19 -0700 Subject: ARM: mcpm: fix big endian issue in mcpm startup code In big endian mode mcpm_entry_point is first function that called on secondaries CPU. First it should switch CPU into big endian code. [ben.dooks@codethink.co.uk: merge fix patch from Victor into this] Signed-off-by: Victor Kamensky Acked-by: Nicolas Pitre Reviewed-by: Dave Martin Signed-off-by: Ben Dooks diff --git a/arch/arm/common/mcpm_head.S b/arch/arm/common/mcpm_head.S index 39c96df..4f88f5e 100644 --- a/arch/arm/common/mcpm_head.S +++ b/arch/arm/common/mcpm_head.S @@ -15,6 +15,7 @@ #include #include +#include #include "vlock.h" @@ -47,6 +48,7 @@ ENTRY(mcpm_entry_point) + ARM_BE8(setend be) THUMB( adr r12, BSYM(1f) ) THUMB( bx r12 ) THUMB( .thumb ) -- cgit v0.10.2 From a1af3474487cc3b8731b990dceac6b6aad7f3ed8 Mon Sep 17 00:00:00 2001 From: Victor Kamensky Date: Mon, 7 Oct 2013 08:48:23 -0700 Subject: ARM: tlb: ASID macro should give 32bit result for BE correct operation In order for ASID macro to be used as expression passed to inline asm as 'r' operand it needs to give 32 bit unsigned result, not unsigned 64bit expression. Otherwise when 64bit ASID is passed to inline assembler statement as 'r' operand (32bit) compiler behavior is not well specified. For example when __flush_tlb_mm function compiled in big endian case, and ASID is passed to tlb_op macro directly, 0 will be passed as 'mcr 15, 0, r4, cr8, cr3, {2}' argument in r4, unless ASID macro changed to produce 32 bit result. Signed-off-by: Victor Kamensky Acked-by: Will Deacon Signed-off-by: Ben Dooks diff --git a/arch/arm/include/asm/mmu.h b/arch/arm/include/asm/mmu.h index 6f18da0..64fd151 100644 --- a/arch/arm/include/asm/mmu.h +++ b/arch/arm/include/asm/mmu.h @@ -16,7 +16,7 @@ typedef struct { #ifdef CONFIG_CPU_HAS_ASID #define ASID_BITS 8 #define ASID_MASK ((~0ULL) << ASID_BITS) -#define ASID(mm) ((mm)->context.id.counter & ~ASID_MASK) +#define ASID(mm) ((unsigned int)((mm)->context.id.counter & ~ASID_MASK)) #else #define ASID(mm) (0) #endif -- cgit v0.10.2 From fdb07aee0b2b9d7d1893c97f5ce79ec355caaf1f Mon Sep 17 00:00:00 2001 From: Victor Kamensky Date: Tue, 15 Oct 2013 21:50:34 -0700 Subject: ARM: cci driver need big endian fixes in asm code cci_enable_port_for_self written in asm and it works with h/w registers that are in little endian format. When run in big endian mode it needs byteswaped constants before/after it writes/reads to/from such registers Signed-off-by: Victor Kamensky Acked-by: Nicolas Pitre Signed-off-by: Ben Dooks diff --git a/drivers/bus/arm-cci.c b/drivers/bus/arm-cci.c index 2009266..2e6c275 100644 --- a/drivers/bus/arm-cci.c +++ b/drivers/bus/arm-cci.c @@ -280,7 +280,7 @@ asmlinkage void __naked cci_enable_port_for_self(void) /* Enable the CCI port */ " ldr r0, [r0, %[offsetof_port_phys]] \n" -" mov r3, #"__stringify(CCI_ENABLE_REQ)" \n" +" mov r3, %[cci_enable_req]\n" " str r3, [r0, #"__stringify(CCI_PORT_CTRL)"] \n" /* poll the status reg for completion */ @@ -288,7 +288,7 @@ asmlinkage void __naked cci_enable_port_for_self(void) " ldr r0, [r1] \n" " ldr r0, [r0, r1] @ cci_ctrl_base \n" "4: ldr r1, [r0, #"__stringify(CCI_CTRL_STATUS)"] \n" -" tst r1, #1 \n" +" tst r1, %[cci_control_status_bits] \n" " bne 4b \n" " mov r0, #0 \n" @@ -301,6 +301,8 @@ asmlinkage void __naked cci_enable_port_for_self(void) "7: .word cci_ctrl_phys - . \n" : : [sizeof_cpu_port] "i" (sizeof(cpu_port)), + [cci_enable_req] "i" cpu_to_le32(CCI_ENABLE_REQ), + [cci_control_status_bits] "i" cpu_to_le32(1), #ifndef __ARMEB__ [offsetof_cpu_port_mpidr_lsb] "i" (offsetof(struct cpu_port, mpidr)), #else -- cgit v0.10.2 From 59fd3033b55642da97b4ecde0c85de78d7229675 Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 21 Oct 2013 09:33:49 +0100 Subject: ARM: fix build errors caused by selection of errata 798181 Several configurations are selecting errata 798181 without SMP being selected. This causes a warning from Kconfig: warning: (ARCH_HIGHBANK && ARCH_KEYSTONE && SOC_OMAP5 && ARCH_TEGRA_114_SOC) selects ARM_ERRATA_798181 which has unmet direct dependencies (CPU_V7 && SMP) The dependencies are compile time dependencies; select violates these, resulting in: arch/arm/kernel/built-in.o: In function `setup_processor': psci.c:(.init.text+0x808): undefined reference to `erratum_a15_798181_init' at build time. Fix this by fixing the select statements for Tegra and Highbank. Signed-off-by: Russell King diff --git a/arch/arm/mach-highbank/Kconfig b/arch/arm/mach-highbank/Kconfig index 8e8437d..e2ca238 100644 --- a/arch/arm/mach-highbank/Kconfig +++ b/arch/arm/mach-highbank/Kconfig @@ -8,7 +8,7 @@ config ARCH_HIGHBANK select ARM_AMBA select ARM_ERRATA_764369 select ARM_ERRATA_775420 - select ARM_ERRATA_798181 + select ARM_ERRATA_798181 if SMP select ARM_GIC select ARM_TIMER_SP804 select CACHE_L2X0 diff --git a/arch/arm/mach-tegra/Kconfig b/arch/arm/mach-tegra/Kconfig index 67a76f2..f26428d 100644 --- a/arch/arm/mach-tegra/Kconfig +++ b/arch/arm/mach-tegra/Kconfig @@ -54,7 +54,7 @@ config ARCH_TEGRA_3x_SOC config ARCH_TEGRA_114_SOC bool "Enable support for Tegra114 family" select HAVE_ARM_ARCH_TIMER - select ARM_ERRATA_798181 + select ARM_ERRATA_798181 if SMP select ARM_L1_CACHE_SHIFT_6 select PINCTRL_TEGRA114 help -- cgit v0.10.2 From 0ea1ec713f04bdfac343c9702b21cd3a7c711826 Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 23 Oct 2013 16:14:59 +0100 Subject: ARM: dma-mapping: don't allow DMA mappings to be marked executable DMA mapping permissions were being derived from pgprot_kernel directly without using PAGE_KERNEL. This causes them to be marked with executable permission, which is not what we want. Fix this. Signed-off-by: Russell King diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index f5e1a84..5743850 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -687,7 +687,7 @@ static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs) { - pgprot_t prot = __get_dma_pgprot(attrs, pgprot_kernel); + pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL); void *memory; if (dma_alloc_from_coherent(dev, size, handle, &memory)) @@ -700,7 +700,7 @@ void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, static void *arm_coherent_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs) { - pgprot_t prot = __get_dma_pgprot(attrs, pgprot_kernel); + pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL); void *memory; if (dma_alloc_from_coherent(dev, size, handle, &memory)) -- cgit v0.10.2 From 830fd4d6de1785942e34babe0a8984f72b534b25 Mon Sep 17 00:00:00 2001 From: Sricharan R Date: Tue, 29 Oct 2013 07:29:56 +0100 Subject: ARM: 7870/1: head: Fix the missing underscore in __ARMEB__ macro and .align keyword Commit 'f52bb722547f43caeaecbcc62db9f3c3b80ead9b' Author: Sricharan R ARM: mm: Correct virt_to_phys patching for 64 bit physical addresses introduced a __ARMEB__ macro usage in a new place, but missed the second underscore. So correcting it here. Also a explicit .align keyword is needed for the label with .long data-type to be aligned on the 4 byte boundary. Otherwise this can cause problem for thumb2 build. So adding it here. Signed-off-by: Sricharan R Signed-off-by: Russell King diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S index 5454794..32402ba 100644 --- a/arch/arm/kernel/head.S +++ b/arch/arm/kernel/head.S @@ -536,7 +536,7 @@ ENTRY(fixup_smp) ldmfd sp!, {r4 - r6, pc} ENDPROC(fixup_smp) -#ifdef __ARMEB_ +#ifdef __ARMEB__ #define LOW_OFFSET 0x4 #define HIGH_OFFSET 0x0 #else @@ -632,6 +632,7 @@ __fixup_a_pv_table: #endif ENDPROC(__fixup_a_pv_table) + .align 3: .long __pv_offset ENTRY(fixup_pv_table) -- cgit v0.10.2 From 9a48aa4cafa994161d21e14b454f7191cf180ec3 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 25 Oct 2013 16:23:30 +0100 Subject: ARM: 7865/1: sa1100: fix simpad compilation warning When removing the complex GPIO interface from the SA1100 machines, we also removed the implicit #includes for a few header files that was included by thru , causing a compile warning on the simpad boardfile, as was no longer #included, as follows: ./../arch/arm/include/asm/irq.h:9:0: warning: "NR_IRQS" redefined [enabled by default] #define NR_IRQS NR_IRQS_LEGACY ^ In file included from ../../arch/arm/mach-sa1100/simpad.c:29:0: ../../arch/arm/mach-sa1100/include/mach/irqs.h:87:0: note: this is the location of the previous definition #define NR_IRQS (IRQ_BOARD_START + NR_IRQS_LOCOMO) This resolves the problem by explicitly including into the simpad boardfile. Reported-by: Olof Johansson Cc: Alexandre Courbot Cc: Kristoffer Ericson Cc: Kevin Hilman Signed-off-by: Linus Walleij Signed-off-by: Russell King diff --git a/arch/arm/mach-sa1100/simpad.c b/arch/arm/mach-sa1100/simpad.c index bcbc945..41e476e 100644 --- a/arch/arm/mach-sa1100/simpad.c +++ b/arch/arm/mach-sa1100/simpad.c @@ -19,6 +19,7 @@ #include #include +#include #include #include -- cgit v0.10.2 From 40ca061b1bb8827609d39679ddde6705551cb031 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 25 Sep 2013 13:33:55 +0100 Subject: ARM: 7841/1: sa1100: remove complex GPIO interface The SA1100 was implementing its own variants of gpio_get_value() and gpio_set_value() and only selectively falling back to gpiolib for extended (EGPIO) handling. However the driver in gpio/gpio-sa1100.c already handles the same functionality for these lines, yet remain unused. The only upside would be things like a timing-critical hotpath on bit-banged GPIO, but that kind of things does not seem to happen on these GPIOs, so it is not worth having the extra complexity. Tested with some buttons on the Compaq iPAQ H3630. Cc: Kristoffer Ericson Signed-off-by: Linus Walleij Signed-off-by: Russell King diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 3f7714d..74bf29f 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -692,7 +692,6 @@ config ARCH_SA1100 select GENERIC_CLOCKEVENTS select HAVE_IDE select ISA - select NEED_MACH_GPIO_H select NEED_MACH_MEMORY_H select SPARSE_IRQ help diff --git a/arch/arm/mach-sa1100/include/mach/gpio.h b/arch/arm/mach-sa1100/include/mach/gpio.h deleted file mode 100644 index 6a9eecf..0000000 --- a/arch/arm/mach-sa1100/include/mach/gpio.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * arch/arm/mach-sa1100/include/mach/gpio.h - * - * SA1100 GPIO wrappers for arch-neutral GPIO calls - * - * Written by Philipp Zabel - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - */ - -#ifndef __ASM_ARCH_SA1100_GPIO_H -#define __ASM_ARCH_SA1100_GPIO_H - -#include -#include -#include -#include - -#define __ARM_GPIOLIB_COMPLEX - -static inline int gpio_get_value(unsigned gpio) -{ - if (__builtin_constant_p(gpio) && (gpio <= GPIO_MAX)) - return GPLR & GPIO_GPIO(gpio); - else - return __gpio_get_value(gpio); -} - -static inline void gpio_set_value(unsigned gpio, int value) -{ - if (__builtin_constant_p(gpio) && (gpio <= GPIO_MAX)) - if (value) - GPSR = GPIO_GPIO(gpio); - else - GPCR = GPIO_GPIO(gpio); - else - __gpio_set_value(gpio, value); -} - -#define gpio_cansleep __gpio_cansleep - -#endif diff --git a/arch/arm/mach-sa1100/include/mach/h3xxx.h b/arch/arm/mach-sa1100/include/mach/h3xxx.h index 7d9df16..c810620 100644 --- a/arch/arm/mach-sa1100/include/mach/h3xxx.h +++ b/arch/arm/mach-sa1100/include/mach/h3xxx.h @@ -13,6 +13,8 @@ #ifndef _INCLUDE_H3XXX_H_ #define _INCLUDE_H3XXX_H_ +#include "hardware.h" /* Gives GPIO_MAX */ + /* Physical memory regions corresponding to chip selects */ #define H3600_EGPIO_PHYS (SA1100_CS5_PHYS + 0x01000000) #define H3600_BANK_2_PHYS SA1100_CS2_PHYS diff --git a/drivers/gpio/gpio-sa1100.c b/drivers/gpio/gpio-sa1100.c index 8ea3b33..a90be34 100644 --- a/drivers/gpio/gpio-sa1100.c +++ b/drivers/gpio/gpio-sa1100.c @@ -10,7 +10,7 @@ #include #include #include - +#include #include #include -- cgit v0.10.2 From e812a3c15f0299ba70f0f7b5710db2a1a1c6643b Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 4 Oct 2013 19:02:05 +0100 Subject: ARM: 7849/2: h3600: update defconfig This updates the h3600 defconfig against the latest kernel with some small options coming and going due to Kconfig structure, then modernize it to: - Configure for low latency preemptive kernel - Configure for tickless idle - Enable HRtimers Tested on the iPAQ h3630. Signed-off-by: Linus Walleij Signed-off-by: Russell King diff --git a/arch/arm/configs/h3600_defconfig b/arch/arm/configs/h3600_defconfig index 317960f..0142ec3 100644 --- a/arch/arm/configs/h3600_defconfig +++ b/arch/arm/configs/h3600_defconfig @@ -1,5 +1,6 @@ -CONFIG_EXPERIMENTAL=y CONFIG_SYSVIPC=y +CONFIG_NO_HZ_IDLE=y +CONFIG_HIGH_RES_TIMERS=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_BLK_DEV_INITRD=y CONFIG_MODULES=y @@ -11,11 +12,11 @@ CONFIG_ARCH_SA1100=y CONFIG_SA1100_H3600=y CONFIG_PCCARD=y CONFIG_PCMCIA_SA1100=y +CONFIG_PREEMPT=y CONFIG_ZBOOT_ROM_TEXT=0x0 CONFIG_ZBOOT_ROM_BSS=0x0 # CONFIG_CPU_FREQ_STAT is not set CONFIG_FPE_NWFPE=y -CONFIG_PM=y CONFIG_NET=y CONFIG_UNIX=y CONFIG_INET=y @@ -24,13 +25,10 @@ CONFIG_IRDA=m CONFIG_IRLAN=m CONFIG_IRNET=m CONFIG_IRCOMM=m -CONFIG_SA1100_FIR=m # CONFIG_WIRELESS is not set CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_MTD=y -CONFIG_MTD_PARTITIONS=y CONFIG_MTD_REDBOOT_PARTS=y -CONFIG_MTD_CHAR=y CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_ADV_OPTIONS=y @@ -41,19 +39,15 @@ CONFIG_MTD_SA1100=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 -# CONFIG_MISC_DEVICES is not set CONFIG_IDE=y CONFIG_BLK_DEV_IDECS=y CONFIG_NETDEVICES=y -# CONFIG_NETDEV_1000 is not set -# CONFIG_NETDEV_10000 is not set -# CONFIG_WLAN is not set -CONFIG_NET_PCMCIA=y CONFIG_PCMCIA_PCNET=y CONFIG_PPP=m -CONFIG_PPP_ASYNC=m -CONFIG_PPP_DEFLATE=m CONFIG_PPP_BSDCOMP=m +CONFIG_PPP_DEFLATE=m +CONFIG_PPP_ASYNC=m +# CONFIG_WLAN is not set # CONFIG_KEYBOARD_ATKBD is not set CONFIG_KEYBOARD_GPIO=y # CONFIG_INPUT_MOUSE is not set @@ -64,8 +58,6 @@ CONFIG_SERIAL_SA1100_CONSOLE=y # CONFIG_HWMON is not set CONFIG_FB=y CONFIG_FB_SA1100=y -# CONFIG_VGA_CONSOLE is not set -# CONFIG_HID_SUPPORT is not set # CONFIG_USB_SUPPORT is not set CONFIG_EXT2_FS=y CONFIG_MSDOS_FS=m @@ -74,6 +66,4 @@ CONFIG_JFFS2_FS=y CONFIG_CRAMFS=m CONFIG_NFS_FS=y CONFIG_NFSD=m -CONFIG_SMB_FS=m CONFIG_NLS=y -# CONFIG_RCU_CPU_STALL_DETECTOR is not set -- cgit v0.10.2 From f3964fe1c9d9a887d65faf594669852e4dec46e0 Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 16 Oct 2013 00:09:02 +0100 Subject: ARM: sa11x0/assabet: ensure CS2 is configured appropriately The CS2 region contains the Assabet board configuration and status registers, which are 32-bit. Unfortunately, some boot loaders do not configure this region correctly, leaving it setup as a 16-bit region. Fix this. Cc: Signed-off-by: Russell King diff --git a/arch/arm/mach-sa1100/assabet.c b/arch/arm/mach-sa1100/assabet.c index e838ba2..c9808c6 100644 --- a/arch/arm/mach-sa1100/assabet.c +++ b/arch/arm/mach-sa1100/assabet.c @@ -512,6 +512,9 @@ static void __init assabet_map_io(void) * Its called GPCLKR0 in my SA1110 manual. */ Ser1SDCR0 |= SDCR0_SUS; + MSC1 = (MSC1 & ~0xffff) | + MSC_NonBrst | MSC_32BitStMem | + MSC_RdAcc(2) | MSC_WrAcc(2) | MSC_Rec(0); if (!machine_has_neponset()) sa1100_register_uart_fns(&assabet_port_fns); -- cgit v0.10.2 From 3159f372354e8e1f5dee714663d705dd2c7e0759 Mon Sep 17 00:00:00 2001 From: Sergey Dyasly Date: Tue, 24 Sep 2013 16:38:00 +0100 Subject: ARM: 7840/1: LPAE: don't reject mapping /dev/mem above 4GB With LPAE enabled, physical address space is larger than 4GB. Allow mapping any part of it via /dev/mem by using PHYS_MASK to determine valid range. PHYS_MASK covers 40 bits with LPAE enabled and 32 bits otherwise. Reported-by: Vassili Karpov Signed-off-by: Sergey Dyasly Acked-by: Catalin Marinas Signed-off-by: Russell King diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c index 0c63562..d27158c 100644 --- a/arch/arm/mm/mmap.c +++ b/arch/arm/mm/mmap.c @@ -202,13 +202,11 @@ int valid_phys_addr_range(phys_addr_t addr, size_t size) } /* - * We don't use supersection mappings for mmap() on /dev/mem, which - * means that we can't map the memory area above the 4G barrier into - * userspace. + * Do not allow /dev/mem mappings beyond the supported physical range. */ int valid_mmap_phys_addr_range(unsigned long pfn, size_t size) { - return !(pfn + (size >> PAGE_SHIFT) > 0x00100000); + return (pfn + (size >> PAGE_SHIFT)) <= (1 + (PHYS_MASK >> PAGE_SHIFT)); } #ifdef CONFIG_STRICT_DEVMEM -- cgit v0.10.2 From 494e492dd88d36cd201eae99873492450d1e9b4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 8 Oct 2013 16:44:17 +0100 Subject: ARM: 7850/1: DEBUG_LL on efm32 SoCs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This implements output of debug messages on efm32 SoCs. Signed-off-by: Uwe Kleine-König Signed-off-by: Russell King diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug index 9762c84..2b32068 100644 --- a/arch/arm/Kconfig.debug +++ b/arch/arm/Kconfig.debug @@ -834,6 +834,20 @@ choice options; the platform specific options are deprecated and will be soon removed. + config DEBUG_LL_UART_EFM32 + bool "Kernel low-level debugging via efm32 UART" + depends on ARCH_EFM32 + help + Say Y here if you want the debug print routines to direct + their output to an UART or USART port on efm32 based + machines. Use the following addresses for DEBUG_UART_PHYS: + + 0x4000c000 | USART0 + 0x4000c400 | USART1 + 0x4000c800 | USART2 + 0x4000e000 | UART0 + 0x4000e400 | UART1 + config DEBUG_LL_UART_PL01X bool "Kernel low-level debugging via ARM Ltd PL01x Primecell UART" help @@ -885,6 +899,7 @@ config DEBUG_LL_INCLUDE default "debug/8250.S" if DEBUG_LL_UART_8250 || DEBUG_UART_8250 default "debug/pl01x.S" if DEBUG_LL_UART_PL01X || DEBUG_UART_PL01X default "debug/exynos.S" if DEBUG_EXYNOS_UART + default "debug/efm32.S" if DEBUG_LL_UART_EFM32 default "debug/icedcc.S" if DEBUG_ICEDCC default "debug/imx.S" if DEBUG_IMX1_UART || \ DEBUG_IMX25_UART || \ @@ -951,6 +966,7 @@ config DEBUG_UART_PHYS default 0x20064000 if DEBUG_RK29_UART1 || DEBUG_RK3X_UART2 default 0x20068000 if DEBUG_RK29_UART2 || DEBUG_RK3X_UART3 default 0x20201000 if DEBUG_BCM2835 + default 0x4000e400 if DEBUG_LL_UART_EFM32 default 0x40090000 if ARCH_LPC32XX default 0x40100000 if DEBUG_PXA_UART1 default 0x42000000 if ARCH_GEMINI @@ -981,6 +997,7 @@ config DEBUG_UART_PHYS default 0xfff36000 if DEBUG_HIGHBANK_UART default 0xfffff700 if ARCH_IOP33X depends on DEBUG_LL_UART_8250 || DEBUG_LL_UART_PL01X || \ + DEBUG_LL_UART_EFM32 || \ DEBUG_UART_8250 || DEBUG_UART_PL01X config DEBUG_UART_VIRT diff --git a/arch/arm/include/debug/efm32.S b/arch/arm/include/debug/efm32.S new file mode 100644 index 0000000..2265a19 --- /dev/null +++ b/arch/arm/include/debug/efm32.S @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2013 Pengutronix + * Uwe Kleine-Koenig + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#define UARTn_CMD 0x000c +#define UARTn_CMD_TXEN 0x0004 + +#define UARTn_STATUS 0x0010 +#define UARTn_STATUS_TXC 0x0020 +#define UARTn_STATUS_TXBL 0x0040 + +#define UARTn_TXDATA 0x0034 + + .macro addruart, rx, tmp + ldr \rx, =(CONFIG_DEBUG_UART_PHYS) + + /* + * enable TX. The driver might disable it to save energy. We + * don't care about disabling at the end as during debug power + * consumption isn't that important. + */ + ldr \tmp, =(UARTn_CMD_TXEN) + str \tmp, [\rx, #UARTn_CMD] + .endm + + .macro senduart,rd,rx + strb \rd, [\rx, #UARTn_TXDATA] + .endm + + .macro waituart,rd,rx +1001: ldr \rd, [\rx, #UARTn_STATUS] + tst \rd, #UARTn_STATUS_TXBL + beq 1001b + .endm + + .macro busyuart,rd,rx +1001: ldr \rd, [\rx, UARTn_STATUS] + tst \rd, #UARTn_STATUS_TXC + bne 1001b + .endm -- cgit v0.10.2 From 2523c67bb6962f98193dce1c73b6efb65a6ea92c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 9 Oct 2013 17:01:21 +0100 Subject: ARM: 7852/1: cmpxchg: implement barrier-less cmpxchg64_local Our cmpxchg64 macros are wrappers around atomic64_cmpxchg. Whilst this is great for code re-use, there is a case for barrier-less cmpxchg where it is known to be safe (for example cmpxchg64_local and cmpxchg-based lockrefs). This patch introduces a 64-bit cmpxchg implementation specifically for the cmpxchg64_* macros, so that it can be later used by the lockref code. Signed-off-by: Will Deacon Signed-off-by: Russell King diff --git a/arch/arm/include/asm/cmpxchg.h b/arch/arm/include/asm/cmpxchg.h index 4f009c1..fbd978f 100644 --- a/arch/arm/include/asm/cmpxchg.h +++ b/arch/arm/include/asm/cmpxchg.h @@ -223,6 +223,42 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr, return ret; } +static inline unsigned long long __cmpxchg64(unsigned long long *ptr, + unsigned long long old, + unsigned long long new) +{ + unsigned long long oldval; + unsigned long res; + + __asm__ __volatile__( +"1: ldrexd %1, %H1, [%3]\n" +" teq %1, %4\n" +" teqeq %H1, %H4\n" +" bne 2f\n" +" strexd %0, %5, %H5, [%3]\n" +" teq %0, #0\n" +" bne 1b\n" +"2:" + : "=&r" (res), "=&r" (oldval), "+Qo" (*ptr) + : "r" (ptr), "r" (old), "r" (new) + : "cc"); + + return oldval; +} + +static inline unsigned long long __cmpxchg64_mb(unsigned long long *ptr, + unsigned long long old, + unsigned long long new) +{ + unsigned long long ret; + + smp_mb(); + ret = __cmpxchg64(ptr, old, new); + smp_mb(); + + return ret; +} + #define cmpxchg_local(ptr,o,n) \ ((__typeof__(*(ptr)))__cmpxchg_local((ptr), \ (unsigned long)(o), \ @@ -230,18 +266,14 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr, sizeof(*(ptr)))) #define cmpxchg64(ptr, o, n) \ - ((__typeof__(*(ptr)))atomic64_cmpxchg(container_of((ptr), \ - atomic64_t, \ - counter), \ - (unsigned long long)(o), \ - (unsigned long long)(n))) + ((__typeof__(*(ptr)))__cmpxchg64_mb((ptr), \ + (unsigned long long)(o), \ + (unsigned long long)(n))) #define cmpxchg64_local(ptr, o, n) \ - ((__typeof__(*(ptr)))local64_cmpxchg(container_of((ptr), \ - local64_t, \ - a), \ - (unsigned long long)(o), \ - (unsigned long long)(n))) + ((__typeof__(*(ptr)))__cmpxchg64((ptr), \ + (unsigned long long)(o), \ + (unsigned long long)(n))) #endif /* __LINUX_ARM_ARCH__ >= 6 */ -- cgit v0.10.2 From 775ebcc16b940ebf61bf54d6054a5e639f68b9d6 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 9 Oct 2013 17:17:18 +0100 Subject: ARM: 7853/1: cmpxchg: implement cmpxchg64_relaxed This patch introduces cmpxchg64_relaxed for arm, which performs a 64-bit cmpxchg operation without barrier semantics. cmpxchg64_local is updated to use the new operation. Signed-off-by: Will Deacon Signed-off-by: Russell King diff --git a/arch/arm/include/asm/cmpxchg.h b/arch/arm/include/asm/cmpxchg.h index fbd978f..df2fbba 100644 --- a/arch/arm/include/asm/cmpxchg.h +++ b/arch/arm/include/asm/cmpxchg.h @@ -270,11 +270,13 @@ static inline unsigned long long __cmpxchg64_mb(unsigned long long *ptr, (unsigned long long)(o), \ (unsigned long long)(n))) -#define cmpxchg64_local(ptr, o, n) \ +#define cmpxchg64_relaxed(ptr, o, n) \ ((__typeof__(*(ptr)))__cmpxchg64((ptr), \ (unsigned long long)(o), \ (unsigned long long)(n))) +#define cmpxchg64_local(ptr, o, n) cmpxchg64_relaxed((ptr), (o), (n)) + #endif /* __LINUX_ARM_ARCH__ >= 6 */ #endif /* __ASM_ARM_CMPXCHG_H */ -- cgit v0.10.2 From 0cbad9c9dfe0c38e8ec7385b39087c005a6dee3e Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 9 Oct 2013 17:19:22 +0100 Subject: ARM: 7854/1: lockref: add support for lockless lockrefs using cmpxchg64 Our spinlocks are only 32-bit (2x16-bit tickets) and, on processors with 64-bit atomic instructions, cmpxchg64 makes use of the double-word exclusive accessors. This patch wires up the cmpxchg-based lockless lockref implementation for ARM. Signed-off-by: Will Deacon Signed-off-by: Russell King diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 1ad6fb6..fc184bc 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -5,6 +5,7 @@ config ARM select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAVE_CUSTOM_GPIO_H + select ARCH_USE_CMPXCHG_LOCKREF select ARCH_WANT_IPC_PARSE_VERSION select BUILDTIME_EXTABLE_SORT if MMU select CLONE_BACKWARDS diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h index 4f2c280..ed6c229 100644 --- a/arch/arm/include/asm/spinlock.h +++ b/arch/arm/include/asm/spinlock.h @@ -127,10 +127,14 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) dsb_sev(); } +static inline int arch_spin_value_unlocked(arch_spinlock_t lock) +{ + return lock.tickets.owner == lock.tickets.next; +} + static inline int arch_spin_is_locked(arch_spinlock_t *lock) { - struct __raw_tickets tickets = ACCESS_ONCE(lock->tickets); - return tickets.owner != tickets.next; + return !arch_spin_value_unlocked(ACCESS_ONCE(*lock)); } static inline int arch_spin_is_contended(arch_spinlock_t *lock) -- cgit v0.10.2 From 92871b94a5f9892e324c31960678387922c75049 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 9 Oct 2013 17:26:44 +0100 Subject: ARM: 7855/1: Add check for Cortex-A15 errata 798181 ECO The work-around for A15 errata 798181 is not needed if appropriate ECO fixes have been applied to r3p2 and earlier core revisions. This can be checked by reading REVIDR register bits 4 and 9. If only bit 4 is set, then the IPI broadcast can be skipped. Signed-off-by: Rob Herring Reviewed-by: Will Deacon Signed-off-by: Russell King diff --git a/arch/arm/include/asm/cputype.h b/arch/arm/include/asm/cputype.h index 9672e97..acdde76 100644 --- a/arch/arm/include/asm/cputype.h +++ b/arch/arm/include/asm/cputype.h @@ -10,6 +10,7 @@ #define CPUID_TLBTYPE 3 #define CPUID_MPUIR 4 #define CPUID_MPIDR 5 +#define CPUID_REVIDR 6 #ifdef CONFIG_CPU_V7M #define CPUID_EXT_PFR0 0x40 diff --git a/arch/arm/include/asm/tlbflush.h b/arch/arm/include/asm/tlbflush.h index 3896026..def9e57 100644 --- a/arch/arm/include/asm/tlbflush.h +++ b/arch/arm/include/asm/tlbflush.h @@ -560,37 +560,6 @@ static inline void __flush_bp_all(void) asm("mcr p15, 0, %0, c7, c1, 6" : : "r" (zero)); } -#include -#ifdef CONFIG_ARM_ERRATA_798181 -static inline int erratum_a15_798181(void) -{ - unsigned int midr = read_cpuid_id(); - - /* Cortex-A15 r0p0..r3p2 affected */ - if ((midr & 0xff0ffff0) != 0x410fc0f0 || midr > 0x413fc0f2) - return 0; - return 1; -} - -static inline void dummy_flush_tlb_a15_erratum(void) -{ - /* - * Dummy TLBIMVAIS. Using the unmapped address 0 and ASID 0. - */ - asm("mcr p15, 0, %0, c8, c3, 1" : : "r" (0)); - dsb(ish); -} -#else -static inline int erratum_a15_798181(void) -{ - return 0; -} - -static inline void dummy_flush_tlb_a15_erratum(void) -{ -} -#endif - /* * flush_pmd_entry * @@ -697,4 +666,21 @@ extern void flush_bp_all(void); #endif +#ifndef __ASSEMBLY__ +#ifdef CONFIG_ARM_ERRATA_798181 +extern void erratum_a15_798181_init(void); +#else +static inline void erratum_a15_798181_init(void) {} +#endif +extern bool (*erratum_a15_798181_handler)(void); + +static inline bool erratum_a15_798181(void) +{ + if (unlikely(IS_ENABLED(CONFIG_ARM_ERRATA_798181) && + erratum_a15_798181_handler)) + return erratum_a15_798181_handler(); + return false; +} +#endif + #endif diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 0e1e2b3..a4852de 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -599,6 +599,8 @@ static void __init setup_processor(void) elf_hwcap &= ~(HWCAP_THUMB | HWCAP_IDIVT); #endif + erratum_a15_798181_init(); + feat_v6_fixup(); cacheid_init(); diff --git a/arch/arm/kernel/smp_tlb.c b/arch/arm/kernel/smp_tlb.c index 83ccca3..95d0636 100644 --- a/arch/arm/kernel/smp_tlb.c +++ b/arch/arm/kernel/smp_tlb.c @@ -70,6 +70,40 @@ static inline void ipi_flush_bp_all(void *ignored) local_flush_bp_all(); } +#ifdef CONFIG_ARM_ERRATA_798181 +bool (*erratum_a15_798181_handler)(void); + +static bool erratum_a15_798181_partial(void) +{ + asm("mcr p15, 0, %0, c8, c3, 1" : : "r" (0)); + dsb(ish); + return false; +} + +static bool erratum_a15_798181_broadcast(void) +{ + asm("mcr p15, 0, %0, c8, c3, 1" : : "r" (0)); + dsb(ish); + return true; +} + +void erratum_a15_798181_init(void) +{ + unsigned int midr = read_cpuid_id(); + unsigned int revidr = read_cpuid(CPUID_REVIDR); + + /* Cortex-A15 r0p0..r3p2 w/o ECO fix affected */ + if ((midr & 0xff0ffff0) != 0x410fc0f0 || midr > 0x413fc0f2 || + (revidr & 0x210) == 0x210) { + return; + } + if (revidr & 0x10) + erratum_a15_798181_handler = erratum_a15_798181_partial; + else + erratum_a15_798181_handler = erratum_a15_798181_broadcast; +} +#endif + static void ipi_flush_tlb_a15_erratum(void *arg) { dmb(); @@ -80,7 +114,6 @@ static void broadcast_tlb_a15_erratum(void) if (!erratum_a15_798181()) return; - dummy_flush_tlb_a15_erratum(); smp_call_function(ipi_flush_tlb_a15_erratum, NULL, 1); } @@ -92,7 +125,6 @@ static void broadcast_tlb_mm_a15_erratum(struct mm_struct *mm) if (!erratum_a15_798181()) return; - dummy_flush_tlb_a15_erratum(); this_cpu = get_cpu(); a15_erratum_get_cpumask(this_cpu, mm, &mask); smp_call_function_many(&mask, ipi_flush_tlb_a15_erratum, NULL, 1); -- cgit v0.10.2 From a3a9ea656d19251326cdeaaa0b5adbfac41ddacf Mon Sep 17 00:00:00 2001 From: Steven Capper Date: Mon, 14 Oct 2013 09:49:10 +0100 Subject: ARM: 7858/1: mm: make UACCESS_WITH_MEMCPY huge page aware The memory pinning code in uaccess_with_memcpy.c does not check for HugeTLB or THP pmds, and will enter an infinite loop should a __copy_to_user or __clear_user occur against a huge page. This patch adds detection code for huge pages to pin_page_for_write. As this code can be executed in a fast path it refers to the actual pmds rather than the vma. If a HugeTLB or THP is found (they have the same pmd representation on ARM), the page table spinlock is taken to prevent modification whilst the page is pinned. On ARM, huge pages are only represented as pmds, thus no huge pud checks are performed. (For huge puds one would lock the page table in a similar manner as in the pmd case). Two helper functions are introduced; pmd_thp_or_huge will check whether or not a page is huge or transparent huge (which have the same pmd layout on ARM), and pmd_hugewillfault will detect whether or not a page fault will occur on write to the page. Running the following test (with the chunking from read_zero removed): $ dd if=/dev/zero of=/dev/null bs=10M count=1024 Gave: 2.3 GB/s backed by normal pages, 2.9 GB/s backed by huge pages, 5.1 GB/s backed by huge pages, with page mask=HPAGE_MASK. After some discussion, it was decided not to adopt the HPAGE_MASK, as this would have a significant detrimental effect on the overall system latency due to page_table_lock being held for too long. This could be revisited if split huge page locks are adopted. Signed-off-by: Steve Capper Reviewed-by: Nicolas Pitre Signed-off-by: Russell King diff --git a/arch/arm/include/asm/pgtable-2level.h b/arch/arm/include/asm/pgtable-2level.h index f97ee02..86a659a 100644 --- a/arch/arm/include/asm/pgtable-2level.h +++ b/arch/arm/include/asm/pgtable-2level.h @@ -181,6 +181,13 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr) #define set_pte_ext(ptep,pte,ext) cpu_set_pte_ext(ptep,pte,ext) +/* + * We don't have huge page support for short descriptors, for the moment + * define empty stubs for use by pin_page_for_write. + */ +#define pmd_hugewillfault(pmd) (0) +#define pmd_thp_or_huge(pmd) (0) + #endif /* __ASSEMBLY__ */ #endif /* _ASM_PGTABLE_2LEVEL_H */ diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 5689c18..39c54cf 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -206,6 +206,9 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr) #define __HAVE_ARCH_PMD_WRITE #define pmd_write(pmd) (!(pmd_val(pmd) & PMD_SECT_RDONLY)) +#define pmd_hugewillfault(pmd) (!pmd_young(pmd) || !pmd_write(pmd)) +#define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd)) + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define pmd_trans_huge(pmd) (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT)) #define pmd_trans_splitting(pmd) (pmd_val(pmd) & PMD_SECT_SPLITTING) diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c index 025f742..3e58d71 100644 --- a/arch/arm/lib/uaccess_with_memcpy.c +++ b/arch/arm/lib/uaccess_with_memcpy.c @@ -18,6 +18,7 @@ #include /* for in_atomic() */ #include #include +#include #include #include @@ -40,7 +41,35 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp) return 0; pmd = pmd_offset(pud, addr); - if (unlikely(pmd_none(*pmd) || pmd_bad(*pmd))) + if (unlikely(pmd_none(*pmd))) + return 0; + + /* + * A pmd can be bad if it refers to a HugeTLB or THP page. + * + * Both THP and HugeTLB pages have the same pmd layout + * and should not be manipulated by the pte functions. + * + * Lock the page table for the destination and check + * to see that it's still huge and whether or not we will + * need to fault on write, or if we have a splitting THP. + */ + if (unlikely(pmd_thp_or_huge(*pmd))) { + ptl = ¤t->mm->page_table_lock; + spin_lock(ptl); + if (unlikely(!pmd_thp_or_huge(*pmd) + || pmd_hugewillfault(*pmd) + || pmd_trans_splitting(*pmd))) { + spin_unlock(ptl); + return 0; + } + + *ptep = NULL; + *ptlp = ptl; + return 1; + } + + if (unlikely(pmd_bad(*pmd))) return 0; pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl); @@ -94,7 +123,10 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n) from += tocopy; n -= tocopy; - pte_unmap_unlock(pte, ptl); + if (pte) + pte_unmap_unlock(pte, ptl); + else + spin_unlock(ptl); } if (!atomic) up_read(¤t->mm->mmap_sem); @@ -147,7 +179,10 @@ __clear_user_memset(void __user *addr, unsigned long n) addr += tocopy; n -= tocopy; - pte_unmap_unlock(pte, ptl); + if (pte) + pte_unmap_unlock(pte, ptl); + else + spin_unlock(ptl); } up_read(¤t->mm->mmap_sem); -- cgit v0.10.2 From 728fae6f6f73b34cf94afe6c87ce102889abc786 Mon Sep 17 00:00:00 2001 From: Michael Opdenacker Date: Mon, 14 Oct 2013 04:42:20 +0100 Subject: ARM: 7856/1: timer-sp: remove deprecated IRQF_DISABLED This patch proposes to remove the use of the IRQF_DISABLED flag It's a NOOP since 2.6.35 and it will be removed one day. Signed-off-by: Michael Opdenacker Signed-off-by: Russell King diff --git a/arch/arm/common/timer-sp.c b/arch/arm/common/timer-sp.c index e901d0f..ce922d0 100644 --- a/arch/arm/common/timer-sp.c +++ b/arch/arm/common/timer-sp.c @@ -175,7 +175,7 @@ static struct clock_event_device sp804_clockevent = { static struct irqaction sp804_timer_irq = { .name = "timer", - .flags = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL, + .flags = IRQF_TIMER | IRQF_IRQPOLL, .handler = sp804_timer_interrupt, .dev_id = &sp804_clockevent, }; -- cgit v0.10.2 From c527c3b939cdb2e75777fe690e067b6e199c44fc Mon Sep 17 00:00:00 2001 From: Rohit Vaswani Date: Thu, 17 Oct 2013 22:07:27 +0100 Subject: ARM: 7859/1: debug: Create CONFIG_DEBUG_MSM_UART and re-organize the selects for MSM Create the hidden config DEBUG_MSM_UART and clean-up the default selection for CONFIG_DEBUG_LL_INCLUDE. Acked-by: David Brown Signed-off-by: Rohit Vaswani Signed-off-by: Russell King diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug index 2b32068..8e9665d 100644 --- a/arch/arm/Kconfig.debug +++ b/arch/arm/Kconfig.debug @@ -318,6 +318,7 @@ choice config DEBUG_MSM_UART1 bool "Kernel low-level debugging messages via MSM UART1" depends on ARCH_MSM7X00A || ARCH_MSM7X30 || ARCH_QSD8X50 + select DEBUG_MSM_UART help Say Y here if you want the debug print routines to direct their output to the first serial port on MSM devices. @@ -325,6 +326,7 @@ choice config DEBUG_MSM_UART2 bool "Kernel low-level debugging messages via MSM UART2" depends on ARCH_MSM7X00A || ARCH_MSM7X30 || ARCH_QSD8X50 + select DEBUG_MSM_UART help Say Y here if you want the debug print routines to direct their output to the second serial port on MSM devices. @@ -332,6 +334,7 @@ choice config DEBUG_MSM_UART3 bool "Kernel low-level debugging messages via MSM UART3" depends on ARCH_MSM7X00A || ARCH_MSM7X30 || ARCH_QSD8X50 + select DEBUG_MSM_UART help Say Y here if you want the debug print routines to direct their output to the third serial port on MSM devices. @@ -340,6 +343,7 @@ choice bool "Kernel low-level debugging messages via MSM 8660 UART" depends on ARCH_MSM8X60 select MSM_HAS_DEBUG_UART_HS + select DEBUG_MSM_UART help Say Y here if you want the debug print routines to direct their output to the serial port on MSM 8660 devices. @@ -348,6 +352,7 @@ choice bool "Kernel low-level debugging messages via MSM 8960 UART" depends on ARCH_MSM8960 select MSM_HAS_DEBUG_UART_HS + select DEBUG_MSM_UART help Say Y here if you want the debug print routines to direct their output to the serial port on MSM 8960 devices. @@ -894,6 +899,10 @@ config DEBUG_STI_UART bool depends on ARCH_STI +config DEBUG_MSM_UART + bool + depends on ARCH_MSM + config DEBUG_LL_INCLUDE string default "debug/8250.S" if DEBUG_LL_UART_8250 || DEBUG_UART_8250 @@ -910,11 +919,7 @@ config DEBUG_LL_INCLUDE DEBUG_IMX53_UART ||\ DEBUG_IMX6Q_UART || \ DEBUG_IMX6SL_UART - default "debug/msm.S" if DEBUG_MSM_UART1 || \ - DEBUG_MSM_UART2 || \ - DEBUG_MSM_UART3 || \ - DEBUG_MSM8660_UART || \ - DEBUG_MSM8960_UART + default "debug/msm.S" if DEBUG_MSM_UART default "debug/omap2plus.S" if DEBUG_OMAP2PLUS_UART default "debug/sirf.S" if DEBUG_SIRFPRIMA2_UART1 || DEBUG_SIRFMARCO_UART1 default "debug/sti.S" if DEBUG_STI_UART -- cgit v0.10.2 From 3c8828f6a0cb3bf1bae04a98135da3c53e20c217 Mon Sep 17 00:00:00 2001 From: Rohit Vaswani Date: Thu, 17 Oct 2013 22:15:35 +0100 Subject: ARM: 7860/1: debug: msm: Add DEBUG_LL support for ARCH_MSM8974 Add debug uart support for MSM8974. This patch adds a Kconfig entry and the base address for the debug uart. Signed-off-by: Rohit Vaswani Signed-off-by: Russell King diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug index 8e9665d..a8f305b 100644 --- a/arch/arm/Kconfig.debug +++ b/arch/arm/Kconfig.debug @@ -357,6 +357,15 @@ choice Say Y here if you want the debug print routines to direct their output to the serial port on MSM 8960 devices. + config DEBUG_MSM8974_UART + bool "Kernel low-level debugging messages via MSM 8974 UART" + depends on ARCH_MSM8974 + select MSM_HAS_DEBUG_UART_HS + select DEBUG_MSM_UART + help + Say Y here if you want the debug print routines to direct + their output to the serial port on MSM 8974 devices. + config DEBUG_MVEBU_UART bool "Kernel low-level debugging messages via MVEBU UART (old bootloaders)" depends on ARCH_MVEBU diff --git a/arch/arm/include/debug/msm.S b/arch/arm/include/debug/msm.S index 9166e1b..9d653d4 100644 --- a/arch/arm/include/debug/msm.S +++ b/arch/arm/include/debug/msm.S @@ -46,6 +46,11 @@ #define MSM_DEBUG_UART_PHYS 0x16440000 #endif +#ifdef CONFIG_DEBUG_MSM8974_UART +#define MSM_DEBUG_UART_BASE 0xFA71E000 +#define MSM_DEBUG_UART_PHYS 0xF991E000 +#endif + .macro addruart, rp, rv, tmp #ifdef MSM_DEBUG_UART_PHYS ldr \rp, =MSM_DEBUG_UART_PHYS -- cgit v0.10.2 From 39792c7cf3111d69dc4aa0923859d8b929e9039f Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Fri, 18 Oct 2013 22:06:03 +0100 Subject: ARM: 7861/1: cacheflush: consolidate single-CPU ARMv7 cache disabling code This code is becoming duplicated in many places. So let's consolidate it into a handy macro that is known to be right and available for reuse. Signed-off-by: Nicolas Pitre Acked-by: Dave Martin Signed-off-by: Russell King diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index 15f2d5b..ee753f1 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -435,4 +435,50 @@ static inline void __sync_cache_range_r(volatile void *p, size_t size) #define sync_cache_w(ptr) __sync_cache_range_w(ptr, sizeof *(ptr)) #define sync_cache_r(ptr) __sync_cache_range_r(ptr, sizeof *(ptr)) +/* + * Disabling cache access for one CPU in an ARMv7 SMP system is tricky. + * To do so we must: + * + * - Clear the SCTLR.C bit to prevent further cache allocations + * - Flush the desired level of cache + * - Clear the ACTLR "SMP" bit to disable local coherency + * + * ... and so without any intervening memory access in between those steps, + * not even to the stack. + * + * WARNING -- After this has been called: + * + * - No ldrex/strex (and similar) instructions must be used. + * - The CPU is obviously no longer coherent with the other CPUs. + * - This is unlikely to work as expected if Linux is running non-secure. + * + * Note: + * + * - This is known to apply to several ARMv7 processor implementations, + * however some exceptions may exist. Caveat emptor. + * + * - The clobber list is dictated by the call to v7_flush_dcache_*. + * fp is preserved to the stack explicitly prior disabling the cache + * since adding it to the clobber list is incompatible with having + * CONFIG_FRAME_POINTER=y. ip is saved as well if ever r12-clobbering + * trampoline are inserted by the linker and to keep sp 64-bit aligned. + */ +#define v7_exit_coherency_flush(level) \ + asm volatile( \ + "stmfd sp!, {fp, ip} \n\t" \ + "mrc p15, 0, r0, c1, c0, 0 @ get SCTLR \n\t" \ + "bic r0, r0, #"__stringify(CR_C)" \n\t" \ + "mcr p15, 0, r0, c1, c0, 0 @ set SCTLR \n\t" \ + "isb \n\t" \ + "bl v7_flush_dcache_"__stringify(level)" \n\t" \ + "clrex \n\t" \ + "mrc p15, 0, r0, c1, c0, 1 @ get ACTLR \n\t" \ + "bic r0, r0, #(1 << 6) @ disable local coherency \n\t" \ + "mcr p15, 0, r0, c1, c0, 1 @ set ACTLR \n\t" \ + "isb \n\t" \ + "dsb \n\t" \ + "ldmfd sp!, {fp, ip}" \ + : : : "r0","r1","r2","r3","r4","r5","r6","r7", \ + "r9","r10","lr","memory" ) + #endif diff --git a/arch/arm/mach-vexpress/dcscb.c b/arch/arm/mach-vexpress/dcscb.c index 3a6384c..14d4996 100644 --- a/arch/arm/mach-vexpress/dcscb.c +++ b/arch/arm/mach-vexpress/dcscb.c @@ -133,38 +133,8 @@ static void dcscb_power_down(void) if (last_man && __mcpm_outbound_enter_critical(cpu, cluster)) { arch_spin_unlock(&dcscb_lock); - /* - * Flush all cache levels for this cluster. - * - * To do so we do: - * - Clear the SCTLR.C bit to prevent further cache allocations - * - Flush the whole cache - * - Clear the ACTLR "SMP" bit to disable local coherency - * - * Let's do it in the safest possible way i.e. with - * no memory access within the following sequence - * including to the stack. - * - * Note: fp is preserved to the stack explicitly prior doing - * this since adding it to the clobber list is incompatible - * with having CONFIG_FRAME_POINTER=y. - */ - asm volatile( - "str fp, [sp, #-4]! \n\t" - "mrc p15, 0, r0, c1, c0, 0 @ get CR \n\t" - "bic r0, r0, #"__stringify(CR_C)" \n\t" - "mcr p15, 0, r0, c1, c0, 0 @ set CR \n\t" - "isb \n\t" - "bl v7_flush_dcache_all \n\t" - "clrex \n\t" - "mrc p15, 0, r0, c1, c0, 1 @ get AUXCR \n\t" - "bic r0, r0, #(1 << 6) @ disable local coherency \n\t" - "mcr p15, 0, r0, c1, c0, 1 @ set AUXCR \n\t" - "isb \n\t" - "dsb \n\t" - "ldr fp, [sp], #4" - : : : "r0","r1","r2","r3","r4","r5","r6","r7", - "r9","r10","lr","memory"); + /* Flush all cache levels for this cluster. */ + v7_exit_coherency_flush(all); /* * This is a harmless no-op. On platforms with a real @@ -183,26 +153,8 @@ static void dcscb_power_down(void) } else { arch_spin_unlock(&dcscb_lock); - /* - * Flush the local CPU cache. - * Let's do it in the safest possible way as above. - */ - asm volatile( - "str fp, [sp, #-4]! \n\t" - "mrc p15, 0, r0, c1, c0, 0 @ get CR \n\t" - "bic r0, r0, #"__stringify(CR_C)" \n\t" - "mcr p15, 0, r0, c1, c0, 0 @ set CR \n\t" - "isb \n\t" - "bl v7_flush_dcache_louis \n\t" - "clrex \n\t" - "mrc p15, 0, r0, c1, c0, 1 @ get AUXCR \n\t" - "bic r0, r0, #(1 << 6) @ disable local coherency \n\t" - "mcr p15, 0, r0, c1, c0, 1 @ set AUXCR \n\t" - "isb \n\t" - "dsb \n\t" - "ldr fp, [sp], #4" - : : : "r0","r1","r2","r3","r4","r5","r6","r7", - "r9","r10","lr","memory"); + /* Disable and flush the local CPU cache. */ + v7_exit_coherency_flush(louis); } __mcpm_cpu_down(cpu, cluster); diff --git a/arch/arm/mach-vexpress/tc2_pm.c b/arch/arm/mach-vexpress/tc2_pm.c index e6eb481..4eb92eb 100644 --- a/arch/arm/mach-vexpress/tc2_pm.c +++ b/arch/arm/mach-vexpress/tc2_pm.c @@ -156,32 +156,7 @@ static void tc2_pm_down(u64 residency) : : "r" (0x400) ); } - /* - * We need to disable and flush the whole (L1 and L2) cache. - * Let's do it in the safest possible way i.e. with - * no memory access within the following sequence - * including the stack. - * - * Note: fp is preserved to the stack explicitly prior doing - * this since adding it to the clobber list is incompatible - * with having CONFIG_FRAME_POINTER=y. - */ - asm volatile( - "str fp, [sp, #-4]! \n\t" - "mrc p15, 0, r0, c1, c0, 0 @ get CR \n\t" - "bic r0, r0, #"__stringify(CR_C)" \n\t" - "mcr p15, 0, r0, c1, c0, 0 @ set CR \n\t" - "isb \n\t" - "bl v7_flush_dcache_all \n\t" - "clrex \n\t" - "mrc p15, 0, r0, c1, c0, 1 @ get AUXCR \n\t" - "bic r0, r0, #(1 << 6) @ disable local coherency \n\t" - "mcr p15, 0, r0, c1, c0, 1 @ set AUXCR \n\t" - "isb \n\t" - "dsb \n\t" - "ldr fp, [sp], #4" - : : : "r0","r1","r2","r3","r4","r5","r6","r7", - "r9","r10","lr","memory"); + v7_exit_coherency_flush(all); cci_disable_port_by_cpu(mpidr); @@ -197,26 +172,7 @@ static void tc2_pm_down(u64 residency) arch_spin_unlock(&tc2_pm_lock); - /* - * We need to disable and flush only the L1 cache. - * Let's do it in the safest possible way as above. - */ - asm volatile( - "str fp, [sp, #-4]! \n\t" - "mrc p15, 0, r0, c1, c0, 0 @ get CR \n\t" - "bic r0, r0, #"__stringify(CR_C)" \n\t" - "mcr p15, 0, r0, c1, c0, 0 @ set CR \n\t" - "isb \n\t" - "bl v7_flush_dcache_louis \n\t" - "clrex \n\t" - "mrc p15, 0, r0, c1, c0, 1 @ get AUXCR \n\t" - "bic r0, r0, #(1 << 6) @ disable local coherency \n\t" - "mcr p15, 0, r0, c1, c0, 1 @ set AUXCR \n\t" - "isb \n\t" - "dsb \n\t" - "ldr fp, [sp], #4" - : : : "r0","r1","r2","r3","r4","r5","r6","r7", - "r9","r10","lr","memory"); + v7_exit_coherency_flush(louis); } __mcpm_cpu_down(cpu, cluster); -- cgit v0.10.2 From 1436c1aa626d0bc0e35c5c5231127086e80ab24a Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 21 Oct 2013 13:17:08 +0100 Subject: ARM: 7862/1: pcpu: replace __get_cpu_var_uses This is the ARM part of Christoph's patchset cleaning up the various uses of __get_cpu_var across the tree. The idea is to convert __get_cpu_var into either an explicit address calculation using this_cpu_ptr() or into a use of this_cpu operations that use the offset. Thereby address calculations are avoided and fewer registers are used when code is generated. [will: fixed debug ref counting checks and pcpu array accesses] Acked-by: Catalin Marinas Signed-off-by: Christoph Lameter Signed-off-by: Will Deacon Signed-off-by: Russell King diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index 7b95de6..3d44660 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -344,13 +344,13 @@ int arch_install_hw_breakpoint(struct perf_event *bp) /* Breakpoint */ ctrl_base = ARM_BASE_BCR; val_base = ARM_BASE_BVR; - slots = (struct perf_event **)__get_cpu_var(bp_on_reg); + slots = this_cpu_ptr(bp_on_reg); max_slots = core_num_brps; } else { /* Watchpoint */ ctrl_base = ARM_BASE_WCR; val_base = ARM_BASE_WVR; - slots = (struct perf_event **)__get_cpu_var(wp_on_reg); + slots = this_cpu_ptr(wp_on_reg); max_slots = core_num_wrps; } @@ -396,12 +396,12 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp) if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) { /* Breakpoint */ base = ARM_BASE_BCR; - slots = (struct perf_event **)__get_cpu_var(bp_on_reg); + slots = this_cpu_ptr(bp_on_reg); max_slots = core_num_brps; } else { /* Watchpoint */ base = ARM_BASE_WCR; - slots = (struct perf_event **)__get_cpu_var(wp_on_reg); + slots = this_cpu_ptr(wp_on_reg); max_slots = core_num_wrps; } @@ -697,7 +697,7 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr, struct arch_hw_breakpoint *info; struct arch_hw_breakpoint_ctrl ctrl; - slots = (struct perf_event **)__get_cpu_var(wp_on_reg); + slots = this_cpu_ptr(wp_on_reg); for (i = 0; i < core_num_wrps; ++i) { rcu_read_lock(); @@ -768,7 +768,7 @@ static void watchpoint_single_step_handler(unsigned long pc) struct perf_event *wp, **slots; struct arch_hw_breakpoint *info; - slots = (struct perf_event **)__get_cpu_var(wp_on_reg); + slots = this_cpu_ptr(wp_on_reg); for (i = 0; i < core_num_wrps; ++i) { rcu_read_lock(); @@ -802,7 +802,7 @@ static void breakpoint_handler(unsigned long unknown, struct pt_regs *regs) struct arch_hw_breakpoint *info; struct arch_hw_breakpoint_ctrl ctrl; - slots = (struct perf_event **)__get_cpu_var(bp_on_reg); + slots = this_cpu_ptr(bp_on_reg); /* The exception entry code places the amended lr in the PC. */ addr = regs->ARM_pc; diff --git a/arch/arm/kernel/kprobes.c b/arch/arm/kernel/kprobes.c index 170e9f3..a7b621e 100644 --- a/arch/arm/kernel/kprobes.c +++ b/arch/arm/kernel/kprobes.c @@ -171,13 +171,13 @@ static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) { - __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; + __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp); kcb->kprobe_status = kcb->prev_kprobe.status; } static void __kprobes set_current_kprobe(struct kprobe *p) { - __get_cpu_var(current_kprobe) = p; + __this_cpu_write(current_kprobe, p); } static void __kprobes @@ -421,10 +421,10 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) continue; if (ri->rp && ri->rp->handler) { - __get_cpu_var(current_kprobe) = &ri->rp->kp; + __this_cpu_write(current_kprobe, &ri->rp->kp); get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; ri->rp->handler(ri, regs); - __get_cpu_var(current_kprobe) = NULL; + __this_cpu_write(current_kprobe, NULL); } orig_ret_address = (unsigned long)ri->ret_addr; diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c index 8d6147b..d85055c 100644 --- a/arch/arm/kernel/perf_event_cpu.c +++ b/arch/arm/kernel/perf_event_cpu.c @@ -68,7 +68,7 @@ EXPORT_SYMBOL_GPL(perf_num_counters); static struct pmu_hw_events *cpu_pmu_get_cpu_events(void) { - return &__get_cpu_var(cpu_hw_events); + return this_cpu_ptr(&cpu_hw_events); } static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 9c697db..aea7ccb 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -65,7 +65,7 @@ static bool vgic_present; static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu) { BUG_ON(preemptible()); - __get_cpu_var(kvm_arm_running_vcpu) = vcpu; + __this_cpu_write(kvm_arm_running_vcpu, vcpu); } /** @@ -75,7 +75,7 @@ static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu) struct kvm_vcpu *kvm_arm_get_running_vcpu(void) { BUG_ON(preemptible()); - return __get_cpu_var(kvm_arm_running_vcpu); + return __this_cpu_read(kvm_arm_running_vcpu); } /** @@ -815,7 +815,7 @@ static void cpu_init_hyp_mode(void *dummy) boot_pgd_ptr = kvm_mmu_get_boot_httbr(); pgd_ptr = kvm_mmu_get_httbr(); - stack_page = __get_cpu_var(kvm_arm_hyp_stack_page); + stack_page = __this_cpu_read(kvm_arm_hyp_stack_page); hyp_stack_ptr = stack_page + PAGE_SIZE; vector_ptr = (unsigned long)__kvm_hyp_vector; diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index cbfacf7..6a0a9b1 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -27,7 +27,6 @@ #include #include -#include #include #include @@ -89,8 +88,8 @@ early_param("nodebugmon", early_debug_disable); * Keep track of debug users on each core. * The ref counts are per-cpu so we use a local_t type. */ -static DEFINE_PER_CPU(local_t, mde_ref_count); -static DEFINE_PER_CPU(local_t, kde_ref_count); +static DEFINE_PER_CPU(int, mde_ref_count); +static DEFINE_PER_CPU(int, kde_ref_count); void enable_debug_monitors(enum debug_el el) { @@ -98,11 +97,11 @@ void enable_debug_monitors(enum debug_el el) WARN_ON(preemptible()); - if (local_inc_return(&__get_cpu_var(mde_ref_count)) == 1) + if (this_cpu_inc_return(mde_ref_count) == 1) enable = DBG_MDSCR_MDE; if (el == DBG_ACTIVE_EL1 && - local_inc_return(&__get_cpu_var(kde_ref_count)) == 1) + this_cpu_inc_return(kde_ref_count) == 1) enable |= DBG_MDSCR_KDE; if (enable && debug_enabled) { @@ -118,11 +117,11 @@ void disable_debug_monitors(enum debug_el el) WARN_ON(preemptible()); - if (local_dec_and_test(&__get_cpu_var(mde_ref_count))) + if (this_cpu_dec_return(mde_ref_count) == 0) disable = ~DBG_MDSCR_MDE; if (el == DBG_ACTIVE_EL1 && - local_dec_and_test(&__get_cpu_var(kde_ref_count))) + this_cpu_dec_return(kde_ref_count) == 0) disable &= ~DBG_MDSCR_KDE; if (disable) { diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 329218c..ff516f6 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -184,14 +184,14 @@ int arch_install_hw_breakpoint(struct perf_event *bp) /* Breakpoint */ ctrl_reg = AARCH64_DBG_REG_BCR; val_reg = AARCH64_DBG_REG_BVR; - slots = __get_cpu_var(bp_on_reg); + slots = this_cpu_ptr(bp_on_reg); max_slots = core_num_brps; reg_enable = !debug_info->bps_disabled; } else { /* Watchpoint */ ctrl_reg = AARCH64_DBG_REG_WCR; val_reg = AARCH64_DBG_REG_WVR; - slots = __get_cpu_var(wp_on_reg); + slots = this_cpu_ptr(wp_on_reg); max_slots = core_num_wrps; reg_enable = !debug_info->wps_disabled; } @@ -230,12 +230,12 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp) if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) { /* Breakpoint */ base = AARCH64_DBG_REG_BCR; - slots = __get_cpu_var(bp_on_reg); + slots = this_cpu_ptr(bp_on_reg); max_slots = core_num_brps; } else { /* Watchpoint */ base = AARCH64_DBG_REG_WCR; - slots = __get_cpu_var(wp_on_reg); + slots = this_cpu_ptr(wp_on_reg); max_slots = core_num_wrps; } @@ -505,11 +505,11 @@ static void toggle_bp_registers(int reg, enum debug_el el, int enable) switch (reg) { case AARCH64_DBG_REG_BCR: - slots = __get_cpu_var(bp_on_reg); + slots = this_cpu_ptr(bp_on_reg); max_slots = core_num_brps; break; case AARCH64_DBG_REG_WCR: - slots = __get_cpu_var(wp_on_reg); + slots = this_cpu_ptr(wp_on_reg); max_slots = core_num_wrps; break; default: @@ -546,7 +546,7 @@ static int breakpoint_handler(unsigned long unused, unsigned int esr, struct debug_info *debug_info; struct arch_hw_breakpoint_ctrl ctrl; - slots = (struct perf_event **)__get_cpu_var(bp_on_reg); + slots = this_cpu_ptr(bp_on_reg); addr = instruction_pointer(regs); debug_info = ¤t->thread.debug; @@ -596,7 +596,7 @@ unlock: user_enable_single_step(current); } else { toggle_bp_registers(AARCH64_DBG_REG_BCR, DBG_ACTIVE_EL1, 0); - kernel_step = &__get_cpu_var(stepping_kernel_bp); + kernel_step = this_cpu_ptr(&stepping_kernel_bp); if (*kernel_step != ARM_KERNEL_STEP_NONE) return 0; @@ -623,7 +623,7 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, struct arch_hw_breakpoint *info; struct arch_hw_breakpoint_ctrl ctrl; - slots = (struct perf_event **)__get_cpu_var(wp_on_reg); + slots = this_cpu_ptr(wp_on_reg); debug_info = ¤t->thread.debug; for (i = 0; i < core_num_wrps; ++i) { @@ -698,7 +698,7 @@ unlock: user_enable_single_step(current); } else { toggle_bp_registers(AARCH64_DBG_REG_WCR, DBG_ACTIVE_EL1, 0); - kernel_step = &__get_cpu_var(stepping_kernel_bp); + kernel_step = this_cpu_ptr(&stepping_kernel_bp); if (*kernel_step != ARM_KERNEL_STEP_NONE) return 0; @@ -722,7 +722,7 @@ int reinstall_suspended_bps(struct pt_regs *regs) struct debug_info *debug_info = ¤t->thread.debug; int handled_exception = 0, *kernel_step; - kernel_step = &__get_cpu_var(stepping_kernel_bp); + kernel_step = this_cpu_ptr(&stepping_kernel_bp); /* * Called from single-step exception handler. diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index cea1594..6983ed5 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -1044,7 +1044,7 @@ static irqreturn_t armv8pmu_handle_irq(int irq_num, void *dev) */ regs = get_irq_regs(); - cpuc = &__get_cpu_var(cpu_hw_events); + cpuc = this_cpu_ptr(&cpu_hw_events); for (idx = 0; idx < cpu_pmu->num_events; ++idx) { struct perf_event *event = cpuc->events[idx]; struct hw_perf_event *hwc; @@ -1257,7 +1257,7 @@ device_initcall(register_pmu_driver); static struct pmu_hw_events *armpmu_get_cpu_events(void) { - return &__get_cpu_var(cpu_hw_events); + return this_cpu_ptr(&cpu_hw_events); } static void __init cpu_pmu_init(struct arm_pmu *armpmu) -- cgit v0.10.2 From 6a5014aa037495a14ea083b621ed97fd0c3c7e9e Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 22 Oct 2013 17:53:16 +0100 Subject: ARM: 7863/1: Let arm_add_memory() always use 64-bit arguments The DTB and/or the kernel command line may pass 64-bit addresses regardless of kernel configuration, so update arm_add_memory() to take 64-bit arguments independently of the phys_addr_t size. This allows non-wrapping handling of high memory banks such as the second memory bank of APE6EVM (at 0x2_0000_0000) in case of 32-bit phys_addr_t. Signed-off-by: Magnus Damm Signed-off-by: Russell King diff --git a/arch/arm/include/asm/setup.h b/arch/arm/include/asm/setup.h index c50f0560..8d6a089 100644 --- a/arch/arm/include/asm/setup.h +++ b/arch/arm/include/asm/setup.h @@ -49,7 +49,7 @@ extern struct meminfo meminfo; #define bank_phys_end(bank) ((bank)->start + (bank)->size) #define bank_phys_size(bank) (bank)->size -extern int arm_add_memory(phys_addr_t start, phys_addr_t size); +extern int arm_add_memory(u64 start, u64 size); extern void early_print(const char *str, ...); extern void dump_machine_table(void); diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index a4852de..5ec4443a 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -621,7 +621,7 @@ void __init dump_machine_table(void) /* can't use cpu_relax() here as it may require MMU setup */; } -int __init arm_add_memory(phys_addr_t start, phys_addr_t size) +int __init arm_add_memory(u64 start, u64 size) { struct membank *bank = &meminfo.bank[meminfo.nr_banks]; @@ -671,8 +671,8 @@ int __init arm_add_memory(phys_addr_t start, phys_addr_t size) static int __init early_mem(char *p) { static int usermem __initdata = 0; - phys_addr_t size; - phys_addr_t start; + u64 size; + u64 start; char *endp; /* -- cgit v0.10.2 From 6d7d5da7d75c6df676c8b72d32b02ff024438f0c Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 22 Oct 2013 17:59:54 +0100 Subject: ARM: 7864/1: Handle 64-bit memory in case of 32-bit phys_addr_t Use CONFIG_ARCH_PHYS_ADDR_T_64BIT to determine if ignoring or truncating of memory banks is neccessary. This may be needed in the case of 64-bit memory bank addresses but when phys_addr_t is kept 32-bit. Signed-off-by: Magnus Damm Signed-off-by: Russell King diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 5ec4443a..53c3901 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -624,6 +624,7 @@ void __init dump_machine_table(void) int __init arm_add_memory(u64 start, u64 size) { struct membank *bank = &meminfo.bank[meminfo.nr_banks]; + u64 aligned_start; if (meminfo.nr_banks >= NR_BANKS) { printk(KERN_CRIT "NR_BANKS too low, " @@ -636,10 +637,16 @@ int __init arm_add_memory(u64 start, u64 size) * Size is appropriately rounded down, start is rounded up. */ size -= start & ~PAGE_MASK; - bank->start = PAGE_ALIGN(start); + aligned_start = PAGE_ALIGN(start); -#ifndef CONFIG_ARM_LPAE - if (bank->start + size < bank->start) { +#ifndef CONFIG_ARCH_PHYS_ADDR_T_64BIT + if (aligned_start > ULONG_MAX) { + printk(KERN_CRIT "Ignoring memory at 0x%08llx outside " + "32-bit physical address space\n", (long long)start); + return -EINVAL; + } + + if (aligned_start + size > ULONG_MAX) { printk(KERN_CRIT "Truncating memory at 0x%08llx to fit in " "32-bit physical address space\n", (long long)start); /* @@ -647,10 +654,11 @@ int __init arm_add_memory(u64 start, u64 size) * 32 bits, we use ULONG_MAX as the upper limit rather than 4GB. * This means we lose a page after masking. */ - size = ULONG_MAX - bank->start; + size = ULONG_MAX - aligned_start; } #endif + bank->start = aligned_start; bank->size = size & ~(phys_addr_t)(PAGE_SIZE - 1); /* -- cgit v0.10.2 From 49649cad34c29dc26ed928191425e6677ca07918 Mon Sep 17 00:00:00 2001 From: Michael Opdenacker Date: Sun, 27 Oct 2013 09:20:47 +0100 Subject: ARM: 7869/1: remove unused XSCALE_PMU Kconfig param This removes the XSCALE_PMU Kconfig param, which is defined but no longer used in makefiles and source files. Signed-off-by: Michael Opdenacker Signed-off-by: Russell King diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index fc184bc..df0c609 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1092,11 +1092,6 @@ config IWMMXT Enable support for iWMMXt context switching at run time if running on a CPU that supports it. -config XSCALE_PMU - bool - depends on CPU_XSCALE - default y - config MULTI_IRQ_HANDLER bool help -- cgit v0.10.2 From 1e5660999aa7703654ec345caaf06b83415dbdff Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Tue, 1 Oct 2013 19:57:28 +0100 Subject: ARM: 7847/1: mcpm: Factor out logical-to-physical CPU translation This patch factors the logical-to-physical CPU translation out of mcpm_boot_secondary(), so that it can be reused elsewhere. Signed-off-by: Dave Martin Acked-by: Nicolas Pitre Signed-off-by: Russell King diff --git a/arch/arm/common/mcpm_platsmp.c b/arch/arm/common/mcpm_platsmp.c index 1bc34c7..c0c3cd7 100644 --- a/arch/arm/common/mcpm_platsmp.c +++ b/arch/arm/common/mcpm_platsmp.c @@ -19,14 +19,23 @@ #include #include +static void cpu_to_pcpu(unsigned int cpu, + unsigned int *pcpu, unsigned int *pcluster) +{ + unsigned int mpidr; + + mpidr = cpu_logical_map(cpu); + *pcpu = MPIDR_AFFINITY_LEVEL(mpidr, 0); + *pcluster = MPIDR_AFFINITY_LEVEL(mpidr, 1); +} + static int mcpm_boot_secondary(unsigned int cpu, struct task_struct *idle) { - unsigned int mpidr, pcpu, pcluster, ret; + unsigned int pcpu, pcluster, ret; extern void secondary_startup(void); - mpidr = cpu_logical_map(cpu); - pcpu = MPIDR_AFFINITY_LEVEL(mpidr, 0); - pcluster = MPIDR_AFFINITY_LEVEL(mpidr, 1); + cpu_to_pcpu(cpu, &pcpu, &pcluster); + pr_debug("%s: logical CPU %d is physical CPU %d cluster %d\n", __func__, cpu, pcpu, pcluster); -- cgit v0.10.2 From 0de0d64675259bf21d06b18985318ffb66a5218f Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Tue, 1 Oct 2013 19:58:17 +0100 Subject: ARM: 7848/1: mcpm: Implement cpu_kill() to synchronise on powerdown CPU hotplug and kexec rely on smp_ops.cpu_kill(), which is supposed to wait for the CPU to park or power down, and perform the last rites (such as disabling clocks etc., where the platform doesn't do this automatically). kexec in particular is unsafe without performing this synchronisation to park secondaries. Without it, the secondaries might not be parked when kexec trashes the kernel. There is no generic way to do this synchronisation, so a new mcpm platform_ops method power_down_finish() is added by this patch. The new method is mandatory. A platform which provides no way to detect when CPUs are parked is likely broken. Signed-off-by: Dave Martin Reviewed-by: Nicolas Pitre Signed-off-by: Russell King diff --git a/arch/arm/common/mcpm_entry.c b/arch/arm/common/mcpm_entry.c index 9902509..6c03d01 100644 --- a/arch/arm/common/mcpm_entry.c +++ b/arch/arm/common/mcpm_entry.c @@ -90,6 +90,21 @@ void mcpm_cpu_power_down(void) BUG(); } +int mcpm_cpu_power_down_finish(unsigned int cpu, unsigned int cluster) +{ + int ret; + + if (WARN_ON_ONCE(!platform_ops || !platform_ops->power_down_finish)) + return -EUNATCH; + + ret = platform_ops->power_down_finish(cpu, cluster); + if (ret) + pr_warn("%s: cpu %u, cluster %u failed to power down (%d)\n", + __func__, cpu, cluster, ret); + + return ret; +} + void mcpm_cpu_suspend(u64 expected_residency) { phys_reset_t phys_reset; diff --git a/arch/arm/common/mcpm_platsmp.c b/arch/arm/common/mcpm_platsmp.c index c0c3cd7..177251a 100644 --- a/arch/arm/common/mcpm_platsmp.c +++ b/arch/arm/common/mcpm_platsmp.c @@ -56,6 +56,15 @@ static void mcpm_secondary_init(unsigned int cpu) #ifdef CONFIG_HOTPLUG_CPU +static int mcpm_cpu_kill(unsigned int cpu) +{ + unsigned int pcpu, pcluster; + + cpu_to_pcpu(cpu, &pcpu, &pcluster); + + return !mcpm_cpu_power_down_finish(pcpu, pcluster); +} + static int mcpm_cpu_disable(unsigned int cpu) { /* @@ -82,6 +91,7 @@ static struct smp_operations __initdata mcpm_smp_ops = { .smp_boot_secondary = mcpm_boot_secondary, .smp_secondary_init = mcpm_secondary_init, #ifdef CONFIG_HOTPLUG_CPU + .cpu_kill = mcpm_cpu_kill, .cpu_disable = mcpm_cpu_disable, .cpu_die = mcpm_cpu_die, #endif diff --git a/arch/arm/include/asm/mcpm.h b/arch/arm/include/asm/mcpm.h index fc82a88..1cf2601 100644 --- a/arch/arm/include/asm/mcpm.h +++ b/arch/arm/include/asm/mcpm.h @@ -81,10 +81,40 @@ int mcpm_cpu_power_up(unsigned int cpu, unsigned int cluster); * * This will return if mcpm_platform_register() has not been called * previously in which case the caller should take appropriate action. + * + * On success, the CPU is not guaranteed to be truly halted until + * mcpm_cpu_power_down_finish() subsequently returns non-zero for the + * specified cpu. Until then, other CPUs should make sure they do not + * trash memory the target CPU might be executing/accessing. */ void mcpm_cpu_power_down(void); /** + * mcpm_cpu_power_down_finish - wait for a specified CPU to halt, and + * make sure it is powered off + * + * @cpu: CPU number within given cluster + * @cluster: cluster number for the CPU + * + * Call this function to ensure that a pending powerdown has taken + * effect and the CPU is safely parked before performing non-mcpm + * operations that may affect the CPU (such as kexec trashing the + * kernel text). + * + * It is *not* necessary to call this function if you only need to + * serialise a pending powerdown with mcpm_cpu_power_up() or a wakeup + * event. + * + * Do not call this function unless the specified CPU has already + * called mcpm_cpu_power_down() or has committed to doing so. + * + * @return: + * - zero if the CPU is in a safely parked state + * - nonzero otherwise (e.g., timeout) + */ +int mcpm_cpu_power_down_finish(unsigned int cpu, unsigned int cluster); + +/** * mcpm_cpu_suspend - bring the calling CPU in a suspended state * * @expected_residency: duration in microseconds the CPU is expected @@ -126,6 +156,7 @@ int mcpm_cpu_powered_up(void); struct mcpm_platform_ops { int (*power_up)(unsigned int cpu, unsigned int cluster); void (*power_down)(void); + int (*power_down_finish)(unsigned int cpu, unsigned int cluster); void (*suspend)(u64); void (*powered_up)(void); }; -- cgit v0.10.2 From 5e4432d3bd6b5b19e10bb263e7dbe8e74d7cf1c2 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 29 Oct 2013 23:06:44 +0000 Subject: ARM: fix misplaced arch_virt_to_idmap() Olof Johansson reported: In file included from arch/arm/include/asm/page.h:163:0, from include/linux/mm_types.h:16, from include/linux/sched.h:24, from arch/arm/kernel/asm-offsets.c:13: arch/arm/include/asm/memory.h: In function '__virt_to_idmap': arch/arm/include/asm/memory.h:300:6: error: 'arch_virt_to_idmap' undeclared (first use in this function) caused by arch_virt_to_idmap being placed inside a different preprocessor conditional to its user. Move it along side its user. Signed-off-by: Russell King diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index 6748d62..4dd2145 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -174,7 +174,6 @@ #define __PV_BITS_31_24 0x81000000 #define __PV_BITS_7_0 0x81 -extern phys_addr_t (*arch_virt_to_idmap) (unsigned long x); extern u64 __pv_phys_offset; extern u64 __pv_offset; extern void fixup_pv_table(const void *, unsigned long); @@ -290,6 +289,8 @@ static inline void *phys_to_virt(phys_addr_t x) #define __va(x) ((void *)__phys_to_virt((phys_addr_t)(x))) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) +extern phys_addr_t (*arch_virt_to_idmap)(unsigned long x); + /* * These are for systems that have a hardware interconnect supported alias of * physical memory for idmap purposes. Most cases should leave these -- cgit v0.10.2 From 384b38b66947b06999b3e39a596d4f2fb94f77e4 Mon Sep 17 00:00:00 2001 From: Yuanyuan Zhong Date: Wed, 30 Oct 2013 17:31:49 +0100 Subject: ARM: 7873/1: vfp: clear vfp_current_hw_state for dying cpu The CPU_DYING notifier is called by cpu stopper task which does not own the context held in the VFP hardware. Calling vfp_force_reload() has no effect. Replace it with clearing vfp_current_hw_state. Signed-off-by: Yuanyuan Zhong Signed-off-by: Russell King diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c index 52b8f40..2f37e1d 100644 --- a/arch/arm/vfp/vfpmodule.c +++ b/arch/arm/vfp/vfpmodule.c @@ -642,9 +642,9 @@ int vfp_restore_user_hwstate(struct user_vfp __user *ufp, static int vfp_hotplug(struct notifier_block *b, unsigned long action, void *hcpu) { - if (action == CPU_DYING || action == CPU_DYING_FROZEN) { - vfp_force_reload((long)hcpu, current_thread_info()); - } else if (action == CPU_STARTING || action == CPU_STARTING_FROZEN) + if (action == CPU_DYING || action == CPU_DYING_FROZEN) + vfp_current_hw_state[(long)hcpu] = NULL; + else if (action == CPU_STARTING || action == CPU_STARTING_FROZEN) vfp_enable(NULL); return NOTIFY_OK; } -- cgit v0.10.2 From 8d451442549a2696880288ba6325f9681451231b Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 31 Oct 2013 10:52:05 +0000 Subject: ARM: footbridge: fix build warnings for netwinder arch/arm/mach-footbridge/netwinder-hw.c:695:2: warning: passing argument 1 of 'spinlock_check' from incompatible pointer type arch/arm/mach-footbridge/netwinder-hw.c:702:2: warning: passing argument 1 of 'spin_unlock_irqrestore' from incompatible pointer type arch/arm/mach-footbridge/netwinder-hw.c:712:2: warning: passing argument 1 of 'spinlock_check' from incompatible pointer type arch/arm/mach-footbridge/netwinder-hw.c:714:2: warning: passing argument 1 of 'spin_unlock_irqrestore' from incompatible pointer type Signed-off-by: Russell King diff --git a/arch/arm/mach-footbridge/netwinder-hw.c b/arch/arm/mach-footbridge/netwinder-hw.c index 1fd2cf0..eb1fa5c 100644 --- a/arch/arm/mach-footbridge/netwinder-hw.c +++ b/arch/arm/mach-footbridge/netwinder-hw.c @@ -692,14 +692,14 @@ static void netwinder_led_set(struct led_classdev *cdev, unsigned long flags; u32 reg; - spin_lock_irqsave(&nw_gpio_lock, flags); + raw_spin_lock_irqsave(&nw_gpio_lock, flags); reg = nw_gpio_read(); if (b != LED_OFF) reg &= ~led->mask; else reg |= led->mask; nw_gpio_modify_op(led->mask, reg); - spin_unlock_irqrestore(&nw_gpio_lock, flags); + raw_spin_unlock_irqrestore(&nw_gpio_lock, flags); } static enum led_brightness netwinder_led_get(struct led_classdev *cdev) @@ -709,9 +709,9 @@ static enum led_brightness netwinder_led_get(struct led_classdev *cdev) unsigned long flags; u32 reg; - spin_lock_irqsave(&nw_gpio_lock, flags); + raw_spin_lock_irqsave(&nw_gpio_lock, flags); reg = nw_gpio_read(); - spin_unlock_irqrestore(&nw_gpio_lock, flags); + raw_spin_unlock_irqrestore(&nw_gpio_lock, flags); return (reg & led->mask) ? LED_OFF : LED_FULL; } -- cgit v0.10.2 From b0ced9d220f78f27b93e4d1024b3865ba172dbce Mon Sep 17 00:00:00 2001 From: Tushar Behera Date: Thu, 31 Oct 2013 06:46:14 +0100 Subject: ARM: 7874/2: bL_switcher: Remove cpu_hotplug_driver_{lock,unlock}() Commit 6dedcca610c6 ("hotplug, powerpc, x86: Remove cpu_hotplug_driver_lock())" removes the the definition of cpu_hotplug_driver_{lock,unlock} APIs, thereby causing a build error. Replace these calls with {lock,unlock}_device_hotplug(). Signed-off-by: Tushar Behera Signed-off-by: Nicolas Pitre Signed-off-by: Russell King diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index 63bbc4f..5774b6e 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -577,9 +577,9 @@ static int bL_switcher_enable(void) int cpu, ret; mutex_lock(&bL_switcher_activation_lock); - cpu_hotplug_driver_lock(); + lock_device_hotplug(); if (bL_switcher_active) { - cpu_hotplug_driver_unlock(); + unlock_device_hotplug(); mutex_unlock(&bL_switcher_activation_lock); return 0; } @@ -615,7 +615,7 @@ error: bL_activation_notify(BL_NOTIFY_POST_DISABLE); out: - cpu_hotplug_driver_unlock(); + unlock_device_hotplug(); mutex_unlock(&bL_switcher_activation_lock); return ret; } @@ -629,7 +629,7 @@ static void bL_switcher_disable(void) struct task_struct *task; mutex_lock(&bL_switcher_activation_lock); - cpu_hotplug_driver_lock(); + lock_device_hotplug(); if (!bL_switcher_active) goto out; @@ -685,7 +685,7 @@ static void bL_switcher_disable(void) bL_activation_notify(BL_NOTIFY_POST_DISABLE); out: - cpu_hotplug_driver_unlock(); + unlock_device_hotplug(); mutex_unlock(&bL_switcher_activation_lock); } -- cgit v0.10.2 From e16b31bf47738f4498d7ce632e12d7d2a6a2492a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 4 Nov 2013 11:42:29 +0100 Subject: ARM: 7876/1: clear Thumb-2 IT state on exception handling The exception handling code fails to clear the IT state, potentially leading to incorrect execution of the fixup if the size of the IT block is more than one. Let fixup_exception do the IT sanitizing if a fixup has been found, and restore CPSR from the stack when returning from a data abort. Cc: Will Deacon Cc: stable@vger.kernel.org Reviewed-by: Catalin Marinas Signed-off-by: Marc Zyngier Signed-off-by: Russell King diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 9cbe70c..ec3e5cf 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -192,6 +192,7 @@ __dabt_svc: svc_entry mov r2, sp dabt_helper + THUMB( ldr r5, [sp, #S_PSR] ) @ potentially updated CPSR svc_exit r5 @ return from exception UNWIND(.fnend ) ENDPROC(__dabt_svc) diff --git a/arch/arm/mm/extable.c b/arch/arm/mm/extable.c index 9d28562..312e15e 100644 --- a/arch/arm/mm/extable.c +++ b/arch/arm/mm/extable.c @@ -9,8 +9,13 @@ int fixup_exception(struct pt_regs *regs) const struct exception_table_entry *fixup; fixup = search_exception_tables(instruction_pointer(regs)); - if (fixup) + if (fixup) { regs->ARM_pc = fixup->fixup; +#ifdef CONFIG_THUMB2_KERNEL + /* Clear the IT state to avoid nasty surprises in the fixup */ + regs->ARM_cpsr &= ~PSR_IT_MASK; +#endif + } return fixup != NULL; } -- cgit v0.10.2 From 70d42126877b9faa272d446a6de5917614c28dd9 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 6 Nov 2013 09:12:40 +0100 Subject: ARM: 7878/1: nommu: Implement dummy early_paging_init() No-MMU configurations currenty fail to build because they are missing the early_paging_init() symbol. Acked-by: Olof Johansson Signed-off-by: Thierry Reding Signed-off-by: Russell King diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c index 34d4ab2..5c668b7 100644 --- a/arch/arm/mm/nommu.c +++ b/arch/arm/mm/nommu.c @@ -296,6 +296,15 @@ void __init sanity_check_meminfo(void) } /* + * early_paging_init() recreates boot time page table setup, allowing machines + * to switch over to a high (>4G) address space on LPAE systems + */ +void __init early_paging_init(const struct machine_desc *mdesc, + struct proc_info_list *procinfo) +{ +} + +/* * paging_init() sets up the page tables, initialises the zone memory * maps, and sets up the zero page, bad page and bad page tables. */ -- cgit v0.10.2 From 6ecf830e5029598732e04067e325d946097519cb Mon Sep 17 00:00:00 2001 From: "T.J. Purtell" Date: Wed, 6 Nov 2013 18:38:05 +0100 Subject: ARM: 7880/1: Clear the IT state independent of the Thumb-2 mode The ARM architecture reference specifies that the IT state bits in the PSR must be all zeros in ARM mode or behavior is unspecified. On the Qualcomm Snapdragon S4/Krait architecture CPUs the processor continues to consider the IT state bits while in ARM mode. This makes it so that some instructions are skipped by the CPU. Signed-off-by: T.J. Purtell [rmk+kernel@arm.linux.org.uk: fixed whitespace formatting in patch] Signed-off-by: Russell King diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index ab33042..1aa5ecd 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -375,12 +375,18 @@ setup_return(struct pt_regs *regs, struct ksignal *ksig, */ thumb = handler & 1; - if (thumb) { - cpsr |= PSR_T_BIT; #if __LINUX_ARM_ARCH__ >= 7 - /* clear the If-Then Thumb-2 execution state */ - cpsr &= ~PSR_IT_MASK; + /* + * Clear the If-Then Thumb-2 execution state + * ARM spec requires this to be all 000s in ARM mode + * Snapdragon S4/Krait misbehaves on a Thumb=>ARM + * signal transition without this. + */ + cpsr &= ~PSR_IT_MASK; #endif + + if (thumb) { + cpsr |= PSR_T_BIT; } else cpsr &= ~PSR_T_BIT; } -- cgit v0.10.2 From bf18525fd793101df42a1344ecc48b49b62e48c9 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Tue, 29 Oct 2013 20:32:56 +0100 Subject: ARM: 7872/1: Support arch_irq_work_raise() via self IPIs By default, IRQ work is run from the tick interrupt (see irq_work_run() in update_process_times()). When we're in full NOHZ mode, restarting the tick requires the use of IRQ work and if the only place we run IRQ work is in the tick interrupt we have an unbreakable cycle. Implement arch_irq_work_raise() via self IPIs to break this cycle and get the tick started again. Note that we implement this via IPIs which are only available on SMP builds. This shouldn't be a problem because full NOHZ is only supported on SMP builds anyway. Signed-off-by: Stephen Boyd Reviewed-by: Kevin Hilman Cc: Frederic Weisbecker Signed-off-by: Russell King diff --git a/arch/arm/include/asm/hardirq.h b/arch/arm/include/asm/hardirq.h index 2740c2a..3d7351c 100644 --- a/arch/arm/include/asm/hardirq.h +++ b/arch/arm/include/asm/hardirq.h @@ -5,7 +5,7 @@ #include #include -#define NR_IPI 6 +#define NR_IPI 7 typedef struct { unsigned int __softirq_pending; diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 72024ea..bf9a0d6 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -66,6 +67,7 @@ enum ipi_msg_type { IPI_CALL_FUNC, IPI_CALL_FUNC_SINGLE, IPI_CPU_STOP, + IPI_IRQ_WORK, }; static DECLARE_COMPLETION(cpu_running); @@ -448,6 +450,13 @@ void arch_send_call_function_single_ipi(int cpu) smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC_SINGLE); } +#ifdef CONFIG_IRQ_WORK +void arch_irq_work_raise(void) +{ + smp_cross_call(cpumask_of(smp_processor_id()), IPI_IRQ_WORK); +} +#endif + static const char *ipi_types[NR_IPI] = { #define S(x,s) [x] = s S(IPI_WAKEUP, "CPU wakeup interrupts"), @@ -456,6 +465,7 @@ static const char *ipi_types[NR_IPI] = { S(IPI_CALL_FUNC, "Function call interrupts"), S(IPI_CALL_FUNC_SINGLE, "Single function call interrupts"), S(IPI_CPU_STOP, "CPU stop interrupts"), + S(IPI_IRQ_WORK, "IRQ work interrupts"), }; void show_ipi_list(struct seq_file *p, int prec) @@ -565,6 +575,14 @@ void handle_IPI(int ipinr, struct pt_regs *regs) irq_exit(); break; +#ifdef CONFIG_IRQ_WORK + case IPI_IRQ_WORK: + irq_enter(); + irq_work_run(); + irq_exit(); + break; +#endif + default: printk(KERN_CRIT "CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr); -- cgit v0.10.2 From c682e51dbc9837f4aa61180775e9ca4f2a463adf Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Sat, 9 Nov 2013 00:38:24 +0100 Subject: ARM: 7887/1: Don't smp_cross_call() on UP devices in arch_irq_work_raise() If we're running a kernel compiled with SMP_ON_UP=y and the hardware only supports UP operation there isn't any smp_cross_call function assigned. Unfortunately, we call smp_cross_call() unconditionally in arch_irq_work_raise() and crash the kernel on UP devices. Check to make sure we're running on an SMP device before calling smp_cross_call() here. Unable to handle kernel NULL pointer dereference at virtual address 00000000 pgd = c0004000 [00000000] *pgd=00000000 Internal error: Oops: 80000005 [#1] SMP ARM Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.12.0-rc6-00018-g8d45144-dirty #16 task: de05b440 ti: de05c000 task.ti: de05c000 PC is at 0x0 LR is at arch_irq_work_raise+0x3c/0x48 pc : [<00000000>] lr : [] psr: 60000193 sp : de05dd60 ip : 00000001 fp : 00000000 r10: c085e2f0 r9 : de05c000 r8 : c07be0a4 r7 : de05c000 r6 : de05c000 r5 : c07c5778 r4 : c0824554 r3 : 00000000 r2 : 00000000 r1 : 00000006 r0 : c0529a58 Flags: nZCv IRQs off FIQs on Mode SVC_32 ISA ARM Segment kernel Control: 10c5387d Table: 80004019 DAC: 00000017 Process swapper/0 (pid: 1, stack limit = 0xde05c248) Stack: (0xde05dd60 to 0xde05e000) dd60: c07b9dbc c00cb2dc 00000001 c08242c0 c08242c0 60000113 c07be0a8 c00b0590 dd80: de05c000 c085e2f0 c08242c0 c08242c0 c1414c28 c00b07cc de05b440 c1414c28 dda0: c08242c0 c00b0af8 c0862bb0 c0862db0 c1414cd8 de05c028 c0824840 de05ddb8 ddc0: 00000000 00000009 00000001 00000024 c07be0a8 c07be0a4 de05c000 c085e2f0 dde0: 00000000 c004a4b0 00000010 de00d2dc 00000054 00000100 00000024 00000000 de00: de05c028 0000000a ffff8ae7 00200040 00000016 de05c000 60000193 de05c000 de20: 00000054 00000000 00000000 00000000 00000000 c004a704 00000000 de05c008 de40: c07ba254 c004aa1c c07c5778 c0014b70 fa200000 00000054 de05de80 c0861244 de60: 00000000 c0008634 de05b440 c051c778 20000113 ffffffff de05deb4 c051d0a4 de80: 00000001 00000001 00000000 de05b440 c082afac de057ac0 de057ac0 de0443c0 dea0: 00000000 00000000 00000000 00000000 c082afbc de05dec8 c009f2a0 c051c778 dec0: 20000113 ffffffff 00000000 c016edb0 00000000 000002b0 de057ac0 de057ac0 dee0: 00000000 c016ee40 c0875e50 de05df2e de057ac0 00000000 00000013 00000000 df00: 00000000 c016f054 de043600 de0443c0 c008eb38 de004ec0 c0875e50 c008eb44 df20: 00000012 00000000 00000000 3931f0f8 00000000 00000000 00000014 c0822e84 df40: 00000000 c008ed2c 00000000 00000000 00000000 c07b7490 c07b7490 c075ab3c df60: 00000000 c00701ac 00000002 00000000 c0070160 dffadb73 7bf8edb4 00000000 df80: c051092c 00000000 00000000 00000000 00000000 00000000 00000000 c0510934 dfa0: de05aa40 00000000 c051092c c0013ce8 00000000 00000000 00000000 00000000 dfc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 dfe0: 00000000 00000000 00000000 00000000 00000013 00000000 07efffe5 4dfac6f5 [] (arch_irq_work_raise+0x3c/0x48) from [] (irq_work_queue+0xe4/0xf8) [] (irq_work_queue+0xe4/0xf8) from [] (rcu_accelerate_cbs+0x1d4/0x1d8) [] (rcu_accelerate_cbs+0x1d4/0x1d8) from [] (rcu_start_gp+0x34/0x48) [] (rcu_start_gp+0x34/0x48) from [] (rcu_process_callbacks+0x318/0x608) [] (rcu_process_callbacks+0x318/0x608) from [] (__do_softirq+0x114/0x2a0) [] (__do_softirq+0x114/0x2a0) from [] (do_softirq+0x6c/0x74) [] (do_softirq+0x6c/0x74) from [] (irq_exit+0xac/0x100) [] (irq_exit+0xac/0x100) from [] (handle_IRQ+0x54/0xb4) [] (handle_IRQ+0x54/0xb4) from [] (omap3_intc_handle_irq+0x60/0x74) [] (omap3_intc_handle_irq+0x60/0x74) from [] (__irq_svc+0x44/0x5c) Exception stack(0xde05de80 to 0xde05dec8) de80: 00000001 00000001 00000000 de05b440 c082afac de057ac0 de057ac0 de0443c0 dea0: 00000000 00000000 00000000 00000000 c082afbc de05dec8 c009f2a0 c051c778 dec0: 20000113 ffffffff [] (__irq_svc+0x44/0x5c) from [] (_raw_spin_unlock_irq+0x28/0x2c) [] (_raw_spin_unlock_irq+0x28/0x2c) from [] (proc_alloc_inum+0x30/0xa8) [] (proc_alloc_inum+0x30/0xa8) from [] (proc_register+0x18/0x130) [] (proc_register+0x18/0x130) from [] (proc_mkdir_data+0x44/0x6c) [] (proc_mkdir_data+0x44/0x6c) from [] (register_irq_proc+0x6c/0x128) [] (register_irq_proc+0x6c/0x128) from [] (init_irq_proc+0x74/0xb0) [] (init_irq_proc+0x74/0xb0) from [] (kernel_init_freeable+0x84/0x1c8) [] (kernel_init_freeable+0x84/0x1c8) from [] (kernel_init+0x8/0x150) [] (kernel_init+0x8/0x150) from [] (ret_from_fork+0x14/0x2c) Code: bad PC value Fixes: bf18525fd79 "ARM: 7872/1: Support arch_irq_work_raise() via self IPIs" Reported-by: Olof Johansson Signed-off-by: Stephen Boyd Tested-by: Olof Johansson Signed-off-by: Russell King diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index bf9a0d6..e115cbb 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -453,7 +453,8 @@ void arch_send_call_function_single_ipi(int cpu) #ifdef CONFIG_IRQ_WORK void arch_irq_work_raise(void) { - smp_cross_call(cpumask_of(smp_processor_id()), IPI_IRQ_WORK); + if (is_smp()) + smp_cross_call(cpumask_of(smp_processor_id()), IPI_IRQ_WORK); } #endif -- cgit v0.10.2 From 4d8981f6b784ae445a28de909d77e27836c3ed44 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Tue, 29 Oct 2013 16:02:00 +0100 Subject: ARM: 7871/1: amba: Extend number of IRQS Xilinx Zynq pl330 dma driver has 9 irqs which all have to be used by the driver to get it work properly. Signed-off-by: Michal Simek Signed-off-by: Russell King diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h index 43ec7e2..b327a1b 100644 --- a/include/linux/amba/bus.h +++ b/include/linux/amba/bus.h @@ -21,7 +21,7 @@ #include #include -#define AMBA_NR_IRQS 2 +#define AMBA_NR_IRQS 9 #define AMBA_CID 0xb105f00d struct clk; -- cgit v0.10.2 From 237f12337cfa2175474e4dd015bc07a25eb9080d Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Sat, 26 Oct 2013 15:07:04 +0100 Subject: ARM: 7866/1: include: asm: use 'long long' instead of 'u64' within atomic.h atomic* value is signed value, and atomic* functions need also process signed value (parameter value, and return value), so 32-bit arm need use 'long long' instead of 'u64'. After replacement, it will also fix a bug for atomic64_add_negative(): "u64 is never less than 0". The modifications are: in vim, use "1,% s/\/long long/g" command. remove '__aligned(8)' which is useless for 64-bit. be sure of 80 column limitation after replacement. Acked-by: Will Deacon Signed-off-by: Chen Gang Signed-off-by: Will Deacon Signed-off-by: Russell King diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h index da1c77d..a715ac0 100644 --- a/arch/arm/include/asm/atomic.h +++ b/arch/arm/include/asm/atomic.h @@ -238,15 +238,15 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u) #ifndef CONFIG_GENERIC_ATOMIC64 typedef struct { - u64 __aligned(8) counter; + long long counter; } atomic64_t; #define ATOMIC64_INIT(i) { (i) } #ifdef CONFIG_ARM_LPAE -static inline u64 atomic64_read(const atomic64_t *v) +static inline long long atomic64_read(const atomic64_t *v) { - u64 result; + long long result; __asm__ __volatile__("@ atomic64_read\n" " ldrd %0, %H0, [%1]" @@ -257,7 +257,7 @@ static inline u64 atomic64_read(const atomic64_t *v) return result; } -static inline void atomic64_set(atomic64_t *v, u64 i) +static inline void atomic64_set(atomic64_t *v, long long i) { __asm__ __volatile__("@ atomic64_set\n" " strd %2, %H2, [%1]" @@ -266,9 +266,9 @@ static inline void atomic64_set(atomic64_t *v, u64 i) ); } #else -static inline u64 atomic64_read(const atomic64_t *v) +static inline long long atomic64_read(const atomic64_t *v) { - u64 result; + long long result; __asm__ __volatile__("@ atomic64_read\n" " ldrexd %0, %H0, [%1]" @@ -279,9 +279,9 @@ static inline u64 atomic64_read(const atomic64_t *v) return result; } -static inline void atomic64_set(atomic64_t *v, u64 i) +static inline void atomic64_set(atomic64_t *v, long long i) { - u64 tmp; + long long tmp; __asm__ __volatile__("@ atomic64_set\n" "1: ldrexd %0, %H0, [%2]\n" @@ -294,9 +294,9 @@ static inline void atomic64_set(atomic64_t *v, u64 i) } #endif -static inline void atomic64_add(u64 i, atomic64_t *v) +static inline void atomic64_add(long long i, atomic64_t *v) { - u64 result; + long long result; unsigned long tmp; __asm__ __volatile__("@ atomic64_add\n" @@ -311,9 +311,9 @@ static inline void atomic64_add(u64 i, atomic64_t *v) : "cc"); } -static inline u64 atomic64_add_return(u64 i, atomic64_t *v) +static inline long long atomic64_add_return(long long i, atomic64_t *v) { - u64 result; + long long result; unsigned long tmp; smp_mb(); @@ -334,9 +334,9 @@ static inline u64 atomic64_add_return(u64 i, atomic64_t *v) return result; } -static inline void atomic64_sub(u64 i, atomic64_t *v) +static inline void atomic64_sub(long long i, atomic64_t *v) { - u64 result; + long long result; unsigned long tmp; __asm__ __volatile__("@ atomic64_sub\n" @@ -351,9 +351,9 @@ static inline void atomic64_sub(u64 i, atomic64_t *v) : "cc"); } -static inline u64 atomic64_sub_return(u64 i, atomic64_t *v) +static inline long long atomic64_sub_return(long long i, atomic64_t *v) { - u64 result; + long long result; unsigned long tmp; smp_mb(); @@ -374,9 +374,10 @@ static inline u64 atomic64_sub_return(u64 i, atomic64_t *v) return result; } -static inline u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old, u64 new) +static inline long long atomic64_cmpxchg(atomic64_t *ptr, long long old, + long long new) { - u64 oldval; + long long oldval; unsigned long res; smp_mb(); @@ -398,9 +399,9 @@ static inline u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old, u64 new) return oldval; } -static inline u64 atomic64_xchg(atomic64_t *ptr, u64 new) +static inline long long atomic64_xchg(atomic64_t *ptr, long long new) { - u64 result; + long long result; unsigned long tmp; smp_mb(); @@ -419,9 +420,9 @@ static inline u64 atomic64_xchg(atomic64_t *ptr, u64 new) return result; } -static inline u64 atomic64_dec_if_positive(atomic64_t *v) +static inline long long atomic64_dec_if_positive(atomic64_t *v) { - u64 result; + long long result; unsigned long tmp; smp_mb(); @@ -445,9 +446,9 @@ static inline u64 atomic64_dec_if_positive(atomic64_t *v) return result; } -static inline int atomic64_add_unless(atomic64_t *v, u64 a, u64 u) +static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u) { - u64 val; + long long val; unsigned long tmp; int ret = 1; -- cgit v0.10.2 From 4dcc1cf7316a26e112f5c9fcca531ff98ef44700 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Sat, 26 Oct 2013 15:07:25 +0100 Subject: ARM: 7867/1: include: asm: use 'int' instead of 'unsigned long' for 'oldval' in atomic_cmpxchg(). For atomic_cmpxchg(), the type of 'oldval' need be 'int' to match the type of "*ptr" (used by 'ldrex' instruction) and 'old' (used by 'teq' instruction). Reviewed-by: Will Deacon Signed-off-by: Chen Gang Signed-off-by: Will Deacon Signed-off-by: Russell King diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h index a715ac0..9ee7e01 100644 --- a/arch/arm/include/asm/atomic.h +++ b/arch/arm/include/asm/atomic.h @@ -114,7 +114,8 @@ static inline int atomic_sub_return(int i, atomic_t *v) static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) { - unsigned long oldval, res; + int oldval; + unsigned long res; smp_mb(); -- cgit v0.10.2 From aed3a4ed7222ae37860ae33cbad3ea7a6b2eaf8e Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Sat, 26 Oct 2013 15:07:41 +0100 Subject: ARM: 7868/1: arm/arm64: remove atomic_clear_mask() in "include/asm/atomic.h" In current kernel wide source code, except other architectures, only s390 scsi drivers use atomic_clear_mask(), and arm/arm64 need not support s390 drivers. So remove atomic_clear_mask() from "arm[64]/include/asm/atomic.h". Signed-off-by: Chen Gang Signed-off-by: Will Deacon Signed-off-by: Russell King diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h index 9ee7e01..f8a4336 100644 --- a/arch/arm/include/asm/atomic.h +++ b/arch/arm/include/asm/atomic.h @@ -135,21 +135,6 @@ static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) return oldval; } -static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr) -{ - unsigned long tmp, tmp2; - - __asm__ __volatile__("@ atomic_clear_mask\n" -"1: ldrex %0, [%3]\n" -" bic %0, %0, %4\n" -" strex %1, %0, [%3]\n" -" teq %1, #0\n" -" bne 1b" - : "=&r" (tmp), "=&r" (tmp2), "+Qo" (*addr) - : "r" (addr), "Ir" (mask) - : "cc"); -} - #else /* ARM_ARCH_6 */ #ifdef CONFIG_SMP @@ -198,15 +183,6 @@ static inline int atomic_cmpxchg(atomic_t *v, int old, int new) return ret; } -static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr) -{ - unsigned long flags; - - raw_local_irq_save(flags); - *addr &= ~mask; - raw_local_irq_restore(flags); -} - #endif /* __LINUX_ARM_ARCH__ */ #define atomic_xchg(v, new) (xchg(&((v)->counter), new)) diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h index 8363644..01de5aa 100644 --- a/arch/arm64/include/asm/atomic.h +++ b/arch/arm64/include/asm/atomic.h @@ -126,20 +126,6 @@ static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) return oldval; } -static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr) -{ - unsigned long tmp, tmp2; - - asm volatile("// atomic_clear_mask\n" -"1: ldxr %0, %2\n" -" bic %0, %0, %3\n" -" stxr %w1, %0, %2\n" -" cbnz %w1, 1b" - : "=&r" (tmp), "=&r" (tmp2), "+Q" (*addr) - : "Ir" (mask) - : "cc"); -} - #define atomic_xchg(v, new) (xchg(&((v)->counter), new)) static inline int __atomic_add_unless(atomic_t *v, int a, int u) -- cgit v0.10.2