From 8cb87c0407cb55277d8b9aa50f0e29201b90a88d Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 3 Nov 2015 17:01:46 -0500 Subject: nouveau/nvkm/subdev/clk/gk20a.c: fix wrong do_div() usage do_div() must only be used with a u64 dividend. Signed-off-by: Nicolas Pitre diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/clk/gk20a.c b/drivers/gpu/drm/nouveau/nvkm/subdev/clk/gk20a.c index 254094a..5da2aa8 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/clk/gk20a.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/clk/gk20a.c @@ -141,9 +141,8 @@ gk20a_pllg_calc_rate(struct gk20a_clk *clk) rate = clk->parent_rate * clk->n; divider = clk->m * pl_to_div[clk->pl]; - do_div(rate, divider); - return rate / 2; + return rate / divider / 2; } static int -- cgit v0.10.2 From 6a0078c8520697ab3e32180ede2df35e7630d94b Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 3 Nov 2015 19:46:23 -0500 Subject: imx/clk-pllv1: fix wrong do_div() usage do_div() is meant to be used with an unsigned dividend. Signed-off-by: Nicolas Pitre diff --git a/drivers/clk/imx/clk-pllv1.c b/drivers/clk/imx/clk-pllv1.c index 8564e43..82fe366 100644 --- a/drivers/clk/imx/clk-pllv1.c +++ b/drivers/clk/imx/clk-pllv1.c @@ -52,7 +52,7 @@ static unsigned long clk_pllv1_recalc_rate(struct clk_hw *hw, unsigned long parent_rate) { struct clk_pllv1 *pll = to_clk_pllv1(hw); - long long ll; + unsigned long long ull; int mfn_abs; unsigned int mfi, mfn, mfd, pd; u32 reg; @@ -94,16 +94,16 @@ static unsigned long clk_pllv1_recalc_rate(struct clk_hw *hw, rate = parent_rate * 2; rate /= pd + 1; - ll = (unsigned long long)rate * mfn_abs; + ull = (unsigned long long)rate * mfn_abs; - do_div(ll, mfd + 1); + do_div(ull, mfd + 1); if (mfn_is_negative(pll, mfn)) - ll = -ll; + ull = (rate * mfi) - ull; + else + ull = (rate * mfi) + ull; - ll = (rate * mfi) + ll; - - return ll; + return ull; } static struct clk_ops clk_pllv1_ops = { -- cgit v0.10.2 From 4471f9a4db2d8dfdc79618502fafb1172ab40f05 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 3 Nov 2015 20:01:40 -0500 Subject: imx/clk-pllv2: fix wrong do_div() usage do_div() is meant to be used with an unsigned dividend. Signed-off-by: Nicolas Pitre diff --git a/drivers/clk/imx/clk-pllv2.c b/drivers/clk/imx/clk-pllv2.c index b18f875..4aeda56 100644 --- a/drivers/clk/imx/clk-pllv2.c +++ b/drivers/clk/imx/clk-pllv2.c @@ -79,7 +79,7 @@ static unsigned long __clk_pllv2_recalc_rate(unsigned long parent_rate, { long mfi, mfn, mfd, pdf, ref_clk; unsigned long dbl; - s64 temp; + u64 temp; dbl = dp_ctl & MXC_PLL_DP_CTL_DPDCK0_2_EN; @@ -98,8 +98,9 @@ static unsigned long __clk_pllv2_recalc_rate(unsigned long parent_rate, temp = (u64) ref_clk * abs(mfn); do_div(temp, mfd + 1); if (mfn < 0) - temp = -temp; - temp = (ref_clk * mfi) + temp; + temp = (ref_clk * mfi) - temp; + else + temp = (ref_clk * mfi) + temp; return temp; } @@ -126,7 +127,7 @@ static int __clk_pllv2_set_rate(unsigned long rate, unsigned long parent_rate, { u32 reg; long mfi, pdf, mfn, mfd = 999999; - s64 temp64; + u64 temp64; unsigned long quad_parent_rate; quad_parent_rate = 4 * parent_rate; -- cgit v0.10.2 From 3ed9c824375b07a2acd6919d0fe7aa71650c7f69 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 3 Nov 2015 21:49:15 -0500 Subject: tegra/clk-divider: fix wrong do_div() usage do_div() is meant to be used with an unsigned dividend. 
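For illustration, here is a minimal user-space sketch (not kernel code) of the hazard these clk fixes address: do_div() treats its dividend as a u64, so a negative signed value is reinterpreted as a huge unsigned number and the quotient is garbage. The pllv1/pllv2 patches above avoid that by dividing the unsigned magnitude with do_div() and then adding or subtracting the quotient. The mock_do_div() helper below is a hypothetical stand-in with the same update-in-place, return-the-remainder semantics as the generic 32-bit do_div(); it exists only for this demo.

  #include <stdint.h>
  #include <stdio.h>

  /*
   * Stand-in for the 32-bit do_div() contract: the u64 dividend is updated
   * in place and the 32-bit remainder is returned.  A signed dividend is
   * silently reinterpreted as unsigned, which is the misuse fixed above.
   */
  #define mock_do_div(n, base) ({                          \
          uint32_t __rem = (uint64_t)(n) % (base);         \
          (n) = (uint64_t)(n) / (base);                    \
          __rem;                                           \
  })

  int main(void)
  {
      /* Wrong: signed dividend, -1000 is seen as 0xfffffffffffffc18 */
      int64_t broken = -1000;
      mock_do_div(broken, 3);
      printf("signed dividend : %lld (a correct signed division gives -333)\n",
             (long long)broken);

      /*
       * The pllv1/pllv2 pattern: divide the unsigned magnitude, then
       * subtract the quotient from the rest of the expression.
       */
      uint64_t quot = 1000;                   /* the |mfn| scaled term */
      mock_do_div(quot, 3);
      int64_t fixed = 0 - (int64_t)quot;      /* "rate * mfi" would go here */
      printf("unsigned pattern: %lld\n", (long long)fixed);
      return 0;
  }
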
Signed-off-by: Nicolas Pitre diff --git a/drivers/clk/tegra/clk-divider.c b/drivers/clk/tegra/clk-divider.c index 48c83ef..16e0aee 100644 --- a/drivers/clk/tegra/clk-divider.c +++ b/drivers/clk/tegra/clk-divider.c @@ -32,7 +32,7 @@ static int get_div(struct tegra_clk_frac_div *divider, unsigned long rate, unsigned long parent_rate) { - s64 divider_ux1 = parent_rate; + u64 divider_ux1 = parent_rate; u8 flags = divider->flags; int mul; @@ -54,7 +54,7 @@ static int get_div(struct tegra_clk_frac_div *divider, unsigned long rate, divider_ux1 -= mul; - if (divider_ux1 < 0) + if ((s64)divider_ux1 < 0) return 0; if (divider_ux1 > get_max_div(divider)) -- cgit v0.10.2 From e55791883f0a18734ac3d32eddf297b1610e248d Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 3 Nov 2015 23:09:58 -0500 Subject: ti/clkt_dpll: fix wrong do_div() usage do_div() is meant to be used with an unsigned dividend. Signed-off-by: Nicolas Pitre diff --git a/drivers/clk/ti/clkt_dpll.c b/drivers/clk/ti/clkt_dpll.c index 9023ca9..b5cc6f6 100644 --- a/drivers/clk/ti/clkt_dpll.c +++ b/drivers/clk/ti/clkt_dpll.c @@ -240,7 +240,7 @@ u8 omap2_init_dpll_parent(struct clk_hw *hw) */ unsigned long omap2_get_dpll_rate(struct clk_hw_omap *clk) { - long long dpll_clk; + u64 dpll_clk; u32 dpll_mult, dpll_div, v; struct dpll_data *dd; @@ -262,7 +262,7 @@ unsigned long omap2_get_dpll_rate(struct clk_hw_omap *clk) dpll_div = v & dd->div1_mask; dpll_div >>= __ffs(dd->div1_mask); - dpll_clk = (long long)clk_get_rate(dd->clk_ref) * dpll_mult; + dpll_clk = (u64)clk_get_rate(dd->clk_ref) * dpll_mult; do_div(dpll_clk, dpll_div + 1); return dpll_clk; -- cgit v0.10.2 From 3db14288bcc1ef8b53c1ceef35d998530b630657 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 3 Nov 2015 23:17:11 -0500 Subject: ti/fapll: fix wrong do_div() usage do_div() is meant to be used with an unsigned dividend. Signed-off-by: Nicolas Pitre diff --git a/drivers/clk/ti/fapll.c b/drivers/clk/ti/fapll.c index f4b2e98..66a0d0e 100644 --- a/drivers/clk/ti/fapll.c +++ b/drivers/clk/ti/fapll.c @@ -168,7 +168,7 @@ static unsigned long ti_fapll_recalc_rate(struct clk_hw *hw, { struct fapll_data *fd = to_fapll(hw); u32 fapll_n, fapll_p, v; - long long rate; + u64 rate; if (ti_fapll_clock_is_bypass(fd)) return parent_rate; @@ -314,7 +314,7 @@ static unsigned long ti_fapll_synth_recalc_rate(struct clk_hw *hw, { struct fapll_synth *synth = to_synth(hw); u32 synth_div_m; - long long rate; + u64 rate; /* The audio_pll_clk1 is hardwired to produce 32.768KiHz clock */ if (!synth->div) -- cgit v0.10.2 From 8d43b49e7e0070f96ac46d30659a336c0224fa0b Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 3 Nov 2015 17:01:46 -0500 Subject: hid-sensor-hub.c: fix wrong do_div() usage do_div() must only be used with a u64 dividend. 
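A related illustration: when the dividend is not a u64 to begin with, do_div() is the wrong tool and plain '/' and '%' are both correct and cheap, which is all the hid-sensor-hub change below does. A user-space sketch of that pattern (split_buffer() is a hypothetical name, not the driver function):

  #include <stdint.h>
  #include <stdio.h>

  /*
   * Sketch of the hid-sensor-hub fix: the byte count is an ordinary
   * integer, not a u64, so feeding it to do_div() breaks the contract.
   * Native division and modulus on the 32-bit value do the job.
   */
  static void split_buffer(size_t buffer_size)
  {
      size_t fields = buffer_size / sizeof(int32_t);
      size_t remaining_bytes = buffer_size % sizeof(int32_t);

      printf("%zu bytes -> %zu 32-bit fields, %zu bytes left over\n",
             buffer_size, fields, remaining_bytes);
  }

  int main(void)
  {
      split_buffer(10);     /* 2 fields, 2 bytes left over */
      split_buffer(64);     /* 16 fields, 0 bytes left over */
      return 0;
  }
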
Signed-off-by: Nicolas Pitre diff --git a/drivers/hid/hid-sensor-hub.c b/drivers/hid/hid-sensor-hub.c index 92870cd..8efaa88 100644 --- a/drivers/hid/hid-sensor-hub.c +++ b/drivers/hid/hid-sensor-hub.c @@ -218,7 +218,8 @@ int sensor_hub_set_feature(struct hid_sensor_hub_device *hsdev, u32 report_id, goto done_proc; } - remaining_bytes = do_div(buffer_size, sizeof(__s32)); + remaining_bytes = buffer_size % sizeof(__s32); + buffer_size = buffer_size / sizeof(__s32); if (buffer_size) { for (i = 0; i < buffer_size; ++i) { hid_set_field(report->field[field_index], i, -- cgit v0.10.2 From c24ca5be763a402c7fce0bd073c17de901ad5e38 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 3 Nov 2015 23:09:58 -0500 Subject: drm/mgag200/mgag200_mode.c: fix wrong do_div() usage do_div() is meant to be used with an unsigned dividend. Signed-off-by: Nicolas Pitre diff --git a/drivers/gpu/drm/mgag200/mgag200_mode.c b/drivers/gpu/drm/mgag200/mgag200_mode.c index c99d3fe..1d4c480 100644 --- a/drivers/gpu/drm/mgag200/mgag200_mode.c +++ b/drivers/gpu/drm/mgag200/mgag200_mode.c @@ -1564,7 +1564,7 @@ static uint32_t mga_vga_calculate_mode_bandwidth(struct drm_display_mode *mode, int bits_per_pixel) { uint32_t total_area, divisor; - int64_t active_area, pixels_per_second, bandwidth; + uint64_t active_area, pixels_per_second, bandwidth; uint64_t bytes_per_pixel = (bits_per_pixel + 7) / 8; divisor = 1024; -- cgit v0.10.2 From 1c07db46511f0d2335d3b32008f644164071d13e Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 3 Nov 2015 23:09:58 -0500 Subject: mtd/sm_ftl.c: fix wrong do_div() usage do_div() is meant to be used with an unsigned dividend. Signed-off-by: Nicolas Pitre diff --git a/drivers/mtd/sm_ftl.c b/drivers/mtd/sm_ftl.c index c23184a..b096f8b 100644 --- a/drivers/mtd/sm_ftl.c +++ b/drivers/mtd/sm_ftl.c @@ -206,9 +206,10 @@ static loff_t sm_mkoffset(struct sm_ftl *ftl, int zone, int block, int boffset) } /* Breaks offset into parts */ -static void sm_break_offset(struct sm_ftl *ftl, loff_t offset, +static void sm_break_offset(struct sm_ftl *ftl, loff_t loffset, int *zone, int *block, int *boffset) { + u64 offset = loffset; *boffset = do_div(offset, ftl->block_size); *block = do_div(offset, ftl->max_lba); *zone = offset >= ftl->zone_count ? -1 : offset; -- cgit v0.10.2 From 911918aa7ef6f868c135505b0015e42072c54682 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 2 Nov 2015 14:55:05 -0500 Subject: div64.h: optimize do_div() for power-of-two constant divisors Let's perform the obvious mask and shift operation in this case. On 32-bit targets, gcc is able to do the same thing with a constant divisor that happens to be a power of two i.e. it turns the division into an inline shift, but it doesn't hurt to be explicit. 
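The identities this fast path relies on, for b = 2^k, are n % b == n & (b - 1) and n / b == n >> k with k = ilog2(b). Below is a user-space sketch checking them over a few sample values; __builtin_ctz() stands in for the kernel's ilog2() here, an assumption of the demo that only holds for powers of two.

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
      /* Mask-and-shift equivalence used by the do_div() power-of-2 path. */
      uint64_t samples[] = { 0, 1, 7, 1ULL << 32, 0xdeadbeefcafef00dULL, ~0ULL };
      uint32_t bases[] = { 1, 2, 8, 4096, 1U << 31 };

      for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
          for (unsigned j = 0; j < sizeof(bases) / sizeof(bases[0]); j++) {
              uint64_t n = samples[i];
              uint32_t b = bases[j];
              unsigned k = __builtin_ctz(b);    /* ilog2(b) for a power of 2 */

              assert((n & (b - 1)) == n % b);   /* remainder by masking */
              assert((n >> k) == n / b);        /* quotient by shifting */
          }
      }
      printf("mask/shift identities hold for all samples\n");
      return 0;
  }
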
Signed-off-by: Nicolas Pitre diff --git a/include/asm-generic/div64.h b/include/asm-generic/div64.h index 8f4e319..5d97468 100644 --- a/include/asm-generic/div64.h +++ b/include/asm-generic/div64.h @@ -32,6 +32,8 @@ #elif BITS_PER_LONG == 32 +#include + extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor); /* The unnecessary pointer compare is there @@ -41,7 +43,11 @@ extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor); uint32_t __base = (base); \ uint32_t __rem; \ (void)(((typeof((n)) *)0) == ((uint64_t *)0)); \ - if (likely(((n) >> 32) == 0)) { \ + if (__builtin_constant_p(__base) && \ + is_power_of_2(__base)) { \ + __rem = (n) & (__base - 1); \ + (n) >>= ilog2(__base); \ + } else if (likely(((n) >> 32) == 0)) { \ __rem = (uint32_t)(n) % __base; \ (n) = (uint32_t)(n) / __base; \ } else \ -- cgit v0.10.2 From 461a5e51060c93f5844113f4be9dba513cc92830 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Fri, 30 Oct 2015 15:36:39 -0400 Subject: do_div(): generic optimization for constant divisor on 32-bit machines 64-by-32-bit divisions are prominent in the kernel, even on 32-bit machines. Luckily, many of them use a constant divisor that allows for a much faster multiplication by the divisor's reciprocal. The compiler already performs this optimization when compiling a 32-by-32 division with a constant divisor. Unfortunately, on 32-bit machines, gcc does not optimize 64-by-32 divisions in that case, except for constant divisors that happen to be a power of 2. Let's avoid the slow path whenever the divisor is constant by manually computing the reciprocal ourselves and performing the multiplication inline. In most cases, this improves performance of 64-by-32 divisions by about two orders of magnitude compared to the __div64_32() fallback, especially on architectures lacking a native div instruction. The algorithm used here comes from the existing ARM code. The __div64_const32_is_OK macro can be predefined by architectures to disable this optimization in some cases. For example, some ancient gcc version on ARM would crash with an ICE when fed this code. Signed-off-by: Nicolas Pitre Acked-by: Alexey Brodkin diff --git a/include/asm-generic/div64.h b/include/asm-generic/div64.h index 5d97468..5a1bf1a 100644 --- a/include/asm-generic/div64.h +++ b/include/asm-generic/div64.h @@ -4,6 +4,9 @@ * Copyright (C) 2003 Bernardo Innocenti * Based on former asm-ppc/div64.h and asm-m68knommu/div64.h * + * Optimization for constant divisors on 32-bit machines: + * Copyright (C) 2006-2015 Nicolas Pitre + * * The semantics of do_div() are: * * uint32_t do_div(uint64_t *n, uint32_t base) @@ -34,6 +37,142 @@ #include +/* + * If the divisor happens to be constant, we determine the appropriate + * inverse at compile time to turn the division into a few inline + * multiplications which ought to be much faster. And yet only if compiling + * with a sufficiently recent gcc version to perform proper 64-bit constant + * propagation. + * + * (It is unfortunate that gcc doesn't perform all this internally.) + */ + +#ifndef __div64_const32_is_OK +#define __div64_const32_is_OK (__GNUC__ >= 4) +#endif + +#define __div64_const32(n, ___b) \ +({ \ + /* \ + * Multiplication by reciprocal of b: n / b = n * (p / b) / p \ + * \ + * We rely on the fact that most of this code gets optimized \ + * away at compile time due to constant propagation and only \ + * a few multiplication instructions should remain. \ + * Hence this monstrous macro (static inline doesn't always \ + * do the trick here). 
\ + */ \ + uint64_t ___res, ___x, ___t, ___m, ___n = (n); \ + uint32_t ___p, ___bias, ___m_lo, ___m_hi, ___n_lo, ___n_hi; \ + \ + /* determine MSB of b */ \ + ___p = 1 << ilog2(___b); \ + \ + /* compute m = ((p << 64) + b - 1) / b */ \ + ___m = (~0ULL / ___b) * ___p; \ + ___m += (((~0ULL % ___b + 1) * ___p) + ___b - 1) / ___b; \ + \ + /* one less than the dividend with highest result */ \ + ___x = ~0ULL / ___b * ___b - 1; \ + \ + /* test our ___m with res = m * x / (p << 64) */ \ + ___res = ((___m & 0xffffffff) * (___x & 0xffffffff)) >> 32; \ + ___t = ___res += (___m & 0xffffffff) * (___x >> 32); \ + ___res += (___x & 0xffffffff) * (___m >> 32); \ + ___t = (___res < ___t) ? (1ULL << 32) : 0; \ + ___res = (___res >> 32) + ___t; \ + ___res += (___m >> 32) * (___x >> 32); \ + ___res /= ___p; \ + \ + /* Now sanitize and optimize what we've got. */ \ + if (~0ULL % (___b / (___b & -___b)) == 0) { \ + /* special case, can be simplified to ... */ \ + ___n /= (___b & -___b); \ + ___m = ~0ULL / (___b / (___b & -___b)); \ + ___p = 1; \ + ___bias = 1; \ + } else if (___res != ___x / ___b) { \ + /* \ + * We can't get away without a bias to compensate \ + * for bit truncation errors. To avoid it we'd need an \ + * additional bit to represent m which would overflow \ + * a 64-bit variable. \ + * \ + * Instead we do m = p / b and n / b = (n * m + m) / p. \ + */ \ + ___bias = 1; \ + /* Compute m = (p << 64) / b */ \ + ___m = (~0ULL / ___b) * ___p; \ + ___m += ((~0ULL % ___b + 1) * ___p) / ___b; \ + } else { \ + /* \ + * Reduce m / p, and try to clear bit 31 of m when \ + * possible, otherwise that'll need extra overflow \ + * handling later. \ + */ \ + uint32_t ___bits = -(___m & -___m); \ + ___bits |= ___m >> 32; \ + ___bits = (~___bits) << 1; \ + /* \ + * If ___bits == 0 then setting bit 31 is unavoidable. \ + * Simply apply the maximum possible reduction in that \ + * case. Otherwise the MSB of ___bits indicates the \ + * best reduction we should apply. \ + */ \ + if (!___bits) { \ + ___p /= (___m & -___m); \ + ___m /= (___m & -___m); \ + } else { \ + ___p >>= ilog2(___bits); \ + ___m >>= ilog2(___bits); \ + } \ + /* No bias needed. */ \ + ___bias = 0; \ + } \ + \ + /* \ + * Now we have a combination of 2 conditions: \ + * \ + * 1) whether or not we need to apply a bias, and \ + * \ + * 2) whether or not there might be an overflow in the cross \ + * product determined by (___m & ((1 << 63) | (1 << 31))). \ + * \ + * Select the best way to do (m_bias + m * n) / (p << 64). \ + * From now on there will be actual runtime code generated. \ + */ \ + \ + ___m_lo = ___m; \ + ___m_hi = ___m >> 32; \ + ___n_lo = ___n; \ + ___n_hi = ___n >> 32; \ + \ + if (!___bias) { \ + ___res = ((uint64_t)___m_lo * ___n_lo) >> 32; \ + } else if (!(___m & ((1ULL << 63) | (1ULL << 31)))) { \ + ___res = (___m + (uint64_t)___m_lo * ___n_lo) >> 32; \ + } else { \ + ___res = ___m + (uint64_t)___m_lo * ___n_lo; \ + ___t = (___res < ___m) ? (1ULL << 32) : 0; \ + ___res = (___res >> 32) + ___t; \ + } \ + \ + if (!(___m & ((1ULL << 63) | (1ULL << 31)))) { \ + ___res += (uint64_t)___m_lo * ___n_hi; \ + ___res += (uint64_t)___m_hi * ___n_lo; \ + ___res >>= 32; \ + } else { \ + ___t = ___res += (uint64_t)___m_lo * ___n_hi; \ + ___res += (uint64_t)___m_hi * ___n_lo; \ + ___t = (___res < ___t) ? 
(1ULL << 32) : 0; \ + ___res = (___res >> 32) + ___t; \ + } \ + \ + ___res += (uint64_t)___m_hi * ___n_hi; \ + \ + ___res /= ___p; \ +}) + extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor); /* The unnecessary pointer compare is there @@ -47,6 +186,14 @@ extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor); is_power_of_2(__base)) { \ __rem = (n) & (__base - 1); \ (n) >>= ilog2(__base); \ + } else if (__div64_const32_is_OK && \ + __builtin_constant_p(__base) && \ + __base != 0) { \ + uint32_t __res_lo, __n_lo = (n); \ + (n) = __div64_const32(n, __base); \ + /* the remainder can be computed with 32-bit regs */ \ + __res_lo = (n); \ + __rem = __n_lo - __res_lo * __base; \ } else if (likely(((n) >> 32) == 0)) { \ __rem = (uint32_t)(n) % __base; \ (n) = (uint32_t)(n) / __base; \ -- cgit v0.10.2 From f682b27c57aec2f0ca8927f9bb7c267c6165ad5a Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Fri, 30 Oct 2015 17:54:56 -0400 Subject: __div64_const32(): abstract out the actual 128-bit cross product code The default C implementation for the 128-bit cross product is abstracted into the __arch_xprod_64() macro that can be overridden to let architectures provide their own assembly optimized implementation. There are many advantages to an assembly version for this operation. Carry bit handling becomes trivial, and 32-bit shifts may be achieved simply by inverting register pairs on some architectures. This has the potential to be quite faster and use much fewer instructions. Signed-off-by: Nicolas Pitre diff --git a/include/asm-generic/div64.h b/include/asm-generic/div64.h index 5a1bf1a..408856a 100644 --- a/include/asm-generic/div64.h +++ b/include/asm-generic/div64.h @@ -63,7 +63,7 @@ * do the trick here). \ */ \ uint64_t ___res, ___x, ___t, ___m, ___n = (n); \ - uint32_t ___p, ___bias, ___m_lo, ___m_hi, ___n_lo, ___n_hi; \ + uint32_t ___p, ___bias; \ \ /* determine MSB of b */ \ ___p = 1 << ilog2(___b); \ @@ -138,41 +138,62 @@ * 2) whether or not there might be an overflow in the cross \ * product determined by (___m & ((1 << 63) | (1 << 31))). \ * \ - * Select the best way to do (m_bias + m * n) / (p << 64). \ + * Select the best way to do (m_bias + m * n) / (1 << 64). \ * From now on there will be actual runtime code generated. \ */ \ - \ - ___m_lo = ___m; \ - ___m_hi = ___m >> 32; \ - ___n_lo = ___n; \ - ___n_hi = ___n >> 32; \ - \ - if (!___bias) { \ - ___res = ((uint64_t)___m_lo * ___n_lo) >> 32; \ - } else if (!(___m & ((1ULL << 63) | (1ULL << 31)))) { \ - ___res = (___m + (uint64_t)___m_lo * ___n_lo) >> 32; \ - } else { \ - ___res = ___m + (uint64_t)___m_lo * ___n_lo; \ - ___t = (___res < ___m) ? (1ULL << 32) : 0; \ - ___res = (___res >> 32) + ___t; \ - } \ - \ - if (!(___m & ((1ULL << 63) | (1ULL << 31)))) { \ - ___res += (uint64_t)___m_lo * ___n_hi; \ - ___res += (uint64_t)___m_hi * ___n_lo; \ - ___res >>= 32; \ - } else { \ - ___t = ___res += (uint64_t)___m_lo * ___n_hi; \ - ___res += (uint64_t)___m_hi * ___n_lo; \ - ___t = (___res < ___t) ? (1ULL << 32) : 0; \ - ___res = (___res >> 32) + ___t; \ - } \ - \ - ___res += (uint64_t)___m_hi * ___n_hi; \ + ___res = __arch_xprod_64(___m, ___n, ___bias); \ \ ___res /= ___p; \ }) +#ifndef __arch_xprod_64 +/* + * Default C implementation for __arch_xprod_64() + * + * Prototype: uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias) + * Semantic: retval = ((bias ? m : 0) + m * n) >> 64 + * + * The product is a 128-bit value, scaled down to 64 bits. 
+ * Assuming constant propagation to optimize away unused conditional code. + * Architectures may provide their own optimized assembly implementation. + */ +static inline uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias) +{ + uint32_t m_lo = m; + uint32_t m_hi = m >> 32; + uint32_t n_lo = n; + uint32_t n_hi = n >> 32; + uint64_t res, tmp; + + if (!bias) { + res = ((uint64_t)m_lo * n_lo) >> 32; + } else if (!(m & ((1ULL << 63) | (1ULL << 31)))) { + /* there can't be any overflow here */ + res = (m + (uint64_t)m_lo * n_lo) >> 32; + } else { + res = m + (uint64_t)m_lo * n_lo; + tmp = (res < m) ? (1ULL << 32) : 0; + res = (res >> 32) + tmp; + } + + if (!(m & ((1ULL << 63) | (1ULL << 31)))) { + /* there can't be any overflow here */ + res += (uint64_t)m_lo * n_hi; + res += (uint64_t)m_hi * n_lo; + res >>= 32; + } else { + tmp = res += (uint64_t)m_lo * n_hi; + res += (uint64_t)m_hi * n_lo; + tmp = (res < tmp) ? (1ULL << 32) : 0; + res = (res >> 32) + tmp; + } + + res += (uint64_t)m_hi * n_hi; + + return res; +} +#endif + extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor); /* The unnecessary pointer compare is there -- cgit v0.10.2 From dce1eb93b19b2a1a441708f51c97c4a554054d00 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 2 Nov 2015 13:02:46 -0500 Subject: __div64_32(): make it overridable at compile time Some architectures may want to override the default implementation at compile time to do things inline. For example, ARM uses a non-standard calling convention for better efficiency in this case. Signed-off-by: Nicolas Pitre diff --git a/include/asm-generic/div64.h b/include/asm-generic/div64.h index 408856a..163f779 100644 --- a/include/asm-generic/div64.h +++ b/include/asm-generic/div64.h @@ -194,7 +194,9 @@ static inline uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias) } #endif +#ifndef __div64_32 extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor); +#endif /* The unnecessary pointer compare is there * to check for type safety (n must be 64bit) diff --git a/lib/div64.c b/lib/div64.c index 62a698a..7f34525 100644 --- a/lib/div64.c +++ b/lib/div64.c @@ -13,7 +13,8 @@ * * Code generated for this function might be very inefficient * for some CPUs. __div64_32() can be overridden by linking arch-specific - * assembly versions such as arch/ppc/lib/div64.S and arch/sh/lib/div64.S. + * assembly versions such as arch/ppc/lib/div64.S and arch/sh/lib/div64.S + * or by defining a preprocessor macro in arch/include/asm/div64.h. */ #include @@ -23,6 +24,7 @@ /* Not needed on 64bit architectures */ #if BITS_PER_LONG == 32 +#ifndef __div64_32 uint32_t __attribute__((weak)) __div64_32(uint64_t *n, uint32_t base) { uint64_t rem = *n; @@ -55,8 +57,8 @@ uint32_t __attribute__((weak)) __div64_32(uint64_t *n, uint32_t base) *n = res; return rem; } - EXPORT_SYMBOL(__div64_32); +#endif #ifndef div_s64_rem s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder) -- cgit v0.10.2 From 040b323b5012b5503561ec7fe15cccd6a4bcaec2 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 2 Nov 2015 14:20:41 -0500 Subject: ARM: asm/div64.h: adjust to generic codde Now that the constant divisor optimization is made generic, adapt the ARM case to it. 
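Any __arch_xprod_64() override, including the ARM umull/umlal version below, has to honour the contract spelled out in the generic header: retval = ((bias ? m : 0) + m * n) >> 64. As a sanity check, the user-space harness below copies the portable C implementation from the earlier patch and compares it against GCC's unsigned __int128 on a 64-bit host; the __int128 reference and the sample values are assumptions of this demo, not kernel code.

  #include <assert.h>
  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Portable __arch_xprod_64(): ((bias ? m : 0) + m * n) >> 64 built from
   * 32x32->64 multiplies, as in the asm-generic patch above. */
  static uint64_t xprod_64(uint64_t m, uint64_t n, bool bias)
  {
      uint32_t m_lo = m, m_hi = m >> 32;
      uint32_t n_lo = n, n_hi = n >> 32;
      uint64_t res, tmp;

      if (!bias) {
          res = ((uint64_t)m_lo * n_lo) >> 32;
      } else if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
          res = (m + (uint64_t)m_lo * n_lo) >> 32;    /* cannot overflow */
      } else {
          res = m + (uint64_t)m_lo * n_lo;
          tmp = (res < m) ? (1ULL << 32) : 0;         /* carry out of bit 63 */
          res = (res >> 32) + tmp;
      }

      if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
          res += (uint64_t)m_lo * n_hi;
          res += (uint64_t)m_hi * n_lo;
          res >>= 32;
      } else {
          tmp = res += (uint64_t)m_lo * n_hi;
          res += (uint64_t)m_hi * n_lo;
          tmp = (res < tmp) ? (1ULL << 32) : 0;       /* carry of the cross sum */
          res = (res >> 32) + tmp;
      }
      return res + (uint64_t)m_hi * n_hi;
  }

  int main(void)
  {
      uint64_t vals[] = { 0, 1, 3, 0x7fffffffULL, 0x80000000ULL,
                          0xfffffffeULL, 0x100000001ULL,
                          0xaaaaaaaa55555555ULL, ~0ULL };
      unsigned count = sizeof(vals) / sizeof(vals[0]);

      for (unsigned i = 0; i < count; i++)
          for (unsigned j = 0; j < count; j++)
              for (int bias = 0; bias <= 1; bias++) {
                  uint64_t m = vals[i], n = vals[j];
                  unsigned __int128 ref =
                      (bias ? m : 0) + (unsigned __int128)m * n;
                  assert(xprod_64(m, n, bias) == (uint64_t)(ref >> 64));
              }
      printf("xprod_64 matches the 128-bit reference\n");
      return 0;
  }
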
Signed-off-by: Nicolas Pitre diff --git a/arch/arm/include/asm/div64.h b/arch/arm/include/asm/div64.h index 662c7bd..e1f0776 100644 --- a/arch/arm/include/asm/div64.h +++ b/arch/arm/include/asm/div64.h @@ -5,9 +5,9 @@ #include /* - * The semantics of do_div() are: + * The semantics of __div64_32() are: * - * uint32_t do_div(uint64_t *n, uint32_t base) + * uint32_t __div64_32(uint64_t *n, uint32_t base) * { * uint32_t remainder = *n % base; * *n = *n / base; @@ -16,8 +16,9 @@ * * In other words, a 64-bit dividend with a 32-bit divisor producing * a 64-bit result and a 32-bit remainder. To accomplish this optimally - * we call a special __do_div64 helper with completely non standard - * calling convention for arguments and results (beware). + * we override the generic version in lib/div64.c to call our __do_div64 + * assembly implementation with completely non standard calling convention + * for arguments and results (beware). */ #ifdef __ARMEB__ @@ -28,199 +29,101 @@ #define __xh "r1" #endif -#define __do_div_asm(n, base) \ -({ \ - register unsigned int __base asm("r4") = base; \ - register unsigned long long __n asm("r0") = n; \ - register unsigned long long __res asm("r2"); \ - register unsigned int __rem asm(__xh); \ - asm( __asmeq("%0", __xh) \ - __asmeq("%1", "r2") \ - __asmeq("%2", "r0") \ - __asmeq("%3", "r4") \ - "bl __do_div64" \ - : "=r" (__rem), "=r" (__res) \ - : "r" (__n), "r" (__base) \ - : "ip", "lr", "cc"); \ - n = __res; \ - __rem; \ -}) - -#if __GNUC__ < 4 || !defined(CONFIG_AEABI) +static inline uint32_t __div64_32(uint64_t *n, uint32_t base) +{ + register unsigned int __base asm("r4") = base; + register unsigned long long __n asm("r0") = *n; + register unsigned long long __res asm("r2"); + register unsigned int __rem asm(__xh); + asm( __asmeq("%0", __xh) + __asmeq("%1", "r2") + __asmeq("%2", "r0") + __asmeq("%3", "r4") + "bl __do_div64" + : "=r" (__rem), "=r" (__res) + : "r" (__n), "r" (__base) + : "ip", "lr", "cc"); + *n = __res; + return __rem; +} +#define __div64_32 __div64_32 + +#if !defined(CONFIG_AEABI) /* - * gcc versions earlier than 4.0 are simply too problematic for the - * optimized implementation below. First there is gcc PR 15089 that - * tend to trig on more complex constructs, spurious .global __udivsi3 - * are inserted even if none of those symbols are referenced in the - * generated code, and those gcc versions are not able to do constant - * propagation on long long values anyway. + * In OABI configurations, some uses of the do_div function + * cause gcc to run out of registers. To work around that, + * we can force the use of the out-of-line version for + * configurations that build a OABI kernel. */ -#define do_div(n, base) __do_div_asm(n, base) - -#elif __GNUC__ >= 4 +#define do_div(n, base) __div64_32(&(n), base) -#include +#else /* - * If the divisor happens to be constant, we determine the appropriate - * inverse at compile time to turn the division into a few inline - * multiplications instead which is much faster. And yet only if compiling - * for ARMv4 or higher (we need umull/umlal) and if the gcc version is - * sufficiently recent to perform proper long long constant propagation. - * (It is unfortunate that gcc doesn't perform all this internally.) + * gcc versions earlier than 4.0 are simply too problematic for the + * __div64_const32() code in asm-generic/div64.h. 
First there is + * gcc PR 15089 that tend to trig on more complex constructs, spurious + * .global __udivsi3 are inserted even if none of those symbols are + * referenced in the generated code, and those gcc versions are not able + * to do constant propagation on long long values anyway. */ -#define do_div(n, base) \ -({ \ - unsigned int __r, __b = (base); \ - if (!__builtin_constant_p(__b) || __b == 0 || \ - (__LINUX_ARM_ARCH__ < 4 && (__b & (__b - 1)) != 0)) { \ - /* non-constant divisor (or zero): slow path */ \ - __r = __do_div_asm(n, __b); \ - } else if ((__b & (__b - 1)) == 0) { \ - /* Trivial: __b is constant and a power of 2 */ \ - /* gcc does the right thing with this code. */ \ - __r = n; \ - __r &= (__b - 1); \ - n /= __b; \ - } else { \ - /* Multiply by inverse of __b: n/b = n*(p/b)/p */ \ - /* We rely on the fact that most of this code gets */ \ - /* optimized away at compile time due to constant */ \ - /* propagation and only a couple inline assembly */ \ - /* instructions should remain. Better avoid any */ \ - /* code construct that might prevent that. */ \ - unsigned long long __res, __x, __t, __m, __n = n; \ - unsigned int __c, __p, __z = 0; \ - /* preserve low part of n for reminder computation */ \ - __r = __n; \ - /* determine number of bits to represent __b */ \ - __p = 1 << __div64_fls(__b); \ - /* compute __m = ((__p << 64) + __b - 1) / __b */ \ - __m = (~0ULL / __b) * __p; \ - __m += (((~0ULL % __b + 1) * __p) + __b - 1) / __b; \ - /* compute __res = __m*(~0ULL/__b*__b-1)/(__p << 64) */ \ - __x = ~0ULL / __b * __b - 1; \ - __res = (__m & 0xffffffff) * (__x & 0xffffffff); \ - __res >>= 32; \ - __res += (__m & 0xffffffff) * (__x >> 32); \ - __t = __res; \ - __res += (__x & 0xffffffff) * (__m >> 32); \ - __t = (__res < __t) ? (1ULL << 32) : 0; \ - __res = (__res >> 32) + __t; \ - __res += (__m >> 32) * (__x >> 32); \ - __res /= __p; \ - /* Now sanitize and optimize what we've got. */ \ - if (~0ULL % (__b / (__b & -__b)) == 0) { \ - /* those cases can be simplified with: */ \ - __n /= (__b & -__b); \ - __m = ~0ULL / (__b / (__b & -__b)); \ - __p = 1; \ - __c = 1; \ - } else if (__res != __x / __b) { \ - /* We can't get away without a correction */ \ - /* to compensate for bit truncation errors. */ \ - /* To avoid it we'd need an additional bit */ \ - /* to represent __m which would overflow it. */ \ - /* Instead we do m=p/b and n/b=(n*m+m)/p. */ \ - __c = 1; \ - /* Compute __m = (__p << 64) / __b */ \ - __m = (~0ULL / __b) * __p; \ - __m += ((~0ULL % __b + 1) * __p) / __b; \ - } else { \ - /* Reduce __m/__p, and try to clear bit 31 */ \ - /* of __m when possible otherwise that'll */ \ - /* need extra overflow handling later. */ \ - unsigned int __bits = -(__m & -__m); \ - __bits |= __m >> 32; \ - __bits = (~__bits) << 1; \ - /* If __bits == 0 then setting bit 31 is */ \ - /* unavoidable. Simply apply the maximum */ \ - /* possible reduction in that case. */ \ - /* Otherwise the MSB of __bits indicates the */ \ - /* best reduction we should apply. */ \ - if (!__bits) { \ - __p /= (__m & -__m); \ - __m /= (__m & -__m); \ - } else { \ - __p >>= __div64_fls(__bits); \ - __m >>= __div64_fls(__bits); \ - } \ - /* No correction needed. 
*/ \ - __c = 0; \ - } \ - /* Now we have a combination of 2 conditions: */ \ - /* 1) whether or not we need a correction (__c), and */ \ - /* 2) whether or not there might be an overflow in */ \ - /* the cross product (__m & ((1<<63) | (1<<31))) */ \ - /* Select the best insn combination to perform the */ \ - /* actual __m * __n / (__p << 64) operation. */ \ - if (!__c) { \ - asm ( "umull %Q0, %R0, %Q1, %Q2\n\t" \ - "mov %Q0, #0" \ - : "=&r" (__res) \ - : "r" (__m), "r" (__n) \ - : "cc" ); \ - } else if (!(__m & ((1ULL << 63) | (1ULL << 31)))) { \ - __res = __m; \ - asm ( "umlal %Q0, %R0, %Q1, %Q2\n\t" \ - "mov %Q0, #0" \ - : "+&r" (__res) \ - : "r" (__m), "r" (__n) \ - : "cc" ); \ - } else { \ - asm ( "umull %Q0, %R0, %Q1, %Q2\n\t" \ - "cmn %Q0, %Q1\n\t" \ - "adcs %R0, %R0, %R1\n\t" \ - "adc %Q0, %3, #0" \ - : "=&r" (__res) \ - : "r" (__m), "r" (__n), "r" (__z) \ - : "cc" ); \ - } \ - if (!(__m & ((1ULL << 63) | (1ULL << 31)))) { \ - asm ( "umlal %R0, %Q0, %R1, %Q2\n\t" \ - "umlal %R0, %Q0, %Q1, %R2\n\t" \ - "mov %R0, #0\n\t" \ - "umlal %Q0, %R0, %R1, %R2" \ - : "+&r" (__res) \ - : "r" (__m), "r" (__n) \ - : "cc" ); \ - } else { \ - asm ( "umlal %R0, %Q0, %R2, %Q3\n\t" \ - "umlal %R0, %1, %Q2, %R3\n\t" \ - "mov %R0, #0\n\t" \ - "adds %Q0, %1, %Q0\n\t" \ - "adc %R0, %R0, #0\n\t" \ - "umlal %Q0, %R0, %R2, %R3" \ - : "+&r" (__res), "+&r" (__z) \ - : "r" (__m), "r" (__n) \ - : "cc" ); \ - } \ - __res /= __p; \ - /* The reminder can be computed with 32-bit regs */ \ - /* only, and gcc is good at that. */ \ - { \ - unsigned int __res0 = __res; \ - unsigned int __b0 = __b; \ - __r -= __res0 * __b0; \ - } \ - /* BUG_ON(__r >= __b || __res * __b + __r != n); */ \ - n = __res; \ - } \ - __r; \ -}) - -/* our own fls implementation to make sure constant propagation is fine */ -#define __div64_fls(bits) \ -({ \ - unsigned int __left = (bits), __nr = 0; \ - if (__left & 0xffff0000) __nr += 16, __left >>= 16; \ - if (__left & 0x0000ff00) __nr += 8, __left >>= 8; \ - if (__left & 0x000000f0) __nr += 4, __left >>= 4; \ - if (__left & 0x0000000c) __nr += 2, __left >>= 2; \ - if (__left & 0x00000002) __nr += 1; \ - __nr; \ -}) + +#define __div64_const32_is_OK (__GNUC__ >= 4) + +static inline uint64_t __arch_xprod_64(uint64_t m, uint64_t n, bool bias) +{ + unsigned long long res; + unsigned int tmp = 0; + + if (!bias) { + asm ( "umull %Q0, %R0, %Q1, %Q2\n\t" + "mov %Q0, #0" + : "=&r" (res) + : "r" (m), "r" (n) + : "cc"); + } else if (!(m & ((1ULL << 63) | (1ULL << 31)))) { + res = m; + asm ( "umlal %Q0, %R0, %Q1, %Q2\n\t" + "mov %Q0, #0" + : "+&r" (res) + : "r" (m), "r" (n) + : "cc"); + } else { + asm ( "umull %Q0, %R0, %Q1, %Q2\n\t" + "cmn %Q0, %Q1\n\t" + "adcs %R0, %R0, %R1\n\t" + "adc %Q0, %3, #0" + : "=&r" (res) + : "r" (m), "r" (n), "r" (tmp) + : "cc"); + } + + if (!(m & ((1ULL << 63) | (1ULL << 31)))) { + asm ( "umlal %R0, %Q0, %R1, %Q2\n\t" + "umlal %R0, %Q0, %Q1, %R2\n\t" + "mov %R0, #0\n\t" + "umlal %Q0, %R0, %R1, %R2" + : "+&r" (res) + : "r" (m), "r" (n) + : "cc"); + } else { + asm ( "umlal %R0, %Q0, %R2, %Q3\n\t" + "umlal %R0, %1, %Q2, %R3\n\t" + "mov %R0, #0\n\t" + "adds %Q0, %1, %Q0\n\t" + "adc %R0, %R0, #0\n\t" + "umlal %Q0, %R0, %R2, %R3" + : "+&r" (res), "+&r" (tmp) + : "r" (m), "r" (n) + : "cc"); + } + + return res; +} +#define __arch_xprod_64 __arch_xprod_64 + +#include #endif -- cgit v0.10.2
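As a closing illustration of the semantics every variant in this series agrees on, here is a user-space model of __div64_32() as documented in the ARM header comment above: 64-bit dividend, 32-bit divisor, quotient written back, 32-bit remainder returned. It is a plain restoring shift-and-subtract sketch checked against the host's native 64-bit division, not the kernel's lib/div64.c implementation nor the ARM __do_div64 assembly.

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Model of the __div64_32() contract: *n / base with the quotient
   * written back and the 32-bit remainder returned. */
  static uint32_t div64_32_model(uint64_t *n, uint32_t base)
  {
      uint64_t rem = 0, quot = 0;

      for (int bit = 63; bit >= 0; bit--) {
          rem = (rem << 1) | ((*n >> bit) & 1);
          quot <<= 1;
          if (rem >= base) {
              rem -= base;            /* restoring division step */
              quot |= 1;
          }
      }
      *n = quot;
      return rem;
  }

  int main(void)
  {
      uint64_t samples[] = { 0, 1, 999, 0x100000000ULL,
                             0xdeadbeefcafef00dULL, ~0ULL };
      uint32_t bases[] = { 1, 3, 7, 1000, 0x80000000U, 0xffffffffU };

      for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
          for (unsigned j = 0; j < sizeof(bases) / sizeof(bases[0]); j++) {
              uint64_t n = samples[i];
              uint32_t rem = div64_32_model(&n, bases[j]);

              assert(n == samples[i] / bases[j]);
              assert(rem == samples[i] % bases[j]);
          }
      printf("div64_32_model matches native 64-bit division\n");
      return 0;
  }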