From f4bcbe792b8f434e32487cff9d9e30ab45a3ce02 Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Fri, 20 May 2016 07:26:00 -0400 Subject: Pull out string hash to ... so they can be used without the rest of The hashlen_* macros will make sense next patch. Signed-off-by: George Spelvin diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 7e9422c..0f9a977 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -10,6 +10,7 @@ #include #include #include +#include struct path; struct vfsmount; @@ -52,9 +53,6 @@ struct qstr { }; #define QSTR_INIT(n,l) { { { .len = l } }, .name = n } -#define hashlen_hash(hashlen) ((u32) (hashlen)) -#define hashlen_len(hashlen) ((u32)((hashlen) >> 32)) -#define hashlen_create(hash,len) (((u64)(len)<<32)|(u32)(hash)) struct dentry_stat_t { long nr_dentry; @@ -65,29 +63,6 @@ struct dentry_stat_t { }; extern struct dentry_stat_t dentry_stat; -/* Name hashing routines. Initial hash value */ -/* Hash courtesy of the R5 hash in reiserfs modulo sign bits */ -#define init_name_hash() 0 - -/* partial hash update function. Assume roughly 4 bits per character */ -static inline unsigned long -partial_name_hash(unsigned long c, unsigned long prevhash) -{ - return (prevhash + (c << 4) + (c >> 4)) * 11; -} - -/* - * Finally: cut down the number of bits to a int value (and try to avoid - * losing bits) - */ -static inline unsigned long end_name_hash(unsigned long hash) -{ - return (unsigned int) hash; -} - -/* Compute the hash for a name string. */ -extern unsigned int full_name_hash(const unsigned char *, unsigned int); - /* * Try to keep struct dentry aligned on 64 byte cachelines (this will * give reasonable cacheline footprint with larger lines without the diff --git a/include/linux/stringhash.h b/include/linux/stringhash.h new file mode 100644 index 0000000..2eaaaf6 --- /dev/null +++ b/include/linux/stringhash.h @@ -0,0 +1,72 @@ +#ifndef __LINUX_STRINGHASH_H +#define __LINUX_STRINGHASH_H + +#include + +/* + * Routines for hashing strings of bytes to a 32-bit hash value. + * + * These hash functions are NOT GUARANTEED STABLE between kernel + * versions, architectures, or even repeated boots of the same kernel. + * (E.g. they may depend on boot-time hardware detection or be + * deliberately randomized.) + * + * They are also not intended to be secure against collisions caused by + * malicious inputs; much slower hash functions are required for that. + * + * They are optimized for pathname components, meaning short strings. + * Even if a majority of files have longer names, the dynamic profile of + * pathname components skews short due to short directory names. + * (E.g. /usr/lib/libsesquipedalianism.so.3.141.) + */ + +/* + * Version 1: one byte at a time. Example of use: + * + * unsigned long hash = init_name_hash; + * while (*p) + * hash = partial_name_hash(tolower(*p++), hash); + * hash = end_name_hash(hash); + * + * Although this is designed for bytes, fs/hfsplus/unicode.c + * abuses it to hash 16-bit values. + */ + +/* Hash courtesy of the R5 hash in reiserfs modulo sign bits */ +#define init_name_hash() 0 + +/* partial hash update function. Assume roughly 4 bits per character */ +static inline unsigned long +partial_name_hash(unsigned long c, unsigned long prevhash) +{ + return (prevhash + (c << 4) + (c >> 4)) * 11; +} + +/* + * Finally: cut down the number of bits to a int value (and try to avoid + * losing bits) + */ +static inline unsigned long end_name_hash(unsigned long hash) +{ + return (unsigned int)hash; +} + +/* + * Version 2: One word (32 or 64 bits) at a time. + * If CONFIG_DCACHE_WORD_ACCESS is defined (meaning + * exists, which describes major Linux platforms like x86 and ARM), then + * this computes a different hash function much faster. + * + * If not set, this falls back to a wrapper around the preceding. + */ +extern unsigned int full_name_hash(const unsigned char *, unsigned int); + +/* + * A hash_len is a u64 with the hash of a string in the low + * half and the length in the high half. + */ +#define hashlen_hash(hashlen) ((u32)(hashlen)) +#define hashlen_len(hashlen) ((u32)((hashlen) >> 32)) +#define hashlen_create(hash, len) ((u64)(len)<<32 | (u32)(hash)) + +#endif /* __LINUX_STRINGHASH_H */ -- cgit v0.10.2 From fcfd2fbf22d2587196890103d41e3d554c47da0e Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Fri, 20 May 2016 08:41:37 -0400 Subject: fs/namei.c: Add hashlen_string() function We'd like to make more use of the highly-optimized dcache hash functions throughout the kernel, rather than have every subsystem create its own, and a function that hashes basic null-terminated strings is required for that. (The name is to emphasize that it returns both hash and length.) It's actually useful in the dcache itself, specifically d_alloc_name(). Other uses in the next patch. full_name_hash() is also tweaked to make it more generally useful: 1) Take a "char *" rather than "unsigned char *" argument, to be consistent with hash_name(). 2) Handle zero-length inputs. If we want more callers, we don't want to make them worry about corner cases. Signed-off-by: George Spelvin diff --git a/fs/dcache.c b/fs/dcache.c index d5ecc6e..19b7518 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1653,8 +1653,7 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name) struct qstr q; q.name = name; - q.len = strlen(name); - q.hash = full_name_hash(q.name, q.len); + q.hash_len = hashlen_string(name); return d_alloc(parent, &q); } EXPORT_SYMBOL(d_alloc_name); diff --git a/fs/namei.c b/fs/namei.c index 42f8ca0..dd98d43 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1822,19 +1822,20 @@ static inline unsigned long mix_hash(unsigned long hash) #endif -unsigned int full_name_hash(const unsigned char *name, unsigned int len) +/* Return the hash of a string of known length */ +unsigned int full_name_hash(const char *name, unsigned int len) { unsigned long a, hash = 0; for (;;) { + if (!len) + goto done; a = load_unaligned_zeropad(name); if (len < sizeof(unsigned long)) break; hash = mix_hash(hash + a); name += sizeof(unsigned long); len -= sizeof(unsigned long); - if (!len) - goto done; } hash += a & bytemask_from_count(len); done: @@ -1842,6 +1843,29 @@ done: } EXPORT_SYMBOL(full_name_hash); +/* Return the "hash_len" (hash and length) of a null-terminated string */ +u64 hashlen_string(const char *name) +{ + unsigned long a, adata, mask, hash, len; + const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; + + hash = a = 0; + len = -sizeof(unsigned long); + do { + hash = mix_hash(hash + a); + len += sizeof(unsigned long); + a = load_unaligned_zeropad(name+len); + } while (!has_zero(a, &adata, &constants)); + + adata = prep_zero_mask(a, adata, &constants); + mask = create_zero_mask(adata); + hash += a & zero_bytemask(mask); + len += find_zero(mask); + + return hashlen_create(fold_hash(hash), len); +} +EXPORT_SYMBOL(hashlen_string); + /* * Calculate the length and hash of the path component, and * return the "hash_len" as the result. @@ -1872,15 +1896,32 @@ static inline u64 hash_name(const char *name) #else -unsigned int full_name_hash(const unsigned char *name, unsigned int len) +/* Return the hash of a string of known length */ +unsigned int full_name_hash(const char *name, unsigned int len) { unsigned long hash = init_name_hash(); while (len--) - hash = partial_name_hash(*name++, hash); + hash = partial_name_hash((unsigned char)*name++, hash); return end_name_hash(hash); } EXPORT_SYMBOL(full_name_hash); +/* Return the "hash_len" (hash and length) of a null-terminated string */ +u64 hash_string(const char *name) +{ + unsigned long hash = init_name_hash(); + unsigned long len = 0, c; + + c = (unsigned char)*name; + do { + len++; + hash = partial_name_hash(c, hash); + c = (unsigned char)name[len]; + } while (c); + return hashlen_create(end_name_hash(hash), len); +} +EXPORT_SYMBOL(hash_string); + /* * We know there's a real path component here of at least * one character. diff --git a/include/linux/stringhash.h b/include/linux/stringhash.h index 2eaaaf6..451771d 100644 --- a/include/linux/stringhash.h +++ b/include/linux/stringhash.h @@ -1,7 +1,8 @@ #ifndef __LINUX_STRINGHASH_H #define __LINUX_STRINGHASH_H -#include +#include /* For __pure */ +#include /* For u32, u64 */ /* * Routines for hashing strings of bytes to a 32-bit hash value. @@ -59,7 +60,7 @@ static inline unsigned long end_name_hash(unsigned long hash) * * If not set, this falls back to a wrapper around the preceding. */ -extern unsigned int full_name_hash(const unsigned char *, unsigned int); +extern unsigned int __pure full_name_hash(const char *, unsigned int); /* * A hash_len is a u64 with the hash of a string in the low @@ -69,4 +70,7 @@ extern unsigned int full_name_hash(const unsigned char *, unsigned int); #define hashlen_len(hashlen) ((u32)((hashlen) >> 32)) #define hashlen_create(hash, len) ((u64)(len)<<32 | (u32)(hash)) +/* Return the "hash_len" (hash and length) of a null-terminated string */ +extern u64 __pure hashlen_string(const char *name); + #endif /* __LINUX_STRINGHASH_H */ -- cgit v0.10.2 From 917ea166f4672ec085f2cccc135c7c0eec72282c Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Fri, 20 May 2016 13:31:33 -0400 Subject: : Define hash_str() in terms of hashlen_string() Finally, the first use of previous two patches: eliminate the separate ad-hoc string hash functions in the sunrpc code. Now hash_str() is a wrapper around hash_string(), and hash_mem() is likewise a wrapper around full_name_hash(). Note that sunrpc code *does* call hash_mem() with a zero length, which is why the previous patch needed to handle that in full_name_hash(). (Thanks, Bruce, for finding that!) This also eliminates the only caller of hash_long which asks for more than 32 bits of output. The comment about the quality of hashlen_string() and full_name_hash() is jumping the gun by a few patches; they aren't very impressive now, but will be improved greatly later in the series. Signed-off-by: George Spelvin Tested-by: J. Bruce Fields Acked-by: J. Bruce Fields Cc: Jeff Layton Cc: linux-nfs@vger.kernel.org diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h index c00f53a..91d5a5d 100644 --- a/include/linux/sunrpc/svcauth.h +++ b/include/linux/sunrpc/svcauth.h @@ -16,6 +16,7 @@ #include #include #include +#include #include struct svc_cred { @@ -165,41 +166,18 @@ extern int svcauth_unix_set_client(struct svc_rqst *rqstp); extern int unix_gid_cache_create(struct net *net); extern void unix_gid_cache_destroy(struct net *net); -static inline unsigned long hash_str(char *name, int bits) +/* + * The functions are good enough that we don't need to + * use hash_32() on them; just extracting the high bits is enough. + */ +static inline unsigned long hash_str(char const *name, int bits) { - unsigned long hash = 0; - unsigned long l = 0; - int len = 0; - unsigned char c; - do { - if (unlikely(!(c = *name++))) { - c = (char)len; len = -1; - } - l = (l << 8) | c; - len++; - if ((len & (BITS_PER_LONG/8-1))==0) - hash = hash_long(hash^l, BITS_PER_LONG); - } while (len); - return hash >> (BITS_PER_LONG - bits); + return hashlen_hash(hashlen_string(name)) >> (32 - bits); } -static inline unsigned long hash_mem(char *buf, int length, int bits) +static inline unsigned long hash_mem(char const *buf, int length, int bits) { - unsigned long hash = 0; - unsigned long l = 0; - int len = 0; - unsigned char c; - do { - if (len == length) { - c = (char)len; len = -1; - } else - c = *buf++; - l = (l << 8) | c; - len++; - if ((len & (BITS_PER_LONG/8-1))==0) - hash = hash_long(hash^l, BITS_PER_LONG); - } while (len); - return hash >> (BITS_PER_LONG - bits); + return full_name_hash(buf, length) >> (32 - bits); } #endif /* __KERNEL__ */ -- cgit v0.10.2 From 92d567740f2ab5937b2c23bee94ea4b284bb1f98 Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Thu, 26 May 2016 22:22:01 -0400 Subject: Change hash_64() return value to 32 bits That's all that's ever asked for, and it makes the return type of hash_long() consistent. It also allows (upcoming patch) an optimized implementation of hash_64 on 32-bit machines. I tried adding a BUILD_BUG_ON to ensure the number of bits requested was never more than 32 (most callers use a compile-time constant), but adding to breaks the tools/perf compiler unless tools/perf/MANIFEST is updated, and understanding that code base well enough to update it is too much trouble. I did the rest of an allyesconfig build with such a check, and nothing tripped. Signed-off-by: George Spelvin diff --git a/include/linux/hash.h b/include/linux/hash.h index 79c52fa..f967ded 100644 --- a/include/linux/hash.h +++ b/include/linux/hash.h @@ -48,7 +48,7 @@ #define GOLDEN_RATIO_32 0x61C88647 #define GOLDEN_RATIO_64 0x61C8864680B583EBull -static __always_inline u64 hash_64(u64 val, unsigned int bits) +static __always_inline u32 hash_64(u64 val, unsigned int bits) { u64 hash = val; @@ -72,7 +72,7 @@ static __always_inline u64 hash_64(u64 val, unsigned int bits) #endif /* High bits are more random, so use them. */ - return hash >> (64 - bits); + return (u32)(hash >> (64 - bits)); } static inline u32 hash_32(u32 val, unsigned int bits) @@ -84,7 +84,7 @@ static inline u32 hash_32(u32 val, unsigned int bits) return hash >> (32 - bits); } -static inline unsigned long hash_ptr(const void *ptr, unsigned int bits) +static inline u32 hash_ptr(const void *ptr, unsigned int bits) { return hash_long((unsigned long)ptr, bits); } -- cgit v0.10.2 From ef703f49a6c5b909a85149bb6625c4ed0d697186 Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Thu, 26 May 2016 23:00:23 -0400 Subject: Eliminate bad hash multipliers from hash_32() and hash_64() The "simplified" prime multipliers made very bad hash functions, so get rid of them. This completes the work of 689de1d6ca. To avoid the inefficiency which was the motivation for the "simplified" multipliers, hash_64() on 32-bit systems is changed to use a different algorithm. It makes two calls to hash_32() instead. drivers/media/usb/dvb-usb-v2/af9015.c uses the old GOLDEN_RATIO_PRIME_32 for some horrible reason, so it inherits a copy of the old definition. Signed-off-by: George Spelvin Cc: Antti Palosaari Cc: Mauro Carvalho Chehab diff --git a/drivers/media/usb/dvb-usb-v2/af9015.c b/drivers/media/usb/dvb-usb-v2/af9015.c index 95a7388..09e0f58 100644 --- a/drivers/media/usb/dvb-usb-v2/af9015.c +++ b/drivers/media/usb/dvb-usb-v2/af9015.c @@ -398,6 +398,8 @@ error: } #define AF9015_EEPROM_SIZE 256 +/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ +#define GOLDEN_RATIO_PRIME_32 0x9e370001UL /* hash (and dump) eeprom */ static int af9015_eeprom_hash(struct dvb_usb_device *d) diff --git a/include/linux/hash.h b/include/linux/hash.h index f967ded..613cfde 100644 --- a/include/linux/hash.h +++ b/include/linux/hash.h @@ -3,85 +3,65 @@ /* Fast hashing routine for ints, longs and pointers. (C) 2002 Nadia Yvette Chambers, IBM */ -/* - * Knuth recommends primes in approximately golden ratio to the maximum - * integer representable by a machine word for multiplicative hashing. - * Chuck Lever verified the effectiveness of this technique: - * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf - * - * These primes are chosen to be bit-sparse, that is operations on - * them can use shifts and additions instead of multiplications for - * machines where multiplications are slow. - */ - #include #include -/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ -#define GOLDEN_RATIO_PRIME_32 0x9e370001UL -/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ -#define GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001UL - +/* + * The "GOLDEN_RATIO_PRIME" is used in ifs/btrfs/brtfs_inode.h and + * fs/inode.c. It's not actually prime any more (the previous primes + * were actively bad for hashing), but the name remains. + */ #if BITS_PER_LONG == 32 -#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_32 +#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_32 #define hash_long(val, bits) hash_32(val, bits) #elif BITS_PER_LONG == 64 #define hash_long(val, bits) hash_64(val, bits) -#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_64 +#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_64 #else #error Wordsize not 32 or 64 #endif /* - * The above primes are actively bad for hashing, since they are - * too sparse. The 32-bit one is mostly ok, the 64-bit one causes - * real problems. Besides, the "prime" part is pointless for the - * multiplicative hash. + * This hash multiplies the input by a large odd number and takes the + * high bits. Since multiplication propagates changes to the most + * significant end only, it is essential that the high bits of the + * product be used for the hash value. + * + * Chuck Lever verified the effectiveness of this technique: + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf * * Although a random odd number will do, it turns out that the golden * ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice - * properties. + * properties. (See Knuth vol 3, section 6.4, exercise 9.) * - * These are the negative, (1 - phi) = (phi^2) = (3 - sqrt(5))/2. - * (See Knuth vol 3, section 6.4, exercise 9.) + * These are the negative, (1 - phi) = phi**2 = (3 - sqrt(5))/2, + * which is very slightly easier to multiply by and makes no + * difference to the hash distribution. */ #define GOLDEN_RATIO_32 0x61C88647 #define GOLDEN_RATIO_64 0x61C8864680B583EBull -static __always_inline u32 hash_64(u64 val, unsigned int bits) -{ - u64 hash = val; - -#if BITS_PER_LONG == 64 - hash = hash * GOLDEN_RATIO_64; -#else - /* Sigh, gcc can't optimise this alone like it does for 32 bits. */ - u64 n = hash; - n <<= 18; - hash -= n; - n <<= 33; - hash -= n; - n <<= 3; - hash += n; - n <<= 3; - hash -= n; - n <<= 4; - hash += n; - n <<= 2; - hash += n; -#endif - /* High bits are more random, so use them. */ - return (u32)(hash >> (64 - bits)); +static inline u32 __hash_32(u32 val) +{ + return val * GOLDEN_RATIO_32; } static inline u32 hash_32(u32 val, unsigned int bits) { - /* On some cpus multiply is faster, on others gcc will do shifts */ - u32 hash = val * GOLDEN_RATIO_PRIME_32; - /* High bits are more random, so use them. */ - return hash >> (32 - bits); + return __hash_32(val) >> (32 - bits); +} + +static __always_inline u32 hash_64(u64 val, unsigned int bits) +{ +#if BITS_PER_LONG == 64 + /* 64x64-bit multiply is efficient on all 64-bit processors */ + return val * GOLDEN_RATIO_64 >> (64 - bits); +#else + /* Hash 64 bits using only 32x32-bit multiply. */ + return hash_32((u32)val ^ __hash_32(val >> 32), bits); +#endif } static inline u32 hash_ptr(const void *ptr, unsigned int bits) @@ -89,6 +69,7 @@ static inline u32 hash_ptr(const void *ptr, unsigned int bits) return hash_long((unsigned long)ptr, bits); } +/* This really should be called fold32_ptr; it does no hashing to speak of. */ static inline u32 hash32_ptr(const void *ptr) { unsigned long val = (unsigned long)ptr; -- cgit v0.10.2 From 2a18da7a9c7886f1c7307f8d3f23f24318583f03 Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Mon, 23 May 2016 07:43:58 -0400 Subject: fs/namei.c: Improve dcache hash function Patch 0fed3ac866 improved the hash mixing, but the function is slower than necessary; there's a 7-instruction dependency chain (10 on x86) each loop iteration. Word-at-a-time access is a very tight loop (which is good, because link_path_walk() is one of the hottest code paths in the entire kernel), and the hash mixing function must not have a longer latency to avoid slowing it down. There do not appear to be any published fast hash functions that: 1) Operate on the input a word at a time, and 2) Don't need to know the length of the input beforehand, and 3) Have a single iterated mixing function, not needing conditional branches or unrolling to distinguish different loop iterations. One of the algorithms which comes closest is Yann Collet's xxHash, but that's two dependent multiplies per word, which is too much. The key insights in this design are: 1) Barring expensive ops like multiplies, to diffuse one input bit across 64 bits of hash state takes at least log2(64) = 6 sequentially dependent instructions. That is more cycles than we'd like. 2) An operation like "hash ^= hash << 13" requires a second temporary register anyway, and on a 2-operand machine like x86, it's three instructions. 3) A better use of a second register is to hold a two-word hash state. With careful design, no temporaries are needed at all, so it doesn't increase register pressure. And this gets rid of register copying on 2-operand machines, so the code is smaller and faster. 4) Using two words of state weakens the requirement for one-round mixing; we now have two rounds of mixing before cancellation is possible. 5) A two-word hash state also allows operations on both halves to be done in parallel, so on a superscalar processor we get more mixing in fewer cycles. I ended up using a mixing function inspired by the ChaCha and Speck round functions. It is 6 simple instructions and 3 cycles per iteration (assuming multiply by 9 can be done by an "lea" instruction): x ^= *input++; y ^= x; x = ROL(x, K1); x += y; y = ROL(y, K2); y *= 9; Not only is this reversible, two consecutive rounds are reversible: if you are given the initial and final states, but not the intermediate state, it is possible to compute both input words. This means that at least 3 words of input are required to create a collision. (It also has the property, used by hash_name() to avoid a branch, that it hashes all-zero to all-zero.) The rotate constants K1 and K2 were found by experiment. The search took a sample of random initial states (I used 1023) and considered the effect of flipping each of the 64 input bits on each of the 128 output bits two rounds later. Each of the 8192 pairs can be considered a biased coin, and adding up the Shannon entropy of all of them produces a score. The best-scoring shifts also did well in other tests (flipping bits in y, trying 3 or 4 rounds of mixing, flipping all 64*63/2 pairs of input bits), so the choice was made with the additional constraint that the sum of the shifts is odd and not too close to the word size. The final state is then folded into a 32-bit hash value by a less carefully optimized multiply-based scheme. This also has to be fast, as pathname components tend to be short (the most common case is one iteration!), but there's some room for latency, as there is a fair bit of intervening logic before the hash value is used for anything. (Performance verified with "bonnie++ -s 0 -n 1536:-2" on tmpfs. I need a better benchmark; the numbers seem to show a slight dip in performance between 4.6.0 and this patch, but they're too noisy to quote.) Special thanks to Bruce fields for diligent testing which uncovered a nasty fencepost error in an earlier version of this patch. [checkpatch.pl formatting complaints noted and respectfully disagreed with.] Signed-off-by: George Spelvin Tested-by: J. Bruce Fields diff --git a/fs/namei.c b/fs/namei.c index dd98d43..a49cbd7 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include "internal.h" @@ -1788,44 +1789,89 @@ static int walk_component(struct nameidata *nd, int flags) #include #ifdef CONFIG_64BIT - -static inline unsigned int fold_hash(unsigned long hash) -{ - return hash_64(hash, 32); -} +/* + * Register pressure in the mixing function is an issue, particularly + * on 32-bit x86, but almost any function requires one state value and + * one temporary. Instead, use a function designed for two state values + * and no temporaries. + * + * This function cannot create a collision in only two iterations, so + * we have two iterations to achieve avalanche. In those two iterations, + * we have six layers of mixing, which is enough to spread one bit's + * influence out to 2^6 = 64 state bits. + * + * Rotate constants are scored by considering either 64 one-bit input + * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the + * probability of that delta causing a change to each of the 128 output + * bits, using a sample of random initial states. + * + * The Shannon entropy of the computed probabilities is then summed + * to produce a score. Ideally, any input change has a 50% chance of + * toggling any given output bit. + * + * Mixing scores (in bits) for (12,45): + * Input delta: 1-bit 2-bit + * 1 round: 713.3 42542.6 + * 2 rounds: 2753.7 140389.8 + * 3 rounds: 5954.1 233458.2 + * 4 rounds: 7862.6 256672.2 + * Perfect: 8192 258048 + * (64*128) (64*63/2 * 128) + */ +#define HASH_MIX(x, y, a) \ + ( x ^= (a), \ + y ^= x, x = rol64(x,12),\ + x += y, y = rol64(y,45),\ + y *= 9 ) /* - * This is George Marsaglia's XORSHIFT generator. - * It implements a maximum-period LFSR in only a few - * instructions. It also has the property (required - * by hash_name()) that mix_hash(0) = 0. + * Fold two longs into one 32-bit hash value. This must be fast, but + * latency isn't quite as critical, as there is a fair bit of additional + * work done before the hash value is used. */ -static inline unsigned long mix_hash(unsigned long hash) +static inline unsigned int fold_hash(unsigned long x, unsigned long y) { - hash ^= hash << 13; - hash ^= hash >> 7; - hash ^= hash << 17; - return hash; + y ^= x * GOLDEN_RATIO_64; + y *= GOLDEN_RATIO_64; + return y >> 32; } #else /* 32-bit case */ -#define fold_hash(x) (x) +/* + * Mixing scores (in bits) for (7,20): + * Input delta: 1-bit 2-bit + * 1 round: 330.3 9201.6 + * 2 rounds: 1246.4 25475.4 + * 3 rounds: 1907.1 31295.1 + * 4 rounds: 2042.3 31718.6 + * Perfect: 2048 31744 + * (32*64) (32*31/2 * 64) + */ +#define HASH_MIX(x, y, a) \ + ( x ^= (a), \ + y ^= x, x = rol32(x, 7),\ + x += y, y = rol32(y,20),\ + y *= 9 ) -static inline unsigned long mix_hash(unsigned long hash) +static inline unsigned int fold_hash(unsigned long x, unsigned long y) { - hash ^= hash << 13; - hash ^= hash >> 17; - hash ^= hash << 5; - return hash; + /* Use arch-optimized multiply if one exists */ + return __hash_32(y ^ __hash_32(x)); } #endif -/* Return the hash of a string of known length */ +/* + * Return the hash of a string of known length. This is carfully + * designed to match hash_name(), which is the more critical function. + * In particular, we must end by hashing a final word containing 0..7 + * payload bytes, to match the way that hash_name() iterates until it + * finds the delimiter after the name. + */ unsigned int full_name_hash(const char *name, unsigned int len) { - unsigned long a, hash = 0; + unsigned long a, x = 0, y = 0; for (;;) { if (!len) @@ -1833,36 +1879,34 @@ unsigned int full_name_hash(const char *name, unsigned int len) a = load_unaligned_zeropad(name); if (len < sizeof(unsigned long)) break; - hash = mix_hash(hash + a); + HASH_MIX(x, y, a); name += sizeof(unsigned long); len -= sizeof(unsigned long); } - hash += a & bytemask_from_count(len); + x ^= a & bytemask_from_count(len); done: - return fold_hash(hash); + return fold_hash(x, y); } EXPORT_SYMBOL(full_name_hash); /* Return the "hash_len" (hash and length) of a null-terminated string */ u64 hashlen_string(const char *name) { - unsigned long a, adata, mask, hash, len; + unsigned long a = 0, x = 0, y = 0, adata, mask, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; - hash = a = 0; len = -sizeof(unsigned long); do { - hash = mix_hash(hash + a); + HASH_MIX(x, y, a); len += sizeof(unsigned long); a = load_unaligned_zeropad(name+len); } while (!has_zero(a, &adata, &constants)); adata = prep_zero_mask(a, adata, &constants); mask = create_zero_mask(adata); - hash += a & zero_bytemask(mask); - len += find_zero(mask); + x ^= a & zero_bytemask(mask); - return hashlen_create(fold_hash(hash), len); + return hashlen_create(fold_hash(x, y), len + find_zero(mask)); } EXPORT_SYMBOL(hashlen_string); @@ -1872,13 +1916,12 @@ EXPORT_SYMBOL(hashlen_string); */ static inline u64 hash_name(const char *name) { - unsigned long a, b, adata, bdata, mask, hash, len; + unsigned long a = 0, b, x = 0, y = 0, adata, bdata, mask, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; - hash = a = 0; len = -sizeof(unsigned long); do { - hash = mix_hash(hash + a); + HASH_MIX(x, y, a); len += sizeof(unsigned long); a = load_unaligned_zeropad(name+len); b = a ^ REPEAT_BYTE('/'); @@ -1886,15 +1929,13 @@ static inline u64 hash_name(const char *name) adata = prep_zero_mask(a, adata, &constants); bdata = prep_zero_mask(b, bdata, &constants); - mask = create_zero_mask(adata | bdata); + x ^= a & zero_bytemask(mask); - hash += a & zero_bytemask(mask); - len += find_zero(mask); - return hashlen_create(fold_hash(hash), len); + return hashlen_create(fold_hash(x, y), len + find_zero(mask)); } -#else +#else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */ /* Return the hash of a string of known length */ unsigned int full_name_hash(const char *name, unsigned int len) @@ -1965,7 +2006,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) int type; err = may_lookup(nd); - if (err) + if (err) return err; hash_len = hash_name(name); -- cgit v0.10.2 From 468a9428521e7d00fb21250af363eb94dc1d6861 Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Thu, 26 May 2016 22:11:51 -0400 Subject: : Add support for architecture-specific functions This is just the infrastructure; there are no users yet. This is modelled on CONFIG_ARCH_RANDOM; a CONFIG_ symbol declares the existence of . That file may define its own versions of various functions, and define HAVE_* symbols (no CONFIG_ prefix!) to suppress the generic ones. Included is a self-test (in lib/test_hash.c) that verifies the basics. It is NOT in general required that the arch-specific functions compute the same thing as the generic, but if a HAVE_* symbol is defined with the value 1, then equality is tested. Signed-off-by: George Spelvin Cc: Geert Uytterhoeven Cc: Greg Ungerer Cc: Andreas Schwab Cc: Philippe De Muyter Cc: linux-m68k@lists.linux-m68k.org Cc: Alistair Francis Cc: Michal Simek Cc: Yoshinori Sato Cc: uclinux-h8-devel@lists.sourceforge.jp diff --git a/arch/Kconfig b/arch/Kconfig index 81869a5..96406e4 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -589,6 +589,14 @@ config HAVE_STACK_VALIDATION Architecture supports the 'objtool check' host tool command, which performs compile-time stack metadata validation. +config HAVE_ARCH_HASH + bool + default n + help + If this is set, the architecture provides an + file which provides platform-specific implementations of some + functions in or fs/namei.c. + # # ABI hall of shame # diff --git a/fs/namei.c b/fs/namei.c index a49cbd7..968dae0 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1788,7 +1788,11 @@ static int walk_component(struct nameidata *nd, int flags) #include -#ifdef CONFIG_64BIT +#ifdef HASH_MIX + +/* Architecture provides HASH_MIX and fold_hash() in */ + +#elif defined(CONFIG_64BIT) /* * Register pressure in the mixing function is an issue, particularly * on 32-bit x86, but almost any function requires one state value and diff --git a/include/linux/hash.h b/include/linux/hash.h index 613cfde..ad6fa21 100644 --- a/include/linux/hash.h +++ b/include/linux/hash.h @@ -41,19 +41,40 @@ #define GOLDEN_RATIO_32 0x61C88647 #define GOLDEN_RATIO_64 0x61C8864680B583EBull +#ifdef CONFIG_HAVE_ARCH_HASH +/* This header may use the GOLDEN_RATIO_xx constants */ +#include +#endif -static inline u32 __hash_32(u32 val) +/* + * The _generic versions exist only so lib/test_hash.c can compare + * the arch-optimized versions with the generic. + * + * Note that if you change these, any that aren't updated + * to match need to have their HAVE_ARCH_* define values updated so the + * self-test will not false-positive. + */ +#ifndef HAVE_ARCH__HASH_32 +#define __hash_32 __hash_32_generic +#endif +static inline u32 __hash_32_generic(u32 val) { return val * GOLDEN_RATIO_32; } -static inline u32 hash_32(u32 val, unsigned int bits) +#ifndef HAVE_ARCH_HASH_32 +#define hash_32 hash_32_generic +#endif +static inline u32 hash_32_generic(u32 val, unsigned int bits) { /* High bits are more random, so use them. */ return __hash_32(val) >> (32 - bits); } -static __always_inline u32 hash_64(u64 val, unsigned int bits) +#ifndef HAVE_ARCH_HASH_64 +#define hash_64 hash_64_generic +#endif +static __always_inline u32 hash_64_generic(u64 val, unsigned int bits) { #if BITS_PER_LONG == 64 /* 64x64-bit multiply is efficient on all 64-bit processors */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 1e9a607..18ec69b 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1815,6 +1815,17 @@ config TEST_RHASHTABLE If unsure, say N. +config TEST_HASH + tristate "Perform selftest on hash functions" + default n + help + Enable this option to test the kernel's integer () + and string () hash functions on boot + (or module load). + + This is intended to help people writing architecture-specific + optimized versions. If unsure, say N. + endmenu # runtime tests config PROVIDE_OHCI1394_DMA_INIT diff --git a/lib/Makefile b/lib/Makefile index 7bd6fd4..f80b1a1 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_TEST_HEXDUMP) += test_hexdump.o obj-y += kstrtox.o obj-$(CONFIG_TEST_BPF) += test_bpf.o obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o +obj-$(CONFIG_TEST_HASH) += test_hash.o obj-$(CONFIG_TEST_KASAN) += test_kasan.o obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o obj-$(CONFIG_TEST_LKM) += test_module.o diff --git a/lib/test_hash.c b/lib/test_hash.c new file mode 100644 index 0000000..c9549c8 --- /dev/null +++ b/lib/test_hash.c @@ -0,0 +1,250 @@ +/* + * Test cases for and + * This just verifies that various ways of computing a hash + * produce the same thing and, for cases where a k-bit hash + * value is requested, is of the requested size. + * + * We fill a buffer with a 255-byte null-terminated string, + * and use both full_name_hash() and hashlen_string() to hash the + * substrings from i to j, where 0 <= i < j < 256. + * + * The returned values are used to check that __hash_32() and + * __hash_32_generic() compute the same thing. Likewise hash_32() + * and hash_64(). + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt "\n" + +#include +#include +#include +#include +#include +#include + +/* 32-bit XORSHIFT generator. Seed must not be zero. */ +static u32 __init __attribute_const__ +xorshift(u32 seed) +{ + seed ^= seed << 13; + seed ^= seed >> 17; + seed ^= seed << 5; + return seed; +} + +/* Given a non-zero x, returns a non-zero byte. */ +static u8 __init __attribute_const__ +mod255(u32 x) +{ + x = (x & 0xffff) + (x >> 16); /* 1 <= x <= 0x1fffe */ + x = (x & 0xff) + (x >> 8); /* 1 <= x <= 0x2fd */ + x = (x & 0xff) + (x >> 8); /* 1 <= x <= 0x100 */ + x = (x & 0xff) + (x >> 8); /* 1 <= x <= 0xff */ + return x; +} + +/* Fill the buffer with non-zero bytes. */ +static void __init +fill_buf(char *buf, size_t len, u32 seed) +{ + size_t i; + + for (i = 0; i < len; i++) { + seed = xorshift(seed); + buf[i] = mod255(seed); + } +} + +/* + * Test the various integer hash functions. h64 (or its low-order bits) + * is the integer to hash. hash_or accumulates the OR of the hash values, + * which are later checked to see that they cover all the requested bits. + * + * Because these functions (as opposed to the string hashes) are all + * inline, the code being tested is actually in the module, and you can + * recompile and re-test the module without rebooting. + */ +static bool __init +test_int_hash(unsigned long long h64, u32 hash_or[2][33]) +{ + int k; + u32 h0 = (u32)h64, h1, h2; + + /* Test __hash32 */ + hash_or[0][0] |= h1 = __hash_32(h0); +#ifdef HAVE_ARCH__HASH_32 + hash_or[1][0] |= h2 = __hash_32_generic(h0); +#if HAVE_ARCH__HASH_32 == 1 + if (h1 != h2) { + pr_err("__hash_32(%#x) = %#x != __hash_32_generic() = %#x", + h0, h1, h2); + return false; + } +#endif +#endif + + /* Test k = 1..32 bits */ + for (k = 1; k <= 32; k++) { + u32 const m = ((u32)2 << (k-1)) - 1; /* Low k bits set */ + + /* Test hash_32 */ + hash_or[0][k] |= h1 = hash_32(h0, k); + if (h1 > m) { + pr_err("hash_32(%#x, %d) = %#x > %#x", h0, k, h1, m); + return false; + } +#ifdef HAVE_ARCH_HASH_32 + h2 = hash_32_generic(h0, k); +#if HAVE_ARCH_HASH_32 == 1 + if (h1 != h2) { + pr_err("hash_32(%#x, %d) = %#x != hash_32_generic() " + " = %#x", h0, k, h1, h2); + return false; + } +#else + if (h2 > m) { + pr_err("hash_32_generic(%#x, %d) = %#x > %#x", + h0, k, h1, m); + return false; + } +#endif +#endif + /* Test hash_64 */ + hash_or[1][k] |= h1 = hash_64(h64, k); + if (h1 > m) { + pr_err("hash_64(%#llx, %d) = %#x > %#x", h64, k, h1, m); + return false; + } +#ifdef HAVE_ARCH_HASH_64 + h2 = hash_64_generic(h64, k); +#if HAVE_ARCH_HASH_64 == 1 + if (h1 != h2) { + pr_err("hash_64(%#llx, %d) = %#x != hash_64_generic() " + "= %#x", h64, k, h1, h2); + return false; + } +#else + if (h2 > m) { + pr_err("hash_64_generic(%#llx, %d) = %#x > %#x", + h64, k, h1, m); + return false; + } +#endif +#endif + } + + (void)h2; /* Suppress unused variable warning */ + return true; +} + +#define SIZE 256 /* Run time is cubic in SIZE */ + +static int __init +test_hash_init(void) +{ + char buf[SIZE+1]; + u32 string_or = 0, hash_or[2][33] = { 0 }; + unsigned tests = 0; + unsigned long long h64 = 0; + int i, j; + + fill_buf(buf, SIZE, 1); + + /* Test every possible non-empty substring in the buffer. */ + for (j = SIZE; j > 0; --j) { + buf[j] = '\0'; + + for (i = 0; i <= j; i++) { + u64 hashlen = hashlen_string(buf+i); + u32 h0 = full_name_hash(buf+i, j-i); + + /* Check that hashlen_string gets the length right */ + if (hashlen_len(hashlen) != j-i) { + pr_err("hashlen_string(%d..%d) returned length" + " %u, expected %d", + i, j, hashlen_len(hashlen), j-i); + return -EINVAL; + } + /* Check that the hashes match */ + if (hashlen_hash(hashlen) != h0) { + pr_err("hashlen_string(%d..%d) = %08x != " + "full_name_hash() = %08x", + i, j, hashlen_hash(hashlen), h0); + return -EINVAL; + } + + string_or |= h0; + h64 = h64 << 32 | h0; /* For use with hash_64 */ + if (!test_int_hash(h64, hash_or)) + return -EINVAL; + tests++; + } /* i */ + } /* j */ + + /* The OR of all the hash values should cover all the bits */ + if (~string_or) { + pr_err("OR of all string hash results = %#x != %#x", + string_or, -1u); + return -EINVAL; + } + if (~hash_or[0][0]) { + pr_err("OR of all __hash_32 results = %#x != %#x", + hash_or[0][0], -1u); + return -EINVAL; + } +#ifdef HAVE_ARCH__HASH_32 +#if HAVE_ARCH__HASH_32 != 1 /* Test is pointless if results match */ + if (~hash_or[1][0]) { + pr_err("OR of all __hash_32_generic results = %#x != %#x", + hash_or[1][0], -1u); + return -EINVAL; + } +#endif +#endif + + /* Likewise for all the i-bit hash values */ + for (i = 1; i <= 32; i++) { + u32 const m = ((u32)2 << (i-1)) - 1; /* Low i bits set */ + + if (hash_or[0][i] != m) { + pr_err("OR of all hash_32(%d) results = %#x " + "(%#x expected)", i, hash_or[0][i], m); + return -EINVAL; + } + if (hash_or[1][i] != m) { + pr_err("OR of all hash_64(%d) results = %#x " + "(%#x expected)", i, hash_or[1][i], m); + return -EINVAL; + } + } + + /* Issue notices about skipped tests. */ +#ifndef HAVE_ARCH__HASH_32 + pr_info("__hash_32() has no arch implementation to test."); +#elif HAVE_ARCH__HASH_32 != 1 + pr_info("__hash_32() is arch-specific; not compared to generic."); +#endif +#ifndef HAVE_ARCH_HASH_32 + pr_info("hash_32() has no arch implementation to test."); +#elif HAVE_ARCH_HASH_32 != 1 + pr_info("hash_32() is arch-specific; not compared to generic."); +#endif +#ifndef HAVE_ARCH_HASH_64 + pr_info("hash_64() has no arch implementation to test."); +#elif HAVE_ARCH_HASH_64 != 1 + pr_info("hash_64() is arch-specific; not compared to generic."); +#endif + + pr_notice("%u tests passed.", tests); + + return 0; +} + +static void __exit test_hash_exit(void) +{ +} + +module_init(test_hash_init); /* Does everything */ +module_exit(test_hash_exit); /* Does nothing */ + +MODULE_LICENSE("GPL"); -- cgit v0.10.2 From 14c44b95b3dcb8ff1d627e6b78f57c4373d375cb Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Thu, 26 May 2016 11:36:19 -0400 Subject: m68k: Add This provides a multiply by constant GOLDEN_RATIO_32 = 0x61C88647 for the original mc68000, which lacks a 32x32-bit multiply instruction. Yes, the amount of optimization effort put in is excessive. :-) Shift-add chain found by Yevgen Voronenko's Hcub algorithm at http://spiral.ece.cmu.edu/mcm/gen.html Signed-off-by: George Spelvin Cc: Geert Uytterhoeven Cc: Greg Ungerer Cc: Andreas Schwab Cc: Philippe De Muyter Cc: linux-m68k@lists.linux-m68k.org diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu index 0dfcf12..bf3de46 100644 --- a/arch/m68k/Kconfig.cpu +++ b/arch/m68k/Kconfig.cpu @@ -40,6 +40,7 @@ config M68000 select CPU_HAS_NO_MULDIV64 select CPU_HAS_NO_UNALIGNED select GENERIC_CSUM + select HAVE_ARCH_HASH help The Freescale (was Motorola) 68000 CPU is the first generation of the well known M68K family of processors. The CPU core as well as diff --git a/arch/m68k/include/asm/hash.h b/arch/m68k/include/asm/hash.h new file mode 100644 index 0000000..6407af8 --- /dev/null +++ b/arch/m68k/include/asm/hash.h @@ -0,0 +1,59 @@ +#ifndef _ASM_HASH_H +#define _ASM_HASH_H + +/* + * If CONFIG_M68000=y (original mc68000/010), this file is #included + * to work around the lack of a MULU.L instruction. + */ + +#define HAVE_ARCH__HASH_32 1 +/* + * While it would be legal to substitute a different hash operation + * entirely, let's keep it simple and just use an optimized multiply + * by GOLDEN_RATIO_32 = 0x61C88647. + * + * The best way to do that appears to be to multiply by 0x8647 with + * shifts and adds, and use mulu.w to multiply the high half by 0x61C8. + * + * Because the 68000 has multi-cycle shifts, this addition chain is + * chosen to minimise the shift distances. + * + * Despite every attempt to spoon-feed it simple operations, GCC + * 6.1.1 doggedly insists on doing annoying things like converting + * "lsl.l #2," (12 cycles) to two adds (8+8 cycles). + * + * It also likes to notice two shifts in a row, like "a = x << 2" and + * "a <<= 7", and convert that to "a = x << 9". But shifts longer + * than 8 bits are extra-slow on m68k, so that's a lose. + * + * Since the 68000 is a very simple in-order processor with no + * instruction scheduling effects on execution time, we can safely + * take it out of GCC's hands and write one big asm() block. + * + * Without calling overhead, this operation is 30 bytes (14 instructions + * plus one immediate constant) and 166 cycles. + * + * (Because %2 is fetched twice, it can't be postincrement, and thus it + * can't be a fully general "g" or "m". Register is preferred, but + * offsettable memory or immediate will work.) + */ +static inline u32 __attribute_const__ __hash_32(u32 x) +{ + u32 a, b; + + asm( "move.l %2,%0" /* a = x * 0x0001 */ + "\n lsl.l #2,%0" /* a = x * 0x0004 */ + "\n move.l %0,%1" + "\n lsl.l #7,%0" /* a = x * 0x0200 */ + "\n add.l %2,%0" /* a = x * 0x0201 */ + "\n add.l %0,%1" /* b = x * 0x0205 */ + "\n add.l %0,%0" /* a = x * 0x0402 */ + "\n add.l %0,%1" /* b = x * 0x0607 */ + "\n lsl.l #5,%0" /* a = x * 0x8040 */ + : "=&d,d" (a), "=&r,r" (b) + : "r,roi?" (x)); /* a+b = x*0x8647 */ + + return ((u16)(x*0x61c8) << 16) + a + b; +} + +#endif /* _ASM_HASH_H */ -- cgit v0.10.2 From 7b13277b682972c2ff8f6419e86c333d81936023 Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Wed, 25 May 2016 11:06:09 -0400 Subject: microblaze: Add Microblaze is an FPGA soft core that can be configured various ways. If it is configured without a multiplier, the standard __hash_32() will require a call to __mulsi3, which is a slow software loop. Instead, use a shift-and-add sequence for the constant multiply. GCC knows how to do this, but it's not as clever as some. Signed-off-by: George Spelvin Cc: Alistair Francis Cc: Michal Simek diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 3d793b5..ce3e512 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -16,6 +16,7 @@ config MICROBLAZE select GENERIC_IRQ_SHOW select GENERIC_PCI_IOMAP select GENERIC_SCHED_CLOCK + select HAVE_ARCH_HASH select HAVE_ARCH_KGDB select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_API_DEBUG diff --git a/arch/microblaze/include/asm/hash.h b/arch/microblaze/include/asm/hash.h new file mode 100644 index 0000000..753513a --- /dev/null +++ b/arch/microblaze/include/asm/hash.h @@ -0,0 +1,81 @@ +#ifndef _ASM_HASH_H +#define _ASM_HASH_H + +/* + * Fortunately, most people who want to run Linux on Microblaze enable + * both multiplier and barrel shifter, but omitting them is technically + * a supported configuration. + * + * With just a barrel shifter, we can implement an efficient constant + * multiply using shifts and adds. GCC can find a 9-step solution, but + * this 6-step solution was found by Yevgen Voronenko's implementation + * of the Hcub algorithm at http://spiral.ece.cmu.edu/mcm/gen.html. + * + * That software is really not designed for a single multiplier this large, + * but if you run it enough times with different seeds, it'll find several + * 6-shift, 6-add sequences for computing x * 0x61C88647. They are all + * c = (x << 19) + x; + * a = (x << 9) + c; + * b = (x << 23) + a; + * return (a<<11) + (b<<6) + (c<<3) - b; + * with variations on the order of the final add. + * + * Without even a shifter, it's hopless; any hash function will suck. + */ + +#if CONFIG_XILINX_MICROBLAZE0_USE_HW_MUL == 0 + +#define HAVE_ARCH__HASH_32 1 + +/* Multiply by GOLDEN_RATIO_32 = 0x61C88647 */ +static inline u32 __attribute_const__ __hash_32(u32 a) +{ +#if CONFIG_XILINX_MICROBLAZE0_USE_BARREL + unsigned int b, c; + + /* Phase 1: Compute three intermediate values */ + b = a << 23; + c = (a << 19) + a; + a = (a << 9) + c; + b += a; + + /* Phase 2: Compute (a << 11) + (b << 6) + (c << 3) - b */ + a <<= 5; + a += b; /* (a << 5) + b */ + a <<= 3; + a += c; /* (a << 8) + (b << 3) + c */ + a <<= 3; + return a - b; /* (a << 11) + (b << 6) + (c << 3) - b */ +#else + /* + * "This is really going to hurt." + * + * Without a barrel shifter, left shifts are implemented as + * repeated additions, and the best we can do is an optimal + * addition-subtraction chain. This one is not known to be + * optimal, but at 37 steps, it's decent for a 31-bit multiplier. + * + * Question: given its size (37*4 = 148 bytes per instance), + * and slowness, is this worth having inline? + */ + unsigned int b, c, d; + + b = a << 4; /* 4 */ + c = b << 1; /* 1 5 */ + b += a; /* 1 6 */ + c += b; /* 1 7 */ + c <<= 3; /* 3 10 */ + c -= a; /* 1 11 */ + d = c << 7; /* 7 18 */ + d += b; /* 1 19 */ + d <<= 8; /* 8 27 */ + d += a; /* 1 28 */ + d <<= 1; /* 1 29 */ + d += b; /* 1 30 */ + d <<= 6; /* 6 36 */ + return d + c; /* 1 37 total instructions*/ +#endif +} + +#endif /* !CONFIG_XILINX_MICROBLAZE0_USE_HW_MUL */ +#endif /* _ASM_HASH_H */ -- cgit v0.10.2 From 4684fe95300c071983f77653e354c040fe80a265 Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Wed, 25 May 2016 14:19:49 -0400 Subject: h8300: Add This will improve the performance of hash_32() and hash_64(), but due to complete lack of multi-bit shift instructions on H8, performance will still be bad in surrounding code. Designing H8-specific hash algorithms to work around that is a separate project. (But if the maintainers would like to get in touch...) Signed-off-by: George Spelvin Cc: Yoshinori Sato Cc: uclinux-h8-devel@lists.sourceforge.jp diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index 986ea84..6c583db 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -20,6 +20,7 @@ config H8300 select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZO select HAVE_ARCH_KGDB + select HAVE_ARCH_HASH config RWSEM_GENERIC_SPINLOCK def_bool y diff --git a/arch/h8300/include/asm/hash.h b/arch/h8300/include/asm/hash.h new file mode 100644 index 0000000..04cfbd2 --- /dev/null +++ b/arch/h8300/include/asm/hash.h @@ -0,0 +1,53 @@ +#ifndef _ASM_HASH_H +#define _ASM_HASH_H + +/* + * The later H8SX models have a 32x32-bit multiply, but the H8/300H + * and H8S have only 16x16->32. Since it's tolerably compact, this is + * basically an inlined version of the __mulsi3 code. Since the inputs + * are not expected to be small, it's also simplfied by skipping the + * early-out checks. + * + * (Since neither CPU has any multi-bit shift instructions, a + * shift-and-add version is a non-starter.) + * + * TODO: come up with an arch-specific version of the hashing in fs/namei.c, + * since that is heavily dependent on rotates. Which, as mentioned, suck + * horribly on H8. + */ + +#if defined(CONFIG_CPU_H300H) || defined(CONFIG_CPU_H8S) + +#define HAVE_ARCH__HASH_32 1 + +/* + * Multiply by k = 0x61C88647. Fitting this into three registers requires + * one extra instruction, but reducing register pressure will probably + * make that back and then some. + * + * GCC asm note: %e1 is the high half of operand %1, while %f1 is the + * low half. So if %1 is er4, then %e1 is e4 and %f1 is r4. + * + * This has been designed to modify x in place, since that's the most + * common usage, but preserve k, since hash_64() makes two calls in + * quick succession. + */ +static inline u32 __attribute_const__ __hash_32(u32 x) +{ + u32 temp; + + asm( "mov.w %e1,%f0" + "\n mulxu.w %f2,%0" /* klow * xhigh */ + "\n mov.w %f0,%e1" /* The extra instruction */ + "\n mov.w %f1,%f0" + "\n mulxu.w %e2,%0" /* khigh * xlow */ + "\n add.w %e1,%f0" + "\n mulxu.w %f2,%1" /* klow * xlow */ + "\n add.w %f0,%e1" + : "=&r" (temp), "=r" (x) + : "%r" (GOLDEN_RATIO_32), "1" (x)); + return x; +} + +#endif +#endif /* _ASM_HASH_H */ -- cgit v0.10.2