Speaking of a superfast preliminary stage that checks whether something LOOKS-LIKE a match (thus bypassing SHA-like invocations): in Nakamichi I use my own DoubleDeuceAES, inspired by @jandrewrogers.
I use it to reduce 18-, 36- and 64-byte keys down to 16 bytes. If a collision occurs, the hash can be strengthened by adding 4, 8 or even 16 more bytes; that is not much of a pain, since even those 32 bytes will be generated brutally fast:
#ifdef _NAquaHash
// https://github.com/jandrewrogers/AquaHash/blob/master/aquahash.h
#include <wmmintrin.h>
// or maybe just use #include <x86intrin.h> for everything
#endif
#ifdef _NAquaHash
/*
 * DoubleDeuceAES - reduce a short key (at most 64 bytes) to a 128-bit hash
 * using four independent chains of AES encryption rounds (AES-NI).
 *
 * buffer: key bytes; may be NULL only when length is 0.
 * length: number of key bytes. At most 64 bytes are consumed; any bytes
 *         beyond the first 64 are ignored (an unclamped length would
 *         overflow the 64-byte stack lanes below).
 *
 * Nothing is returned: the 16-byte result is written to the global DDAES
 * through SlowCopy128bit().
 *
 * Four 64-byte lanes are derived from the key:
 *   A - the key, zero padded to 64 bytes
 *   B - the key bytes mirrored into a zeroed lane: B[63 - i] = A[i]
 *   C - the two 32-byte halves of A interleaved byte by byte
 *   D - the two 32-byte halves of B interleaved byte by byte
 *
 * The key length is NOT mixed into the hash, so keys that differ only by
 * trailing zero bytes produce the same value. Callers are expected to keep
 * keys of different lengths in separate fixed-length pools.
 */
void DoubleDeuceAES(const uint8_t *buffer, const size_t length)
{
    enum {
        DDAES_LANE_BYTES  = 64,                    /* size of every padded lane    */
        DDAES_HALF_BYTES  = DDAES_LANE_BYTES / 2,  /* size of one interleaved half */
        DDAES_BLOCK_BYTES = 16                     /* one 128-bit AES block        */
    };

    char MaxTo64a[DDAES_LANE_BYTES];
    char MaxTo64b[DDAES_LANE_BYTES];
    char MaxTo64c[DDAES_LANE_BYTES];
    char MaxTo64d[DDAES_LANE_BYTES];

    __m128i hashA = _mm_setzero_si128();
    __m128i hashB = _mm_setzero_si128();
    __m128i hashC = _mm_setzero_si128();
    __m128i hashD = _mm_setzero_si128();

    /* Never touch more than one lane's worth of input. */
    const size_t used = (length < (size_t)DDAES_LANE_BYTES)
                            ? length
                            : (size_t)DDAES_LANE_BYTES;
    size_t i;

    /* Zero padding: lanes A and B are always processed as 4 x 128 bits. */
    memset(MaxTo64a, 0, sizeof MaxTo64a);
    memset(MaxTo64b, 0, sizeof MaxTo64b);

    /* Lane A: the key itself (skip the call entirely for an empty key so a
     * NULL buffer is never handed to memcpy). */
    if (used != 0) {
        memcpy(MaxTo64a, buffer, used);
    }

    /* Lane B: the key mirrored from the far end of the lane. */
    for (i = 0; i < used; i++) {
        MaxTo64b[(DDAES_LANE_BYTES - 1) - i] = MaxTo64a[i];
    }

    /* Lanes C and D: interleave the lower and upper 32-byte halves of A and
     * B with byte granularity, i.e. [00][32][01][33]...[31][63].
     * 32 iterations (i = 0..31), each producing two output bytes. */
    for (i = 0; i < (size_t)DDAES_HALF_BYTES; i++) {
        MaxTo64c[2 * i + 0] = MaxTo64a[i];
        MaxTo64c[2 * i + 1] = MaxTo64a[i + DDAES_HALF_BYTES];
        MaxTo64d[2 * i + 0] = MaxTo64b[i];
        MaxTo64d[2 * i + 1] = MaxTo64b[i + DDAES_HALF_BYTES];
    }

    /* Feed each lane, 16 bytes at a time, into its own AES round chain.
     * Unaligned loads are used because the char arrays carry no 16-byte
     * alignment guarantee. */
    for (i = 0; i < (size_t)(DDAES_LANE_BYTES / DDAES_BLOCK_BYTES); i++) {
        const size_t off = i * (size_t)DDAES_BLOCK_BYTES;
        const __m128i a = _mm_loadu_si128((const __m128i *)(MaxTo64a + off));
        const __m128i b = _mm_loadu_si128((const __m128i *)(MaxTo64b + off));
        const __m128i c = _mm_loadu_si128((const __m128i *)(MaxTo64c + off));
        const __m128i d = _mm_loadu_si128((const __m128i *)(MaxTo64d + off));

        hashA = _mm_aesenc_si128(hashA, a);
        hashB = _mm_aesenc_si128(hashB, b);
        hashC = _mm_aesenc_si128(hashC, c);
        hashD = _mm_aesenc_si128(hashD, d);
    }

    /* Fold the four chains into one 128-bit value (order matters). */
    hashA = _mm_aesenc_si128(hashA, hashB);
    hashA = _mm_aesenc_si128(hashA, hashC);
    hashA = _mm_aesenc_si128(hashA, hashD);

    /* Publish the result. SlowCopy128bit is defined elsewhere in the file;
     * presumably it is an unaligned 128-bit load/store pair
     * (_mm_loadu_si128 + _mm_storeu_si128) - confirm against its definition. */
    SlowCopy128bit((const char *)(&hashA), (char *)&DDAES[0]);
}
#endif
It is 625 bytes of code; I'm not aware of any other 128-bit hash being this fast. I hope you find it as useful as I do:
; mark_description "Intel(R) C++ Compiler XE for applications running on Intel(R) 64, Version 15.0.0.108 Build 20140726";
?DoubleDeuceAES@@YAXPEBE_K@Z PROC
; parameter 1: rcx
; parameter 2: rdx
.B14.1::
00000 41 57 push r15
00002 48 81 ec 00 01
00 00 sub rsp, 256
00009 49 89 d7 mov r15, rdx
0000c 44 0f 29 bc 24
a0 00 00 00 movaps XMMWORD PTR [160+rsp], xmm15
00015 66 0f ef c0 pxor xmm0, xmm0
00019 44 0f 29 b4 24
b0 00 00 00 movaps XMMWORD PTR [176+rsp], xmm14
00022 66 0f ef c9 pxor xmm1, xmm1
00026 44 0f 29 ac 24
c0 00 00 00 movaps XMMWORD PTR [192+rsp], xmm13
0002f 66 0f ef d2 pxor xmm2, xmm2
00033 44 0f 29 a4 24
d0 00 00 00 movaps XMMWORD PTR [208+rsp], xmm12
0003c 66 0f ef db pxor xmm3, xmm3
00040 44 0f 29 9c 24
e0 00 00 00 movaps XMMWORD PTR [224+rsp], xmm11
00049 48 89 ca mov rdx, rcx
0004c 44 0f 29 94 24
f0 00 00 00 movaps XMMWORD PTR [240+rsp], xmm10
00055 0f 29 44 24 20 movaps XMMWORD PTR [32+rsp], xmm0
0005a 0f 29 4c 24 30 movaps XMMWORD PTR [48+rsp], xmm1
0005f 0f 29 54 24 40 movaps XMMWORD PTR [64+rsp], xmm2
00064 0f 29 5c 24 50 movaps XMMWORD PTR [80+rsp], xmm3
.B14.2::
00069 66 0f ef c0 pxor xmm0, xmm0
0006d 66 0f ef c9 pxor xmm1, xmm1
00071 0f 29 44 24 60 movaps XMMWORD PTR [96+rsp], xmm0
00076 66 0f ef d2 pxor xmm2, xmm2
0007a 0f 29 4c 24 70 movaps XMMWORD PTR [112+rsp], xmm1
0007f 66 0f ef db pxor xmm3, xmm3
00083 0f 29 94 24 80
00 00 00 movaps XMMWORD PTR [128+rsp], xmm2
0008b 0f 29 9c 24 90
00 00 00 movaps XMMWORD PTR [144+rsp], xmm3
.B14.3::
00093 4d 89 f8 mov r8, r15
00096 48 8d 4c 24 20 lea rcx, QWORD PTR [32+rsp]
0009b e8 fc ff ff ff call _intel_fast_memcpy
.B14.4::
000a0 4d 85 ff test r15, r15
000a3 76 5b jbe .B14.11
.B14.5::
000a5 4c 89 f8 mov rax, r15
000a8 b9 01 00 00 00 mov ecx, 1
000ad 48 d1 e8 shr rax, 1
000b0 33 d2 xor edx, edx
000b2 48 85 c0 test rax, rax
000b5 76 33 jbe .B14.9
.B14.7::
000b7 8d 0c 12 lea ecx, DWORD PTR [rdx+rdx]
000ba 41 89 ca mov r10d, ecx
000bd f7 d9 neg ecx
000bf 46 8a 5c 14 20 mov r11b, BYTE PTR [32+rsp+r10]
000c4 44 8d 49 3f lea r9d, DWORD PTR [63+rcx]
000c8 83 c1 3e add ecx, 62
000cb 46 88 5c 0c 60 mov BYTE PTR [96+rsp+r9], r11b
000d0 44 8d 4c 12 01 lea r9d, DWORD PTR [1+rdx+rdx]
000d5 ff c2 inc edx
000d7 48 3b d0 cmp rdx, rax
000da 46 8a 4c 0c 20 mov r9b, BYTE PTR [32+rsp+r9]
000df 44 88 4c 0c 60 mov BYTE PTR [96+rsp+rcx], r9b
000e4 72 d1 jb .B14.7
.B14.8::
000e6 8d 4c 12 01 lea ecx, DWORD PTR [1+rdx+rdx]
.B14.9::
000ea ff c9 dec ecx
000ec 89 c8 mov eax, ecx
000ee 49 3b c7 cmp rax, r15
000f1 73 0d jae .B14.11
.B14.10::
000f3 f7 d9 neg ecx
000f5 83 c1 3f add ecx, 63
000f8 8a 44 04 20 mov al, BYTE PTR [32+rsp+rax]
000fc 88 44 0c 60 mov BYTE PTR [96+rsp+rcx], al
.B14.11::
00100 66 0f 6f 44 24
20 movdqa xmm0, XMMWORD PTR [32+rsp]
00106 66 0f 6f 4c 24
40 movdqa xmm1, XMMWORD PTR [64+rsp]
0010c 66 44 0f 6f f8 movdqa xmm15, xmm0
00111 66 44 0f 60 f9 punpcklbw xmm15, xmm1
00116 66 45 0f ef f6 pxor xmm14, xmm14
0011b 66 0f 68 c1 punpckhbw xmm0, xmm1
0011f 66 0f 6f 4c 24
60 movdqa xmm1, XMMWORD PTR [96+rsp]
00125 66 0f 6f 94 24
80 00 00 00 movdqa xmm2, XMMWORD PTR [128+rsp]
0012e 66 44 0f 6f e9 movdqa xmm13, xmm1
00133 66 0f 6f 5c 24
30 movdqa xmm3, XMMWORD PTR [48+rsp]
00139 66 0f 6f 64 24
50 movdqa xmm4, XMMWORD PTR [80+rsp]
0013f 66 0f 6f 6c 24
70 movdqa xmm5, XMMWORD PTR [112+rsp]
00145 66 44 0f 60 ea punpcklbw xmm13, xmm2
0014a 66 0f 68 ca punpckhbw xmm1, xmm2
0014e 66 0f 6f d3 movdqa xmm2, xmm3
00152 44 0f 10 5c 24
20 movups xmm11, XMMWORD PTR [32+rsp]
00158 44 0f 10 64 24
60 movups xmm12, XMMWORD PTR [96+rsp]
0015e 66 44 0f 6f 94
24 90 00 00 00 movdqa xmm10, XMMWORD PTR [144+rsp]
00168 66 0f 60 d4 punpcklbw xmm2, xmm4
0016c 66 0f 68 dc punpckhbw xmm3, xmm4
00170 66 0f 6f e5 movdqa xmm4, xmm5
00174 66 41 0f 60 e2 punpcklbw xmm4, xmm10
00179 66 41 0f 68 ea punpckhbw xmm5, xmm10
0017e 66 45 0f ef d2 pxor xmm10, xmm10
00183 66 45 0f 38 dc
d3 aesenc xmm10, xmm11
00189 66 45 0f ef db pxor xmm11, xmm11
0018e 66 45 0f 38 dc
dc aesenc xmm11, xmm12
00194 66 45 0f ef e4 pxor xmm12, xmm12
00199 66 45 0f 38 dc
e7 aesenc xmm12, xmm15
0019f 44 0f 10 7c 24
30 movups xmm15, XMMWORD PTR [48+rsp]
001a5 66 45 0f 38 dc
f5 aesenc xmm14, xmm13
001ab 44 0f 10 6c 24
70 movups xmm13, XMMWORD PTR [112+rsp]
001b1 66 45 0f 38 dc
d7 aesenc xmm10, xmm15
001b7 44 0f 10 7c 24
40 movups xmm15, XMMWORD PTR [64+rsp]
001bd 66 45 0f 38 dc
dd aesenc xmm11, xmm13
001c3 44 0f 10 ac 24
80 00 00 00 movups xmm13, XMMWORD PTR [128+rsp]
001cc 66 45 0f 38 dc
d7 aesenc xmm10, xmm15
001d2 44 0f 10 7c 24
50 movups xmm15, XMMWORD PTR [80+rsp]
001d8 66 45 0f 38 dc
dd aesenc xmm11, xmm13
001de 44 0f 10 ac 24
90 00 00 00 movups xmm13, XMMWORD PTR [144+rsp]
001e7 66 44 0f 38 dc
e0 aesenc xmm12, xmm0
001ed 66 45 0f 38 dc
d7 aesenc xmm10, xmm15
001f3 66 45 0f 38 dc
dd aesenc xmm11, xmm13
001f9 66 44 0f 38 dc
e2 aesenc xmm12, xmm2
001ff 66 44 0f 38 dc
f1 aesenc xmm14, xmm1
00205 66 45 0f 38 dc
d3 aesenc xmm10, xmm11
0020b 66 44 0f 38 dc
e3 aesenc xmm12, xmm3
00211 66 44 0f 38 dc
f4 aesenc xmm14, xmm4
00217 66 45 0f 38 dc
d4 aesenc xmm10, xmm12
0021d 66 44 0f 38 dc
f5 aesenc xmm14, xmm5
00223 66 45 0f 38 dc
d6 aesenc xmm10, xmm14
00229 44 0f 11 15 00
00 00 00 movups XMMWORD PTR [?DDAES@@3PAEA], xmm10
00231 44 0f 28 94 24
f0 00 00 00 movaps xmm10, XMMWORD PTR [240+rsp]
0023a 44 0f 28 9c 24
e0 00 00 00 movaps xmm11, XMMWORD PTR [224+rsp]
00243 44 0f 28 a4 24
d0 00 00 00 movaps xmm12, XMMWORD PTR [208+rsp]
0024c 44 0f 28 ac 24
c0 00 00 00 movaps xmm13, XMMWORD PTR [192+rsp]
00255 44 0f 28 b4 24
b0 00 00 00 movaps xmm14, XMMWORD PTR [176+rsp]
0025e 44 0f 28 bc 24
a0 00 00 00 movaps xmm15, XMMWORD PTR [160+rsp]
00267 48 81 c4 00 01
00 00 add rsp, 256
0026e 41 5f pop r15
00270 c3 ret
00271 0f 1f 84 00 00
00 00 00 0f 1f
80 00 00 00 00 ALIGN 16
.B14.12::
?DoubleDeuceAES@@YAXPEBE_K@Z ENDP
Currently I am running heavy compression on 4GB and 10GB corpora using my DoubleDeuceAES, and I am very pleased so far...