Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion HashLib/src/Crypto/HlpBlake3Dispatch.pas
Original file line number Diff line number Diff line change
Expand Up @@ -579,7 +579,7 @@ procedure Blake3_HashMany_Scalar(AInput, AKey, AOut: Pointer;
// Process 16 blocks per chunk
for LBlock := 0 to 15 do
begin
// Convert block bytes to words (little-endian, which is native on x86)
// Convert block bytes to words (assume little-endian)
System.Move(LPInput^, LBlockWords[0], 64);

// Set flags for this block
Expand Down
8 changes: 6 additions & 2 deletions HashLib/src/Include/HashLib.inc
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,10 @@

{============================ Common SIMD Settings ============================}

// Define HASHLIB_SYSV_X64_ABI for x86-64 targets that are not Windows.
// NOTE(review): presumably consumed by the SIMD include files to select
// System V AMD64 register/stack handling - confirm at the use sites.
{$IF DEFINED(HASHLIB_X86_64) AND NOT DEFINED(HASHLIB_MSWINDOWS)}
{$DEFINE HASHLIB_SYSV_X64_ABI}
{$IFEND}

// Uncomment to force scalar dispatch (available on all platforms):
// {$DEFINE HASHLIB_FORCE_SCALAR}

Expand All @@ -158,7 +162,7 @@
OR (DEFINED(HASHLIB_FORCE_SSE3) AND (DEFINED(HASHLIB_FORCE_SSSE3) OR DEFINED(HASHLIB_FORCE_SSE41) OR DEFINED(HASHLIB_FORCE_SSE42)))
OR (DEFINED(HASHLIB_FORCE_SSSE3) AND (DEFINED(HASHLIB_FORCE_SSE41) OR DEFINED(HASHLIB_FORCE_SSE42)))
OR (DEFINED(HASHLIB_FORCE_SSE41) AND DEFINED(HASHLIB_FORCE_SSE42))}
{$MESSAGE ERROR 'Only one HASHLIB_FORCE_* (x86 level) define may be enabled at a time.'}
{$MESSAGE ERROR 'Only one HASHLIB_FORCE_* (X86 Level) define may be enabled at a time.'}
{$IFEND}

{$ENDIF}
Expand All @@ -171,7 +175,7 @@
{$IF (DEFINED(HASHLIB_FORCE_SCALAR) AND DEFINED(HASHLIB_FORCE_NEON))
OR (DEFINED(HASHLIB_FORCE_SCALAR) AND DEFINED(HASHLIB_FORCE_SVE))
OR (DEFINED(HASHLIB_FORCE_NEON) AND DEFINED(HASHLIB_FORCE_SVE))}
{$MESSAGE ERROR 'Only one HASHLIB_FORCE_* define may be enabled at a time.'}
{$MESSAGE ERROR 'Only one HASHLIB_FORCE_* (Arm Level) define may be enabled at a time.'}
{$IFEND}

{$ENDIF}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
// ASums layout: [SumA: UInt32, SumB: UInt32].
// Constants layout: [weights: 32B, ones_16: 32B] at offsets 0 and 32.
// Processes num_blocks x 32-byte blocks. Does NOT apply mod 65521 (caller does it).
// Uses ymm0-ymm5 only (all volatile on Windows x64, no saves needed).
// Uses ymm0-ymm5 only (volatile under both the MS x64 and System V AMD64 ABIs; no saves needed).
// Weights and ones are reloaded from memory each iteration to avoid
// using non-volatile ymm registers.
// AVX/AVX2 instructions are db-encoded for broad assembler compatibility.
Expand Down
43 changes: 9 additions & 34 deletions HashLib/src/Include/Simd/Adler32/Adler32BlocksSse2_i386.inc
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,17 @@
// Processes num_blocks x 32-byte blocks; caller applies mod 65521.
//
// x64 uses xmm6-xmm9 for widened weights; IA-32 only has xmm0-xmm7, so widened weights
// live on stack (4 x 16 bytes). xmm6–xmm7 are non-volatile on Win32 and are saved there.
// live on the stack (4 x 16 bytes). xmm6–xmm7 are volatile on i386 but are saved/restored defensively anyway.
//
// Same SSE2 emulation as Adler32BlocksSse2_x86_64.inc (punpcklbw/hbw + pmaddwd).
//
// Stack layout (sub esp, 96):
// [esp + 0..15]: xmm6 save
// [esp + 16..31]: xmm7 save
// [esp + 32..47]: w0 (weights_hi low)
// [esp + 48..63]: w1 (weights_hi high)
// [esp + 64..79]: w2 (weights_lo low)
// [esp + 80..95]: w3 (weights_lo high)

// Preserve constants pointer (eax) before GPR reloads from ASums
push eax
Expand All @@ -23,7 +31,6 @@
pxor xmm0, xmm0 // v_s1 = 0
pxor xmm3, xmm3 // zero for unpack / psadbw

{$IFDEF MSWINDOWS}
sub esp, 96
movdqu oword ptr [esp], xmm6
movdqu oword ptr [esp + $10], xmm7
Expand All @@ -43,24 +50,6 @@
movdqa xmm6, xmm5
punpckhbw xmm6, xmm3
movdqu oword ptr [esp + $50], xmm6 // w3
{$ELSE}
sub esp, 64
mov edx, dword ptr [esp + 64]
movdqu xmm4, oword ptr [edx]
movdqa xmm6, xmm4
punpcklbw xmm6, xmm3
movdqu oword ptr [esp], xmm6
movdqa xmm6, xmm4
punpckhbw xmm6, xmm3
movdqu oword ptr [esp + $10], xmm6
movdqu xmm5, oword ptr [edx + 16]
movdqa xmm6, xmm5
punpcklbw xmm6, xmm3
movdqu oword ptr [esp + $20], xmm6
movdqa xmm6, xmm5
punpckhbw xmm6, xmm3
movdqu oword ptr [esp + $30], xmm6
{$ENDIF}

@adler32_sse2_loop:
paddd xmm2, xmm0
Expand All @@ -74,13 +63,8 @@
movdqa xmm5, xmm4
punpcklbw xmm5, xmm3
punpckhbw xmm4, xmm3
{$IFDEF MSWINDOWS}
movdqu xmm6, oword ptr [esp + $20]
movdqu xmm7, oword ptr [esp + $30]
{$ELSE}
movdqu xmm6, oword ptr [esp]
movdqu xmm7, oword ptr [esp + $10]
{$ENDIF}
pmaddwd xmm5, xmm6
pmaddwd xmm4, xmm7
paddd xmm5, xmm4
Expand All @@ -95,13 +79,8 @@
movdqa xmm5, xmm4
punpcklbw xmm5, xmm3
punpckhbw xmm4, xmm3
{$IFDEF MSWINDOWS}
movdqu xmm6, oword ptr [esp + $40]
movdqu xmm7, oword ptr [esp + $50]
{$ELSE}
movdqu xmm6, oword ptr [esp + $20]
movdqu xmm7, oword ptr [esp + $30]
{$ENDIF}
pmaddwd xmm5, xmm6
pmaddwd xmm4, xmm7
paddd xmm5, xmm4
Expand Down Expand Up @@ -130,13 +109,9 @@
mov dword ptr [edi], eax
mov dword ptr [edi + 4], esi

{$IFDEF MSWINDOWS}
movdqu xmm6, oword ptr [esp]
movdqu xmm7, oword ptr [esp + $10]
add esp, 96
{$ELSE}
add esp, 64
{$ENDIF}
add esp, 4 // discard saved AConstants
pop edi
pop esi
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,18 @@
// ASums layout: [SumA: UInt32, SumB: UInt32].
// Constants layout: [weights_hi: 16B, weights_lo: 16B] (only first 32 bytes used).
// Processes num_blocks x 32-byte blocks. Does NOT apply mod 65521 (caller does it).
// Uses xmm0-xmm9 (xmm6-xmm9 saved/restored on Windows).
// Uses xmm0-xmm9; xmm6-xmm9 are MS x64 non-volatile (saved/restored).
//
// Emulates SSSE3 pmaddubsw via punpcklbw/punpckhbw + pmaddwd:
// data bytes are zero-extended to i16, then multiplied with pre-widened
// weight bytes via pmaddwd (SSE2), producing the same 4 x i32 weighted
// sums per 16-byte half that pmaddubsw + pmaddwd would yield.

{$IFDEF MSWINDOWS}
sub rsp, 64
movdqu oword [rsp], xmm6
movdqu oword [rsp + $10], xmm7
movdqu oword [rsp + $20], xmm8
movdqu oword [rsp + $30], xmm9
{$ENDIF}

// Zero constant
pxor xmm3, xmm3
Expand Down Expand Up @@ -104,10 +102,8 @@
mov dword [r8], eax
mov dword [r8 + 4], r10d

{$IFDEF MSWINDOWS}
movdqu xmm6, oword [rsp]
movdqu xmm7, oword [rsp + $10]
movdqu xmm8, oword [rsp + $20]
movdqu xmm9, oword [rsp + $30]
add rsp, 64
{$ENDIF}
8 changes: 2 additions & 6 deletions HashLib/src/Include/Simd/Adler32/Adler32BlocksSsse3_i386.inc
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
// Constants: [weights_hi: 16B, weights_lo: 16B, ones_16: 16B] (48 bytes; same as x64 SSSE3).
// No xmm8 on IA-32: psadbw uses a copy in xmm5 (first half) or xmm4 (second half), then reload weights.
// Caller applies mod 65521.
//
// xmm6–xmm7 are volatile on i386 but are saved/restored defensively anyway.

push eax

Expand All @@ -17,14 +19,10 @@
pxor xmm0, xmm0
pxor xmm3, xmm3

{$IFDEF MSWINDOWS}
sub esp, 32
movdqu oword ptr [esp], xmm6
movdqu oword ptr [esp + $10], xmm7
mov edx, dword ptr [esp + 32]
{$ELSE}
mov edx, dword ptr [esp]
{$ENDIF}

movdqu xmm4, oword ptr [edx]
movdqu xmm5, oword ptr [edx + 16]
Expand Down Expand Up @@ -76,11 +74,9 @@
mov dword ptr [edi], eax
mov dword ptr [edi + 4], esi

{$IFDEF MSWINDOWS}
movdqu xmm6, oword ptr [esp]
movdqu xmm7, oword ptr [esp + $10]
add esp, 32
{$ENDIF}
add esp, 4
pop edi
pop esi
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,12 @@
// ASums layout: [SumA: UInt32, SumB: UInt32].
// Constants layout: [weights_hi: 16B, weights_lo: 16B, ones_16: 16B].
// Processes num_blocks x 32-byte blocks. Does NOT apply mod 65521 (caller does it).
// Uses xmm0-xmm8 (xmm6-xmm8 saved/restored on Windows).
// Uses xmm0-xmm8; xmm6-xmm8 are MS x64 non-volatile (saved/restored).

{$IFDEF MSWINDOWS}
sub rsp, 48
movdqu oword [rsp], xmm6
movdqu oword [rsp + $10], xmm7
movdqu oword [rsp + $20], xmm8
{$ENDIF}

// Load constants
movdqu xmm4, oword [r9]
Expand Down Expand Up @@ -81,9 +79,7 @@
mov dword [r8], eax
mov dword [r8 + 4], r10d

{$IFDEF MSWINDOWS}
movdqu xmm6, oword [rsp]
movdqu xmm7, oword [rsp + $10]
movdqu xmm8, oword [rsp + $20]
add rsp, 48
{$ENDIF}
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
// AVX/AVX2 instructions are db-encoded for broad assembler compatibility.
// Expects MS x64 ABI: rcx = Left ptr, rdx = Right ptr, r8 = Current ptr, r9 = WithXor (0 or 1).
// Each pointer addresses 128 QWords (1024 bytes).
// Uses ymm0-ymm9. Non-volatile ymm6-ymm9 saved/restored on Windows.
// Uses ymm0-ymm9; ymm6-ymm9 are MS x64 non-volatile (saved/restored).
// Register map during G rounds: ymm0 = A(v0..v3), ymm1 = B(v4..v7),
// ymm2 = C(v8..v11), ymm3 = D(v12..v15), ymm4-ymm5 = temps.
// Stack layout (sub rsp, 2184):
// [rsp+0..127] ymm6-9 save area (Windows only, 4 * 32 = 128 bytes)
// [rsp+0..127] ymm6-9 save area (4 * 32 = 128 bytes)
// [rsp+128..1151] R_buf (1024 bytes)
// [rsp+1152..2175] Z_buf (1024 bytes)
// [rsp+2176..2183] alignment padding
Expand All @@ -17,12 +17,10 @@

sub rsp, 2184

{$IFDEF MSWINDOWS}
db $C5, $FE, $7F, $34, $24 // vmovdqu yword [rsp], ymm6
db $C5, $FE, $7F, $7C, $24, $20 // vmovdqu yword [rsp + $20], ymm7
db $C5, $7E, $7F, $44, $24, $40 // vmovdqu yword [rsp + $40], ymm8
db $C5, $7E, $7F, $4C, $24, $60 // vmovdqu yword [rsp + $60], ymm9
{$ENDIF}

// =========================================================================
// Step 1: Compute R_buf = Left XOR Right, store at [rsp+128]
Expand Down Expand Up @@ -328,12 +326,10 @@
jb @final_xor_loop

@epilogue:
{$IFDEF MSWINDOWS}
db $C5, $FE, $6F, $34, $24 // vmovdqu ymm6, yword [rsp]
db $C5, $FE, $6F, $7C, $24, $20 // vmovdqu ymm7, yword [rsp + $20]
db $C5, $7E, $6F, $44, $24, $40 // vmovdqu ymm8, yword [rsp + $40]
db $C5, $7E, $6F, $4C, $24, $60 // vmovdqu ymm9, yword [rsp + $60]
{$ENDIF}

add rsp, 2184
db $C5, $F8, $77 // vzeroupper
6 changes: 1 addition & 5 deletions HashLib/src/Include/Simd/Argon2/Argon2FillBlockSse2_i386.inc
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// IA-32: after SimdProc4Begin_i386 — ebx, esi, edi, eax = Left, Right, Current, WithXor
// (parallel to MS x64 ABI: rcx, rdx, r8, r9).
// Each pointer addresses 128 QWords (1024 bytes).
// Uses xmm0–xmm7 only. Non-volatile xmm6–xmm7 saved/restored on Windows (MSWINDOWS).
// Uses xmm0–xmm7; xmm6–xmm7 are volatile on i386 but are saved/restored defensively anyway.
// Register map during G rounds: xmm0-1 = A(0..3), xmm2-3 = B(4..7),
// xmm4-5 = C(8..11), xmm6-7 = D(12..15) / temps (same roles as x64, fewer XMM).
// IA-32 stack (sub esp, 2132): WithXor at [esp+2128]; spill slots [esp+2080],[esp+2096],[esp+2112];
Expand All @@ -15,10 +15,8 @@

mov dword ptr [esp + 2128], eax

{$IFDEF MSWINDOWS}
movdqu oword ptr [esp], xmm6
movdqu oword ptr [esp + 16], xmm7
{$ENDIF}

// =========================================================================
// Step 1: Compute R_buf = Left XOR Right, store at [esp+32]
Expand Down Expand Up @@ -729,10 +727,8 @@
jb @final_xor_loop

@epilogue:
{$IFDEF MSWINDOWS}
movdqu xmm6, oword ptr [esp]
movdqu xmm7, oword ptr [esp + 16]
{$ENDIF}

add esp, 2132
pop edi
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
// SSE2 implementation of Argon2 FillBlock.
// Expects MS x64 ABI: rcx = Left ptr, rdx = Right ptr, r8 = Current ptr, r9 = WithXor (0 or 1).
// Each pointer addresses 128 QWords (1024 bytes).
// Uses xmm0-xmm9. Non-volatile xmm6-xmm9 saved/restored on Windows.
// Uses xmm0-xmm9; xmm6-xmm9 are MS x64 non-volatile (saved/restored).
// Register map during G rounds: xmm0-1 = A(0..3), xmm2-3 = B(4..7),
// xmm4-5 = C(8..11), xmm6-7 = D(12..15), xmm8-9 = temps.
// Stack layout (sub rsp, 2120):
// [rsp+0..63] xmm6-9 save area (Windows only)
// [rsp+0..63] xmm6-9 save area
// [rsp+64..1087] R_buf (1024 bytes)
// [rsp+1088..2111] Z_buf (1024 bytes)
// [rsp+2112..2119] alignment padding
Expand All @@ -18,12 +18,10 @@

sub rsp, 2120

{$IFDEF MSWINDOWS}
movdqu oword [rsp], xmm6
movdqu oword [rsp + $10], xmm7
movdqu oword [rsp + $20], xmm8
movdqu oword [rsp + $30], xmm9
{$ENDIF}

// =========================================================================
// Step 1: Compute R_buf = Left XOR Right, store at [rsp+64]
Expand Down Expand Up @@ -558,11 +556,9 @@
jb @final_xor_loop

@epilogue:
{$IFDEF MSWINDOWS}
movdqu xmm6, oword [rsp]
movdqu xmm7, oword [rsp + $10]
movdqu xmm8, oword [rsp + $20]
movdqu xmm9, oword [rsp + $30]
{$ENDIF}

add rsp, 2120
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// AVX2 implementation of BLAKE2b compress (fully unrolled 12 rounds).
// Expects MS x64 ABI: rcx = state ptr, rdx = msg ptr, r8 = counter+flags ptr, r9 = IV ptr.
// Uses ymm0-ymm5 only (all volatile on Windows x64).
// Uses ymm0-ymm5 only (volatile under both the MS x64 and System V AMD64 ABIs; no saves needed).
// Register map: ymm0 = a (v0-3), ymm1 = b (v4-7), ymm2 = c (v8-11), ymm3 = d (v12-15),
// ymm4 = message temp, ymm5 = computation temp.
// Rotations: ROT32 via vpshufd, ROT16/24/63 via shift+or.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
// SSE2 implementation of BLAKE2b compress (fully unrolled 12 rounds).
// Expects MS x64 ABI: rcx = state ptr, rdx = msg ptr, r8 = counter+flags ptr, r9 = IV ptr.
// Uses xmm0-xmm9. Non-volatile xmm6-xmm9 saved/restored on Windows.
// Uses xmm0-xmm9; xmm6-xmm9 are MS x64 non-volatile (saved/restored).
// Register map: xmm0-1 = row1 (v0-3), xmm2-3 = row2 (v4-7),
// xmm4-5 = row3 (v8-11), xmm6-7 = row4 (v12-15), xmm8-9 = temps.
// Reference: BLAKE2/BLAKE2 sse/ by Samuel Neves.

{$IFDEF MSWINDOWS}
sub rsp, 64
movdqu oword [rsp], xmm6
movdqu oword [rsp + $10], xmm7
movdqu oword [rsp + $20], xmm8
movdqu oword [rsp + $30], xmm9
{$ENDIF}

// Initialize working vector
movdqu xmm0, oword [rcx]
Expand Down Expand Up @@ -1995,10 +1993,8 @@
movdqu oword [rcx + $20], xmm2
movdqu oword [rcx + $30], xmm3

{$IFDEF MSWINDOWS}
movdqu xmm6, oword [rsp]
movdqu xmm7, oword [rsp + $10]
movdqu xmm8, oword [rsp + $20]
movdqu xmm9, oword [rsp + $30]
add rsp, 64
{$ENDIF}
Loading
Loading