diff --git a/HashLib/src/Crypto/HlpBlake3Dispatch.pas b/HashLib/src/Crypto/HlpBlake3Dispatch.pas index 9f9c732..869e5de 100644 --- a/HashLib/src/Crypto/HlpBlake3Dispatch.pas +++ b/HashLib/src/Crypto/HlpBlake3Dispatch.pas @@ -579,7 +579,7 @@ procedure Blake3_HashMany_Scalar(AInput, AKey, AOut: Pointer; // Process 16 blocks per chunk for LBlock := 0 to 15 do begin - // Convert block bytes to words (little-endian, which is native on x86) + // Convert block bytes to words (assume little-endian) System.Move(LPInput^, LBlockWords[0], 64); // Set flags for this block diff --git a/HashLib/src/Include/HashLib.inc b/HashLib/src/Include/HashLib.inc index c139edd..25d19cb 100644 --- a/HashLib/src/Include/HashLib.inc +++ b/HashLib/src/Include/HashLib.inc @@ -134,6 +134,10 @@ {============================ Common SIMD Settings ============================} +{$IF DEFINED(HASHLIB_X86_64) AND NOT DEFINED(HASHLIB_MSWINDOWS)} + {$DEFINE HASHLIB_SYSV_X64_ABI} +{$IFEND} + // Uncomment to force scalar dispatch (available on all platforms): // {$DEFINE HASHLIB_FORCE_SCALAR} @@ -158,7 +162,7 @@ OR (DEFINED(HASHLIB_FORCE_SSE3) AND (DEFINED(HASHLIB_FORCE_SSSE3) OR DEFINED(HASHLIB_FORCE_SSE41) OR DEFINED(HASHLIB_FORCE_SSE42))) OR (DEFINED(HASHLIB_FORCE_SSSE3) AND (DEFINED(HASHLIB_FORCE_SSE41) OR DEFINED(HASHLIB_FORCE_SSE42))) OR (DEFINED(HASHLIB_FORCE_SSE41) AND DEFINED(HASHLIB_FORCE_SSE42))} - {$MESSAGE ERROR 'Only one HASHLIB_FORCE_* (x86 level) define may be enabled at a time.'} + {$MESSAGE ERROR 'Only one HASHLIB_FORCE_* (X86 Level) define may be enabled at a time.'} {$IFEND} {$ENDIF} @@ -171,7 +175,7 @@ {$IF (DEFINED(HASHLIB_FORCE_SCALAR) AND DEFINED(HASHLIB_FORCE_NEON)) OR (DEFINED(HASHLIB_FORCE_SCALAR) AND DEFINED(HASHLIB_FORCE_SVE)) OR (DEFINED(HASHLIB_FORCE_NEON) AND DEFINED(HASHLIB_FORCE_SVE))} - {$MESSAGE ERROR 'Only one HASHLIB_FORCE_* define may be enabled at a time.'} + {$MESSAGE ERROR 'Only one HASHLIB_FORCE_* (Arm Level) define may be enabled at a time.'} {$IFEND} {$ENDIF} diff --git a/HashLib/src/Include/Simd/Adler32/Adler32BlocksAvx2_x86_64.inc b/HashLib/src/Include/Simd/Adler32/Adler32BlocksAvx2_x86_64.inc index a9892a8..aeff0cc 100644 --- a/HashLib/src/Include/Simd/Adler32/Adler32BlocksAvx2_x86_64.inc +++ b/HashLib/src/Include/Simd/Adler32/Adler32BlocksAvx2_x86_64.inc @@ -3,7 +3,7 @@ // ASums layout: [SumA: UInt32, SumB: UInt32]. // Constants layout: [weights: 32B, ones_16: 32B] at offsets 0 and 32. // Processes num_blocks x 32-byte blocks. Does NOT apply mod 65521 (caller does it). -// Uses ymm0-ymm5 only (all volatile on Windows x64, no saves needed). +// Uses ymm0-ymm5 only (volatile under both ABIs; no saves needed). // Weights and ones are reloaded from memory each iteration to avoid // using non-volatile ymm registers. // AVX/AVX2 instructions are db-encoded for broad assembler compatibility. diff --git a/HashLib/src/Include/Simd/Adler32/Adler32BlocksSse2_i386.inc b/HashLib/src/Include/Simd/Adler32/Adler32BlocksSse2_i386.inc index 62a2346..6e9b1da 100644 --- a/HashLib/src/Include/Simd/Adler32/Adler32BlocksSse2_i386.inc +++ b/HashLib/src/Include/Simd/Adler32/Adler32BlocksSse2_i386.inc @@ -4,9 +4,17 @@ // Processes num_blocks x 32-byte blocks; caller applies mod 65521. // // x64 uses xmm6-xmm9 for widened weights; IA-32 only has xmm0-xmm7, so widened weights -// live on stack (4 x 16 bytes). xmm6–xmm7 are non-volatile on Win32 and are saved there. +// live on stack (4 x 16 bytes). xmm6–xmm7 saved/restored defensively (volatile on i386). // // Same SSE2 emulation as Adler32BlocksSse2_x86_64.inc (punpcklbw/hbw + pmaddwd). +// +// Stack layout (sub esp, 96): +// [esp + 0..15]: xmm6 save +// [esp + 16..31]: xmm7 save +// [esp + 32..47]: w0 (weights_hi low) +// [esp + 48..63]: w1 (weights_hi high) +// [esp + 64..79]: w2 (weights_lo low) +// [esp + 80..95]: w3 (weights_lo high) // Preserve constants pointer (eax) before GPR reloads from ASums push eax @@ -23,7 +31,6 @@ pxor xmm0, xmm0 // v_s1 = 0 pxor xmm3, xmm3 // zero for unpack / psadbw -{$IFDEF MSWINDOWS} sub esp, 96 movdqu oword ptr [esp], xmm6 movdqu oword ptr [esp + $10], xmm7 @@ -43,24 +50,6 @@ movdqa xmm6, xmm5 punpckhbw xmm6, xmm3 movdqu oword ptr [esp + $50], xmm6 // w3 -{$ELSE} - sub esp, 64 - mov edx, dword ptr [esp + 64] - movdqu xmm4, oword ptr [edx] - movdqa xmm6, xmm4 - punpcklbw xmm6, xmm3 - movdqu oword ptr [esp], xmm6 - movdqa xmm6, xmm4 - punpckhbw xmm6, xmm3 - movdqu oword ptr [esp + $10], xmm6 - movdqu xmm5, oword ptr [edx + 16] - movdqa xmm6, xmm5 - punpcklbw xmm6, xmm3 - movdqu oword ptr [esp + $20], xmm6 - movdqa xmm6, xmm5 - punpckhbw xmm6, xmm3 - movdqu oword ptr [esp + $30], xmm6 -{$ENDIF} @adler32_sse2_loop: paddd xmm2, xmm0 @@ -74,13 +63,8 @@ movdqa xmm5, xmm4 punpcklbw xmm5, xmm3 punpckhbw xmm4, xmm3 -{$IFDEF MSWINDOWS} movdqu xmm6, oword ptr [esp + $20] movdqu xmm7, oword ptr [esp + $30] -{$ELSE} - movdqu xmm6, oword ptr [esp] - movdqu xmm7, oword ptr [esp + $10] -{$ENDIF} pmaddwd xmm5, xmm6 pmaddwd xmm4, xmm7 paddd xmm5, xmm4 @@ -95,13 +79,8 @@ movdqa xmm5, xmm4 punpcklbw xmm5, xmm3 punpckhbw xmm4, xmm3 -{$IFDEF MSWINDOWS} movdqu xmm6, oword ptr [esp + $40] movdqu xmm7, oword ptr [esp + $50] -{$ELSE} - movdqu xmm6, oword ptr [esp + $20] - movdqu xmm7, oword ptr [esp + $30] -{$ENDIF} pmaddwd xmm5, xmm6 pmaddwd xmm4, xmm7 paddd xmm5, xmm4 @@ -130,13 +109,9 @@ mov dword ptr [edi], eax mov dword ptr [edi + 4], esi -{$IFDEF MSWINDOWS} movdqu xmm6, oword ptr [esp] movdqu xmm7, oword ptr [esp + $10] add esp, 96 -{$ELSE} - add esp, 64 -{$ENDIF} add esp, 4 // discard saved AConstants pop edi pop esi diff --git a/HashLib/src/Include/Simd/Adler32/Adler32BlocksSse2_x86_64.inc b/HashLib/src/Include/Simd/Adler32/Adler32BlocksSse2_x86_64.inc index 9ea835f..d42e5ce 100644 --- a/HashLib/src/Include/Simd/Adler32/Adler32BlocksSse2_x86_64.inc +++ b/HashLib/src/Include/Simd/Adler32/Adler32BlocksSse2_x86_64.inc @@ -3,20 +3,18 @@ // ASums layout: [SumA: UInt32, SumB: UInt32]. // Constants layout: [weights_hi: 16B, weights_lo: 16B] (only first 32 bytes used). // Processes num_blocks x 32-byte blocks. Does NOT apply mod 65521 (caller does it). -// Uses xmm0-xmm9 (xmm6-xmm9 saved/restored on Windows). +// Uses xmm0-xmm9; xmm6-xmm9 are MS x64 non-volatile (saved/restored). // // Emulates SSSE3 pmaddubsw via punpcklbw/punpckhbw + pmaddwd: // data bytes are zero-extended to i16, then multiplied with pre-widened // weight bytes via pmaddwd (SSE2), producing the same 4 x i32 weighted // sums per 16-byte half that pmaddubsw + pmaddwd would yield. -{$IFDEF MSWINDOWS} sub rsp, 64 movdqu oword [rsp], xmm6 movdqu oword [rsp + $10], xmm7 movdqu oword [rsp + $20], xmm8 movdqu oword [rsp + $30], xmm9 -{$ENDIF} // Zero constant pxor xmm3, xmm3 @@ -104,10 +102,8 @@ mov dword [r8], eax mov dword [r8 + 4], r10d -{$IFDEF MSWINDOWS} movdqu xmm6, oword [rsp] movdqu xmm7, oword [rsp + $10] movdqu xmm8, oword [rsp + $20] movdqu xmm9, oword [rsp + $30] add rsp, 64 -{$ENDIF} diff --git a/HashLib/src/Include/Simd/Adler32/Adler32BlocksSsse3_i386.inc b/HashLib/src/Include/Simd/Adler32/Adler32BlocksSsse3_i386.inc index 49d9fcb..f176d2b 100644 --- a/HashLib/src/Include/Simd/Adler32/Adler32BlocksSsse3_i386.inc +++ b/HashLib/src/Include/Simd/Adler32/Adler32BlocksSsse3_i386.inc @@ -3,6 +3,8 @@ // Constants: [weights_hi: 16B, weights_lo: 16B, ones_16: 16B] (48 bytes; same as x64 SSSE3). // No xmm8 on IA-32: psadbw uses a copy in xmm5 (first half) or xmm4 (second half), then reload weights. // Caller applies mod 65521. +// +// xmm6–xmm7 saved/restored defensively (volatile on i386). push eax @@ -17,14 +19,10 @@ pxor xmm0, xmm0 pxor xmm3, xmm3 -{$IFDEF MSWINDOWS} sub esp, 32 movdqu oword ptr [esp], xmm6 movdqu oword ptr [esp + $10], xmm7 mov edx, dword ptr [esp + 32] -{$ELSE} - mov edx, dword ptr [esp] -{$ENDIF} movdqu xmm4, oword ptr [edx] movdqu xmm5, oword ptr [edx + 16] @@ -76,11 +74,9 @@ mov dword ptr [edi], eax mov dword ptr [edi + 4], esi -{$IFDEF MSWINDOWS} movdqu xmm6, oword ptr [esp] movdqu xmm7, oword ptr [esp + $10] add esp, 32 -{$ENDIF} add esp, 4 pop edi pop esi diff --git a/HashLib/src/Include/Simd/Adler32/Adler32BlocksSsse3_x86_64.inc b/HashLib/src/Include/Simd/Adler32/Adler32BlocksSsse3_x86_64.inc index d79fc51..0e07a73 100644 --- a/HashLib/src/Include/Simd/Adler32/Adler32BlocksSsse3_x86_64.inc +++ b/HashLib/src/Include/Simd/Adler32/Adler32BlocksSsse3_x86_64.inc @@ -3,14 +3,12 @@ // ASums layout: [SumA: UInt32, SumB: UInt32]. // Constants layout: [weights_hi: 16B, weights_lo: 16B, ones_16: 16B]. // Processes num_blocks x 32-byte blocks. Does NOT apply mod 65521 (caller does it). -// Uses xmm0-xmm8 (xmm6-xmm8 saved/restored on Windows). +// Uses xmm0-xmm8; xmm6-xmm8 are MS x64 non-volatile (saved/restored). -{$IFDEF MSWINDOWS} sub rsp, 48 movdqu oword [rsp], xmm6 movdqu oword [rsp + $10], xmm7 movdqu oword [rsp + $20], xmm8 -{$ENDIF} // Load constants movdqu xmm4, oword [r9] @@ -81,9 +79,7 @@ mov dword [r8], eax mov dword [r8 + 4], r10d -{$IFDEF MSWINDOWS} movdqu xmm6, oword [rsp] movdqu xmm7, oword [rsp + $10] movdqu xmm8, oword [rsp + $20] add rsp, 48 -{$ENDIF} diff --git a/HashLib/src/Include/Simd/Argon2/Argon2FillBlockAvx2_x86_64.inc b/HashLib/src/Include/Simd/Argon2/Argon2FillBlockAvx2_x86_64.inc index 468f5bb..6c3597e 100644 --- a/HashLib/src/Include/Simd/Argon2/Argon2FillBlockAvx2_x86_64.inc +++ b/HashLib/src/Include/Simd/Argon2/Argon2FillBlockAvx2_x86_64.inc @@ -2,11 +2,11 @@ // AVX/AVX2 instructions are db-encoded for broad assembler compatibility. // Expects MS x64 ABI: rcx = Left ptr, rdx = Right ptr, r8 = Current ptr, r9 = WithXor (0 or 1). // Each pointer addresses 128 QWords (1024 bytes). -// Uses ymm0-ymm9. Non-volatile ymm6-ymm9 saved/restored on Windows. +// Uses ymm0-ymm9; ymm6-ymm9 are MS x64 non-volatile (saved/restored). // Register map during G rounds: ymm0 = A(v0..v3), ymm1 = B(v4..v7), // ymm2 = C(v8..v11), ymm3 = D(v12..v15), ymm4-ymm5 = temps. // Stack layout (sub rsp, 2184): -// [rsp+0..127] ymm6-9 save area (Windows only, 4 * 32 = 128 bytes) +// [rsp+0..127] ymm6-9 save area (4 * 32 = 128 bytes) // [rsp+128..1151] R_buf (1024 bytes) // [rsp+1152..2175] Z_buf (1024 bytes) // [rsp+2176..2183] alignment padding @@ -17,12 +17,10 @@ sub rsp, 2184 -{$IFDEF MSWINDOWS} db $C5, $FE, $7F, $34, $24 // vmovdqu yword [rsp], ymm6 db $C5, $FE, $7F, $7C, $24, $20 // vmovdqu yword [rsp + $20], ymm7 db $C5, $7E, $7F, $44, $24, $40 // vmovdqu yword [rsp + $40], ymm8 db $C5, $7E, $7F, $4C, $24, $60 // vmovdqu yword [rsp + $60], ymm9 -{$ENDIF} // ========================================================================= // Step 1: Compute R_buf = Left XOR Right, store at [rsp+128] @@ -328,12 +326,10 @@ jb @final_xor_loop @epilogue: -{$IFDEF MSWINDOWS} db $C5, $FE, $6F, $34, $24 // vmovdqu ymm6, yword [rsp] db $C5, $FE, $6F, $7C, $24, $20 // vmovdqu ymm7, yword [rsp + $20] db $C5, $7E, $6F, $44, $24, $40 // vmovdqu ymm8, yword [rsp + $40] db $C5, $7E, $6F, $4C, $24, $60 // vmovdqu ymm9, yword [rsp + $60] -{$ENDIF} add rsp, 2184 db $C5, $F8, $77 // vzeroupper diff --git a/HashLib/src/Include/Simd/Argon2/Argon2FillBlockSse2_i386.inc b/HashLib/src/Include/Simd/Argon2/Argon2FillBlockSse2_i386.inc index 4ca44d9..dac9771 100644 --- a/HashLib/src/Include/Simd/Argon2/Argon2FillBlockSse2_i386.inc +++ b/HashLib/src/Include/Simd/Argon2/Argon2FillBlockSse2_i386.inc @@ -2,7 +2,7 @@ // IA-32: after SimdProc4Begin_i386 — ebx, esi, edi, eax = Left, Right, Current, WithXor // (parallel to MS x64 ABI: rcx, rdx, r8, r9). // Each pointer addresses 128 QWords (1024 bytes). -// Uses xmm0–xmm7 only. Non-volatile xmm6–xmm7 saved/restored on Windows (MSWINDOWS). +// Uses xmm0–xmm7; xmm6–xmm7 saved/restored defensively (volatile on i386). // Register map during G rounds: xmm0-1 = A(0..3), xmm2-3 = B(4..7), // xmm4-5 = C(8..11), xmm6-7 = D(12..15) / temps (same roles as x64, fewer XMM). // IA-32 stack (sub esp, 2132): WithXor at [esp+2128]; spill slots [esp+2080],[esp+2096],[esp+2112]; @@ -15,10 +15,8 @@ mov dword ptr [esp + 2128], eax -{$IFDEF MSWINDOWS} movdqu oword ptr [esp], xmm6 movdqu oword ptr [esp + 16], xmm7 -{$ENDIF} // ========================================================================= // Step 1: Compute R_buf = Left XOR Right, store at [esp+32] @@ -729,10 +727,8 @@ jb @final_xor_loop @epilogue: -{$IFDEF MSWINDOWS} movdqu xmm6, oword ptr [esp] movdqu xmm7, oword ptr [esp + 16] -{$ENDIF} add esp, 2132 pop edi diff --git a/HashLib/src/Include/Simd/Argon2/Argon2FillBlockSse2_x86_64.inc b/HashLib/src/Include/Simd/Argon2/Argon2FillBlockSse2_x86_64.inc index b690a85..56557e7 100644 --- a/HashLib/src/Include/Simd/Argon2/Argon2FillBlockSse2_x86_64.inc +++ b/HashLib/src/Include/Simd/Argon2/Argon2FillBlockSse2_x86_64.inc @@ -1,11 +1,11 @@ // SSE2 implementation of Argon2 FillBlock. // Expects MS x64 ABI: rcx = Left ptr, rdx = Right ptr, r8 = Current ptr, r9 = WithXor (0 or 1). // Each pointer addresses 128 QWords (1024 bytes). -// Uses xmm0-xmm9. Non-volatile xmm6-xmm9 saved/restored on Windows. +// Uses xmm0-xmm9; xmm6-xmm9 are MS x64 non-volatile (saved/restored). // Register map during G rounds: xmm0-1 = A(0..3), xmm2-3 = B(4..7), // xmm4-5 = C(8..11), xmm6-7 = D(12..15), xmm8-9 = temps. // Stack layout (sub rsp, 2120): -// [rsp+0..63] xmm6-9 save area (Windows only) +// [rsp+0..63] xmm6-9 save area // [rsp+64..1087] R_buf (1024 bytes) // [rsp+1088..2111] Z_buf (1024 bytes) // [rsp+2112..2119] alignment padding @@ -18,12 +18,10 @@ sub rsp, 2120 -{$IFDEF MSWINDOWS} movdqu oword [rsp], xmm6 movdqu oword [rsp + $10], xmm7 movdqu oword [rsp + $20], xmm8 movdqu oword [rsp + $30], xmm9 -{$ENDIF} // ========================================================================= // Step 1: Compute R_buf = Left XOR Right, store at [rsp+64] @@ -558,11 +556,9 @@ jb @final_xor_loop @epilogue: -{$IFDEF MSWINDOWS} movdqu xmm6, oword [rsp] movdqu xmm7, oword [rsp + $10] movdqu xmm8, oword [rsp + $20] movdqu xmm9, oword [rsp + $30] -{$ENDIF} add rsp, 2120 diff --git a/HashLib/src/Include/Simd/Blake2B/Blake2BCompressAvx2_x86_64.inc b/HashLib/src/Include/Simd/Blake2B/Blake2BCompressAvx2_x86_64.inc index a8babb5..d507afc 100644 --- a/HashLib/src/Include/Simd/Blake2B/Blake2BCompressAvx2_x86_64.inc +++ b/HashLib/src/Include/Simd/Blake2B/Blake2BCompressAvx2_x86_64.inc @@ -1,6 +1,6 @@ // AVX2 implementation of BLAKE2b compress (fully unrolled 12 rounds). // Expects MS x64 ABI: rcx = state ptr, rdx = msg ptr, r8 = counter+flags ptr, r9 = IV ptr. -// Uses ymm0-ymm5 only (all volatile on Windows x64). +// Uses ymm0-ymm5 only (volatile under both ABIs; no saves needed). // Register map: ymm0 = a (v0-3), ymm1 = b (v4-7), ymm2 = c (v8-11), ymm3 = d (v12-15), // ymm4 = message temp, ymm5 = computation temp. // Rotations: ROT32 via vpshufd, ROT16/24/63 via shift+or. diff --git a/HashLib/src/Include/Simd/Blake2B/Blake2BCompressSse2_x86_64.inc b/HashLib/src/Include/Simd/Blake2B/Blake2BCompressSse2_x86_64.inc index 3345aac..0f4b66b 100644 --- a/HashLib/src/Include/Simd/Blake2B/Blake2BCompressSse2_x86_64.inc +++ b/HashLib/src/Include/Simd/Blake2B/Blake2BCompressSse2_x86_64.inc @@ -1,17 +1,15 @@ // SSE2 implementation of BLAKE2b compress (fully unrolled 12 rounds). // Expects MS x64 ABI: rcx = state ptr, rdx = msg ptr, r8 = counter+flags ptr, r9 = IV ptr. -// Uses xmm0-xmm9. Non-volatile xmm6-xmm9 saved/restored on Windows. +// Uses xmm0-xmm9; xmm6-xmm9 are MS x64 non-volatile (saved/restored). // Register map: xmm0-1 = row1 (v0-3), xmm2-3 = row2 (v4-7), // xmm4-5 = row3 (v8-11), xmm6-7 = row4 (v12-15), xmm8-9 = temps. // Reference: BLAKE2/BLAKE2 sse/ by Samuel Neves. -{$IFDEF MSWINDOWS} sub rsp, 64 movdqu oword [rsp], xmm6 movdqu oword [rsp + $10], xmm7 movdqu oword [rsp + $20], xmm8 movdqu oword [rsp + $30], xmm9 -{$ENDIF} // Initialize working vector movdqu xmm0, oword [rcx] @@ -1995,10 +1993,8 @@ movdqu oword [rcx + $20], xmm2 movdqu oword [rcx + $30], xmm3 -{$IFDEF MSWINDOWS} movdqu xmm6, oword [rsp] movdqu xmm7, oword [rsp + $10] movdqu xmm8, oword [rsp + $20] movdqu xmm9, oword [rsp + $30] add rsp, 64 -{$ENDIF} diff --git a/HashLib/src/Include/Simd/Blake2S/Blake2SCompressAvx2_x86_64.inc b/HashLib/src/Include/Simd/Blake2S/Blake2SCompressAvx2_x86_64.inc index 67c4eb5..7e45c52 100644 --- a/HashLib/src/Include/Simd/Blake2S/Blake2SCompressAvx2_x86_64.inc +++ b/HashLib/src/Include/Simd/Blake2S/Blake2SCompressAvx2_x86_64.inc @@ -1,16 +1,14 @@ // AVX2 implementation of BLAKE2s compress (VEX-encoded 128-bit, fully unrolled 10 rounds). // Expects MS x64 ABI: rcx = state ptr, rdx = msg ptr, r8 = counter+flags ptr, r9 = IV ptr. -// Uses xmm0-xmm6. Non-volatile xmm6 saved/restored on Windows. +// Uses xmm0-xmm6; xmm6 is MS x64 non-volatile (saved/restored). // Register map: xmm0 = a (v0-3), xmm1 = b (v4-7), xmm2 = c (v8-11), xmm3 = d (v12-15), // xmm4 = message buffer, xmm5 = temp, xmm6 = temp (msg load). // 3-operand VEX form eliminates movdqa copies needed in SSE2 rotations. // AVX/AVX2 instructions are db-encoded for broad assembler compatibility. // Reference: BLAKE2/BLAKE2 sse/ by Samuel Neves. -{$IFDEF MSWINDOWS} sub rsp, 16 db $C5, $FA, $7F, $34, $24 // vmovdqu oword [rsp], xmm6 -{$ENDIF} // Initialize working vector db $C5, $FA, $6F, $01 // vmovdqu xmm0, oword [rcx] @@ -994,9 +992,7 @@ db $C5, $FA, $7F, $01 // vmovdqu oword [rcx], xmm0 db $C5, $FA, $7F, $49, $10 // vmovdqu oword [rcx + $10], xmm1 -{$IFDEF MSWINDOWS} db $C5, $FA, $6F, $34, $24 // vmovdqu xmm6, oword [rsp] add rsp, 16 -{$ENDIF} db $C5, $F8, $77 // vzeroupper diff --git a/HashLib/src/Include/Simd/Blake2S/Blake2SCompressSse2_i386.inc b/HashLib/src/Include/Simd/Blake2S/Blake2SCompressSse2_i386.inc index 7fcf89f..4cc6e3b 100644 --- a/HashLib/src/Include/Simd/Blake2S/Blake2SCompressSse2_i386.inc +++ b/HashLib/src/Include/Simd/Blake2S/Blake2SCompressSse2_i386.inc @@ -1,7 +1,7 @@ // SSE2 implementation of BLAKE2s compress (fully unrolled 10 rounds). // IA-32: after SimdProc4Begin_i386 — ebx = state, esi = msg, edi = counter+flags, eax = IV // (parallel to MS x64 ABI: rcx, rdx, r8, r9). -// Uses xmm0–xmm6. Non-volatile xmm6 saved/restored on Windows (MSWINDOWS). +// Uses xmm0–xmm6; xmm6 saved/restored defensively (volatile on i386). // Register map: xmm0 = a (v0-3), xmm1 = b (v4-7), xmm2 = c (v8-11), xmm3 = d (v12-15), // xmm4 = message buffer, xmm5 = temp, xmm6 = temp (msg load). // Rotations: ROT16/12/8/7 via psrld/pslld/por. @@ -9,10 +9,8 @@ // Message loads via movd + punpcklqdq + shufps. // Reference: BLAKE2/BLAKE2 sse/ by Samuel Neves. -{$IFDEF MSWINDOWS} sub esp, 16 movdqu oword ptr [esp], xmm6 -{$ENDIF} // Initialize working vector movdqu xmm0, oword ptr [ebx] @@ -1076,10 +1074,8 @@ movdqu oword ptr [ebx], xmm0 movdqu oword ptr [ebx + $10], xmm1 -{$IFDEF MSWINDOWS} movdqu xmm6, oword ptr [esp] add esp, 16 -{$ENDIF} pop edi pop esi pop ebx diff --git a/HashLib/src/Include/Simd/Blake2S/Blake2SCompressSse2_x86_64.inc b/HashLib/src/Include/Simd/Blake2S/Blake2SCompressSse2_x86_64.inc index 3f9c415..0d1cd36 100644 --- a/HashLib/src/Include/Simd/Blake2S/Blake2SCompressSse2_x86_64.inc +++ b/HashLib/src/Include/Simd/Blake2S/Blake2SCompressSse2_x86_64.inc @@ -1,6 +1,6 @@ // SSE2 implementation of BLAKE2s compress (fully unrolled 10 rounds). // Expects MS x64 ABI: rcx = state ptr, rdx = msg ptr, r8 = counter+flags ptr, r9 = IV ptr. -// Uses xmm0-xmm6. Non-volatile xmm6 saved/restored on Windows. +// Uses xmm0-xmm6; xmm6 is MS x64 non-volatile (saved/restored). // Register map: xmm0 = a (v0-3), xmm1 = b (v4-7), xmm2 = c (v8-11), xmm3 = d (v12-15), // xmm4 = message buffer, xmm5 = temp, xmm6 = temp (msg load). // Rotations: ROT16/12/8/7 via psrld/pslld/por. @@ -8,10 +8,8 @@ // Message loads via movd + punpcklqdq + shufps. // Reference: BLAKE2/BLAKE2 sse/ by Samuel Neves. -{$IFDEF MSWINDOWS} sub rsp, 16 movdqu oword [rsp], xmm6 -{$ENDIF} // Initialize working vector movdqu xmm0, oword [rcx] @@ -1075,7 +1073,5 @@ movdqu oword [rcx], xmm0 movdqu oword [rcx + $10], xmm1 -{$IFDEF MSWINDOWS} movdqu xmm6, oword [rsp] add rsp, 16 -{$ENDIF} diff --git a/HashLib/src/Include/Simd/Blake3/Blake3CompressAvx2_x86_64.inc b/HashLib/src/Include/Simd/Blake3/Blake3CompressAvx2_x86_64.inc index 23003a0..8f6ba71 100644 --- a/HashLib/src/Include/Simd/Blake3/Blake3CompressAvx2_x86_64.inc +++ b/HashLib/src/Include/Simd/Blake3/Blake3CompressAvx2_x86_64.inc @@ -8,8 +8,14 @@ // 3-operand VEX form eliminates movdqa copies needed in SSE2 rotations. // Blend emulations use vshufps+vpshufd (blend_0xCC) and vpunpckhdq+vshufps (blend_0xC0), // avoiding the need for mask constants. +// +// MS x64 non-volatile saves: xmm6, xmm7, xmm8, xmm9, xmm11, xmm14. +// +// Stack layout (sub rsp, 120): +// [rsp + 0.. 95]: xmm6/xmm7/xmm8/xmm9/xmm11/xmm14 save area (6 * 16) +// [rsp + 96..111]: IV staging area (loaded into xmm2) +// [rsp +112..119]: padding -{$IFDEF MSWINDOWS} sub rsp, 120 db $C5, $F9, $7F, $34, $24 // vmovdqa oword [rsp], xmm6 db $C5, $F9, $7F, $7C, $24, $10 // vmovdqa oword [rsp + $10], xmm7 @@ -27,19 +33,6 @@ mov eax, $A54FF53A mov dword ptr [rsp + $6C], eax db $C5, $F9, $6F, $54, $24, $60 // vmovdqa xmm2, oword [rsp + $60] -{$ELSE} - sub rsp, 24 - - mov eax, $6A09E667 - mov dword ptr [rsp], eax - mov eax, $BB67AE85 - mov dword ptr [rsp + $04], eax - mov eax, $3C6EF372 - mov dword ptr [rsp + $08], eax - mov eax, $A54FF53A - mov dword ptr [rsp + $0C], eax - db $C5, $F9, $6F, $14, $24 // vmovdqa xmm2, oword [rsp] -{$ENDIF} // Initialize state db $C4, $C1, $7A, $6F, $00 // vmovdqu xmm0, oword [r8] @@ -593,7 +586,6 @@ db $C5, $FA, $7F, $51, $20 // vmovdqu oword [rcx + $20], xmm2 db $C5, $FA, $7F, $59, $30 // vmovdqu oword [rcx + $30], xmm3 -{$IFDEF MSWINDOWS} db $C5, $F9, $6F, $34, $24 // vmovdqa xmm6, oword [rsp] db $C5, $F9, $6F, $7C, $24, $10 // vmovdqa xmm7, oword [rsp + $10] db $C5, $79, $6F, $44, $24, $20 // vmovdqa xmm8, oword [rsp + $20] @@ -601,6 +593,3 @@ db $C5, $79, $6F, $5C, $24, $40 // vmovdqa xmm11, oword [rsp + $40] db $C5, $79, $6F, $74, $24, $50 // vmovdqa xmm14, oword [rsp + $50] add rsp, 120 -{$ELSE} - add rsp, 24 -{$ENDIF} diff --git a/HashLib/src/Include/Simd/Blake3/Blake3CompressSse2_x86_64.inc b/HashLib/src/Include/Simd/Blake3/Blake3CompressSse2_x86_64.inc index 51b1a5d..36a2821 100644 --- a/HashLib/src/Include/Simd/Blake3/Blake3CompressSse2_x86_64.inc +++ b/HashLib/src/Include/Simd/Blake3/Blake3CompressSse2_x86_64.inc @@ -8,8 +8,14 @@ // Message scheduling: all 16 words kept in xmm4-7, permuted between rounds using // shufps/pshufd (blend_0xCC emulated via shufps+pshufd, blend_0xC0 via punpckhdq+shufps). // Diagonalize/Undiagonalize via pshufd (row1 unrotated, per sneves optimization). +// +// MS x64 non-volatile saves: xmm6, xmm7, xmm8, xmm9, xmm11, xmm14. +// +// Stack layout (sub rsp, 120): +// [rsp + 0.. 95]: xmm6/xmm7/xmm8/xmm9/xmm11/xmm14 save area (6 * 16) +// [rsp + 96..111]: IV staging area (loaded into xmm2) +// [rsp +112..119]: padding -{$IFDEF MSWINDOWS} sub rsp, 120 movdqa oword [rsp], xmm6 movdqa oword [rsp + $10], xmm7 @@ -27,19 +33,6 @@ mov eax, $A54FF53A mov dword ptr [rsp + $6C], eax movdqa xmm2, oword [rsp + $60] -{$ELSE} - sub rsp, 24 - - mov eax, $6A09E667 - mov dword ptr [rsp], eax - mov eax, $BB67AE85 - mov dword ptr [rsp + $04], eax - mov eax, $3C6EF372 - mov dword ptr [rsp + $08], eax - mov eax, $A54FF53A - mov dword ptr [rsp + $0C], eax - movdqa xmm2, oword [rsp] -{$ENDIF} // Initialize state: a=CV[0..3], b=CV[4..7], c=IV[0..3], d=CounterFlags movdqu xmm0, oword [r8] @@ -646,7 +639,6 @@ movdqu oword [rcx + $20], xmm2 movdqu oword [rcx + $30], xmm3 -{$IFDEF MSWINDOWS} movdqa xmm6, oword [rsp] movdqa xmm7, oword [rsp + $10] movdqa xmm8, oword [rsp + $20] @@ -654,6 +646,3 @@ movdqa xmm11, oword [rsp + $40] movdqa xmm14, oword [rsp + $50] add rsp, 120 -{$ELSE} - add rsp, 24 -{$ENDIF} diff --git a/HashLib/src/Include/Simd/Blake3/Blake3Hash4Sse2_x86_64.inc b/HashLib/src/Include/Simd/Blake3/Blake3Hash4Sse2_x86_64.inc index 56f3c5b..bbfc6cd 100644 --- a/HashLib/src/Include/Simd/Blake3/Blake3Hash4Sse2_x86_64.inc +++ b/HashLib/src/Include/Simd/Blake3/Blake3Hash4Sse2_x86_64.inc @@ -12,11 +12,12 @@ // GPR: rcx = input pointer, rbx = block counter (0..15), rbp = block byte offset. // Message transpose: 4x4 via punpckldq/punpckhdq + punpcklqdq/punpckhqdq (pure SSE2). // Rotations: rot16 via pshuflw+pshufhw; rot12/8/7 via pslld/psrld/por. +// +// MS x64 non-volatile saves: xmm6-xmm13. push rbx push rbp -{$IFDEF MSWINDOWS} sub rsp, 712 movdqa oword [rsp + 576], xmm6 movdqa oword [rsp + 592], xmm7 @@ -26,9 +27,6 @@ movdqa oword [rsp + 656], xmm11 movdqa oword [rsp + 672], xmm12 movdqa oword [rsp + 688], xmm13 -{$ELSE} - sub rsp, 584 -{$ENDIF} mov qword ptr [rsp + 560], r8 mov dword ptr [rsp + 568], r11d @@ -1834,7 +1832,6 @@ movdqu oword [rax + 80], xmm4 movdqu oword [rax + 112], xmm3 -{$IFDEF MSWINDOWS} movdqa xmm6, oword [rsp + 576] movdqa xmm7, oword [rsp + 592] movdqa xmm8, oword [rsp + 608] @@ -1844,9 +1841,6 @@ movdqa xmm12, oword [rsp + 672] movdqa xmm13, oword [rsp + 688] add rsp, 712 -{$ELSE} - add rsp, 584 -{$ENDIF} pop rbp pop rbx diff --git a/HashLib/src/Include/Simd/Blake3/Blake3Hash8Avx2_x86_64.inc b/HashLib/src/Include/Simd/Blake3/Blake3Hash8Avx2_x86_64.inc index 832f988..3127f49 100644 --- a/HashLib/src/Include/Simd/Blake3/Blake3Hash8Avx2_x86_64.inc +++ b/HashLib/src/Include/Simd/Blake3/Blake3Hash8Avx2_x86_64.inc @@ -15,6 +15,8 @@ // Output transpose: full 8x8 via vperm2i128 + vpunpckl/hdq + vpunpcklq/hqdq. // 3-operand VEX form eliminates movdqa copies needed in SSE2 rotations. // Stack 32-byte aligned via "and rsp, -32" for YMM movdqu stores. +// +// MS x64 non-volatile saves: xmm6-xmm13. push rbx push rbp @@ -23,7 +25,6 @@ mov r12, rsp and rsp, -32 -{$IFDEF MSWINDOWS} sub rsp, 1280 db $C5, $FA, $7F, $B4, $24, $80, $04, $00, $00 // vmovdqu [rsp+1152], xmm6 db $C5, $FA, $7F, $BC, $24, $90, $04, $00, $00 // vmovdqu [rsp+1168], xmm7 @@ -33,9 +34,6 @@ db $C5, $7A, $7F, $9C, $24, $D0, $04, $00, $00 // vmovdqu [rsp+1232], xmm11 db $C5, $7A, $7F, $A4, $24, $E0, $04, $00, $00 // vmovdqu [rsp+1248], xmm12 db $C5, $7A, $7F, $AC, $24, $F0, $04, $00, $00 // vmovdqu [rsp+1264], xmm13 -{$ELSE} - sub rsp, 1152 -{$ENDIF} mov qword ptr [rsp + 1136], r12 mov qword ptr [rsp + 1120], r8 @@ -1679,7 +1677,6 @@ db $C5, $7E, $7F, $A0, $C0, $00, $00, $00 // vmovdqu [rax+192], ymm12 db $C5, $7E, $7F, $A8, $E0, $00, $00, $00 // vmovdqu [rax+224], ymm13 -{$IFDEF MSWINDOWS} db $C5, $FA, $6F, $B4, $24, $80, $04, $00, $00 // vmovdqu xmm6, [rsp+1152] db $C5, $FA, $6F, $BC, $24, $90, $04, $00, $00 // vmovdqu xmm7, [rsp+1168] db $C5, $7A, $6F, $84, $24, $A0, $04, $00, $00 // vmovdqu xmm8, [rsp+1184] @@ -1688,7 +1685,6 @@ db $C5, $7A, $6F, $9C, $24, $D0, $04, $00, $00 // vmovdqu xmm11, [rsp+1232] db $C5, $7A, $6F, $A4, $24, $E0, $04, $00, $00 // vmovdqu xmm12, [rsp+1248] db $C5, $7A, $6F, $AC, $24, $F0, $04, $00, $00 // vmovdqu xmm13, [rsp+1264] -{$ENDIF} db $C5, $F8, $77 // vzeroupper diff --git a/HashLib/src/Include/Simd/Common/SimdProc1Begin_i386.inc b/HashLib/src/Include/Simd/Common/SimdProc1Begin_i386.inc index 8c8352c..bab2398 100644 --- a/HashLib/src/Include/Simd/Common/SimdProc1Begin_i386.inc +++ b/HashLib/src/Include/Simd/Common/SimdProc1Begin_i386.inc @@ -1,6 +1,6 @@ // Shared SIMD procedure prologue for 1-parameter assembly functions. // After inclusion: ebx = param1 (from EAX at entry). -// FPC: Windows and Unix i386 use the same parameter entry layout. +// FPC and Delphi Win32 share the same i386 register convention here. // Usage: // procedure MyProc(P1: Pointer); // {$I SimdProc1Begin_i386.inc} @@ -8,9 +8,6 @@ // end; {$IFDEF FPC} assembler; nostackframe; +{$ENDIF} asm mov ebx, eax -{$ELSE} -asm - mov ebx, eax -{$ENDIF} diff --git a/HashLib/src/Include/Simd/Common/SimdProc1Begin_x86_64.inc b/HashLib/src/Include/Simd/Common/SimdProc1Begin_x86_64.inc index 84f1981..c14cf0f 100644 --- a/HashLib/src/Include/Simd/Common/SimdProc1Begin_x86_64.inc +++ b/HashLib/src/Include/Simd/Common/SimdProc1Begin_x86_64.inc @@ -1,6 +1,6 @@ // Shared SIMD procedure prologue for 1-parameter assembly functions. // After inclusion: rcx = param1 (MS x64 ABI). -// On FPC non-Windows (Unix ABI), remaps rdi -> rcx. +// When HASHLIB_SYSV_X64_ABI is defined, remaps rdi -> rcx. // Usage: // procedure MyProc(P1: Pointer); // {$I SimdProc1Begin_x86_64.inc} @@ -8,11 +8,11 @@ // end; {$IFDEF FPC} assembler; nostackframe; +{$ENDIF} asm - {$IFNDEF MSWINDOWS} - mov rcx, rdi - {$ENDIF} -{$ELSE} -asm +{$IFNDEF FPC} .noframe {$ENDIF} +{$IFDEF HASHLIB_SYSV_X64_ABI} + mov rcx, rdi +{$ENDIF} diff --git a/HashLib/src/Include/Simd/Common/SimdProc2Begin_i386.inc b/HashLib/src/Include/Simd/Common/SimdProc2Begin_i386.inc index 2d0fc8d..68070e5 100644 --- a/HashLib/src/Include/Simd/Common/SimdProc2Begin_i386.inc +++ b/HashLib/src/Include/Simd/Common/SimdProc2Begin_i386.inc @@ -1,6 +1,6 @@ // Shared SIMD procedure prologue for 2-parameter assembly functions. -// After inclusion: ebx = param1, esi = param2 (EAX, EDX at entry; callee saves EBX, ESI). -// FPC: Windows and Unix i386 use the same parameter entry layout. +// After inclusion: ebx = param1, esi = param2 (EAX, EDX at entry). +// Callee-saved EBX, ESI - epilogue must pop ESI, EBX. // Usage: // procedure MyProc(P1, P2: Pointer); // {$I SimdProc2Begin_i386.inc} @@ -8,15 +8,9 @@ // end; {$IFDEF FPC} assembler; nostackframe; +{$ENDIF} asm push ebx push esi mov ebx, eax mov esi, edx -{$ELSE} -asm - push ebx - push esi - mov ebx, eax - mov esi, edx -{$ENDIF} diff --git a/HashLib/src/Include/Simd/Common/SimdProc2Begin_x86_64.inc b/HashLib/src/Include/Simd/Common/SimdProc2Begin_x86_64.inc index d0a2ef6..9241845 100644 --- a/HashLib/src/Include/Simd/Common/SimdProc2Begin_x86_64.inc +++ b/HashLib/src/Include/Simd/Common/SimdProc2Begin_x86_64.inc @@ -1,6 +1,6 @@ // Shared SIMD procedure prologue for 2-parameter assembly functions. // After inclusion: rcx = param1, rdx = param2 (MS x64 ABI). -// On FPC non-Windows (Unix ABI), remaps rdi,rsi -> rcx,rdx. +// When HASHLIB_SYSV_X64_ABI is defined, remaps rdi,rsi -> rcx,rdx. // Usage: // procedure MyProc(P1, P2: Pointer); // {$I SimdProc2Begin_x86_64.inc} @@ -8,12 +8,12 @@ // end; {$IFDEF FPC} assembler; nostackframe; +{$ENDIF} asm - {$IFNDEF MSWINDOWS} +{$IFNDEF FPC} + .noframe +{$ENDIF} +{$IFDEF HASHLIB_SYSV_X64_ABI} mov rdx, rsi mov rcx, rdi - {$ENDIF} -{$ELSE} -asm - .noframe {$ENDIF} diff --git a/HashLib/src/Include/Simd/Common/SimdProc3Begin_i386.inc b/HashLib/src/Include/Simd/Common/SimdProc3Begin_i386.inc index 0905e4e..6e64d81 100644 --- a/HashLib/src/Include/Simd/Common/SimdProc3Begin_i386.inc +++ b/HashLib/src/Include/Simd/Common/SimdProc3Begin_i386.inc @@ -1,6 +1,6 @@ // Shared SIMD procedure prologue for 3-parameter assembly functions. // After inclusion: ebx = param1, esi = param2, edi = param3 (EAX, EDX, ECX at entry). -// FPC: Windows and Unix i386 use the same parameter entry layout. +// Callee-saved EBX, ESI, EDI - epilogue must pop EDI, ESI, EBX. // Usage: // procedure MyProc(P1, P2, P3: Pointer); // {$I SimdProc3Begin_i386.inc} @@ -8,6 +8,7 @@ // end; {$IFDEF FPC} assembler; nostackframe; +{$ENDIF} asm push ebx push esi @@ -15,12 +16,3 @@ asm mov ebx, eax mov esi, edx mov edi, ecx -{$ELSE} -asm - push ebx - push esi - push edi - mov ebx, eax - mov esi, edx - mov edi, ecx -{$ENDIF} diff --git a/HashLib/src/Include/Simd/Common/SimdProc3Begin_x86_64.inc b/HashLib/src/Include/Simd/Common/SimdProc3Begin_x86_64.inc index 1be2d2d..72a725a 100644 --- a/HashLib/src/Include/Simd/Common/SimdProc3Begin_x86_64.inc +++ b/HashLib/src/Include/Simd/Common/SimdProc3Begin_x86_64.inc @@ -1,6 +1,6 @@ // Shared SIMD procedure prologue for 3-parameter assembly functions. // After inclusion: rcx = param1, rdx = param2, r8 = param3 (MS x64 ABI). -// On FPC non-Windows (Unix ABI), remaps rdi,rsi,rdx -> rcx,rdx,r8. +// When HASHLIB_SYSV_X64_ABI is defined, remaps rdi,rsi,rdx -> rcx,rdx,r8. // Move order avoids register clobbering: save rdx before overwriting. // Usage: // procedure MyProc(P1, P2, P3: Pointer); @@ -9,13 +9,13 @@ // end; {$IFDEF FPC} assembler; nostackframe; +{$ENDIF} asm - {$IFNDEF MSWINDOWS} +{$IFNDEF FPC} + .noframe +{$ENDIF} +{$IFDEF HASHLIB_SYSV_X64_ABI} mov r8, rdx mov rdx, rsi mov rcx, rdi - {$ENDIF} -{$ELSE} -asm - .noframe {$ENDIF} diff --git a/HashLib/src/Include/Simd/Common/SimdProc4Begin_i386.inc b/HashLib/src/Include/Simd/Common/SimdProc4Begin_i386.inc index 1caaef1..f1643b3 100644 --- a/HashLib/src/Include/Simd/Common/SimdProc4Begin_i386.inc +++ b/HashLib/src/Include/Simd/Common/SimdProc4Begin_i386.inc @@ -1,8 +1,10 @@ // Shared SIMD procedure prologue for 4-parameter assembly functions. // After inclusion: ebx = param1, esi = param2, edi = param3, eax = param4 -// (first three in EAX, EDX, ECX at entry; param4 from stack after callee saves). -// FPC: Windows and Unix i386 use the same stack tail layout; see asm for ESP offsets. -// Delphi: MS Windows uses [ebp+8] for param4; see asm for other Delphi targets. +// (first three in EAX, EDX, ECX at entry; param4 from stack). +// Callee-saved EBX, ESI, EDI - epilogue must pop EDI, ESI, EBX. +// Delphi i386 only ships on Windows; param4 lives at [ebp + 8]. +// FPC i386 has no stack frame; param4 lives at [esp + 8] when read after +// the EBX push but before the ESI / EDI pushes (see asm). // Usage: // procedure MyProc(P1, P2, P3: Pointer; P4: Int32); // {$I SimdProc4Begin_i386.inc} @@ -20,7 +22,6 @@ asm mov edi, ecx {$ELSE} asm - {$IFDEF MSWINDOWS} push ebx mov ebx, eax mov eax, dword ptr [ebp + 8] @@ -28,13 +29,4 @@ asm mov esi, edx push edi mov edi, ecx - {$ELSE} - push ebx - push esi - push edi - mov ebx, dword ptr [ebp + 8] - mov esi, dword ptr [ebp + 12] - mov edi, dword ptr [ebp + 16] - mov eax, dword ptr [ebp + 20] - {$ENDIF} {$ENDIF} diff --git a/HashLib/src/Include/Simd/Common/SimdProc4Begin_x86_64.inc b/HashLib/src/Include/Simd/Common/SimdProc4Begin_x86_64.inc index be70bb1..c40d788 100644 --- a/HashLib/src/Include/Simd/Common/SimdProc4Begin_x86_64.inc +++ b/HashLib/src/Include/Simd/Common/SimdProc4Begin_x86_64.inc @@ -1,7 +1,7 @@ // Shared SIMD procedure prologue for 4-parameter assembly functions. // After inclusion: rcx = param1, rdx = param2, r8 = param3, r9 = param4 // (MS x64 ABI). -// On FPC non-Windows (Unix ABI), remaps rdi,rsi,rdx,rcx -> rcx,rdx,r8,r9. +// When HASHLIB_SYSV_X64_ABI is defined, remaps rdi,rsi,rdx,rcx -> rcx,rdx,r8,r9. // Move order avoids register clobbering: save rcx and rdx first. // Usage: // procedure MyProc(P1, P2, P3: Pointer; P4: Int32); @@ -10,14 +10,14 @@ // end; {$IFDEF FPC} assembler; nostackframe; +{$ENDIF} asm - {$IFNDEF MSWINDOWS} +{$IFNDEF FPC} + .noframe +{$ENDIF} +{$IFDEF HASHLIB_SYSV_X64_ABI} mov r9, rcx mov r8, rdx mov rdx, rsi mov rcx, rdi - {$ENDIF} -{$ELSE} -asm - .noframe {$ENDIF} diff --git a/HashLib/src/Include/Simd/Common/SimdProc5Begin_i386.inc b/HashLib/src/Include/Simd/Common/SimdProc5Begin_i386.inc index 7092f3d..3b716b3 100644 --- a/HashLib/src/Include/Simd/Common/SimdProc5Begin_i386.inc +++ b/HashLib/src/Include/Simd/Common/SimdProc5Begin_i386.inc @@ -2,7 +2,7 @@ // After inclusion: ebx = param1, esi = param2, edi = param3, eax = param4, ecx = param5 // (i386 register convention). // FPC: param4 and param5 are at [esp+20] and [esp+16] after push EBX/ESI/EDI (see asm). -// Delphi: MS Windows uses [ebp+12] and [ebp+8] for param4 and param5; see asm for other Delphi targets. +// Delphi Win32: first 3 params in EAX/EDX/ECX; param4 at [ebp+12], param5 at [ebp+8]. // Callee-saved EBX, ESI, EDI — epilogue must pop EDI, ESI, EBX. // Usage: // procedure MyProc(P1, P2: Pointer; P3, P4: Int32; P5: Pointer); @@ -22,7 +22,6 @@ asm mov ecx, dword ptr [esp + 16] // param5 {$ELSE} asm - {$IFDEF MSWINDOWS} push ebx push esi push edi @@ -31,14 +30,4 @@ asm mov edi, ecx mov eax, dword ptr [ebp + 12] // param4 (first stack param = high address) mov ecx, dword ptr [ebp + 8] // param5 (last stack param) - {$ELSE} - push ebx - push esi - push edi - mov ebx, dword ptr [ebp + 8] - mov esi, dword ptr [ebp + 12] - mov edi, dword ptr [ebp + 16] - mov eax, dword ptr [ebp + 20] - mov ecx, dword ptr [ebp + 24] - {$ENDIF} {$ENDIF} diff --git a/HashLib/src/Include/Simd/Common/SimdProc5Begin_x86_64.inc b/HashLib/src/Include/Simd/Common/SimdProc5Begin_x86_64.inc index ffa5485..82efa29 100644 --- a/HashLib/src/Include/Simd/Common/SimdProc5Begin_x86_64.inc +++ b/HashLib/src/Include/Simd/Common/SimdProc5Begin_x86_64.inc @@ -1,9 +1,8 @@ // Shared SIMD procedure prologue for 5-parameter assembly functions. // After inclusion: rcx = param1, rdx = param2, r8 = param3, r9 = param4, r10 = param5 // (MS x64 ABI). -// On FPC non-Windows (Unix ABI), remaps rdi,rsi,rdx,rcx,r8 -> rcx,rdx,r8,r9,r10. -// Move order avoids register clobbering. -// On MS x64, param5 is loaded from [rsp+40] (after shadow space). +// When HASHLIB_SYSV_X64_ABI is defined, remaps rdi,rsi,rdx,rcx,r8 -> rcx,rdx,r8,r9,r10. +// On MS x64, param5 is loaded from [rsp+40] (above 32-byte shadow space + 8-byte return addr). // Usage: // procedure MyProc(P1, P2: Pointer; P3, P4: Int32; P5: Pointer); // {$I SimdProc5Begin_x86_64.inc} @@ -11,18 +10,17 @@ // end; {$IFDEF FPC} assembler; nostackframe; +{$ENDIF} asm - {$IFNDEF MSWINDOWS} +{$IFNDEF FPC} + .noframe +{$ENDIF} +{$IFDEF HASHLIB_SYSV_X64_ABI} mov r10, r8 mov r9, rcx mov r8, rdx mov rdx, rsi mov rcx, rdi - {$ELSE} - mov r10, [rsp + 40] - {$ENDIF} {$ELSE} -asm - .noframe mov r10, [rsp + 40] {$ENDIF} diff --git a/HashLib/src/Include/Simd/Common/SimdProc6Begin_i386.inc b/HashLib/src/Include/Simd/Common/SimdProc6Begin_i386.inc index bf75ef7..d578d92 100644 --- a/HashLib/src/Include/Simd/Common/SimdProc6Begin_i386.inc +++ b/HashLib/src/Include/Simd/Common/SimdProc6Begin_i386.inc @@ -2,7 +2,8 @@ // After inclusion: ebx = param1, esi = param2, edi = param3, eax = param4, // ecx = param5 low, edx = param5 high (UInt64 split); param6 is not loaded in this prologue. // FPC: param4 at [esp+28]; param5 at [esp+20]/[esp+24]; param6 at [esp+16] after push EBX/ESI/EDI (see asm). -// Delphi: MS Windows: [ebp+20]=param4, [ebp+12]/[ebp+16]=param5, [ebp+8]=param6; see asm for other Delphi targets. +// Delphi Win32: first 3 params in EAX/EDX/ECX; param4 at [ebp+20], +// param5 at [ebp+12]/[ebp+16], param6 at [ebp+8]. // Callee-saved EBX, ESI, EDI — epilogue must pop EDI, ESI, EBX; SIMD bodies may adjust ESP for locals. // Usage: // procedure MyProc(P1, P2, P3: Pointer; P4: Int32; P5: UInt64; P6: UInt32); @@ -23,7 +24,6 @@ asm mov edx, dword ptr [esp + 24] // param5_hi {$ELSE} asm - {$IFDEF MSWINDOWS} push ebx push esi push edi @@ -33,15 +33,4 @@ asm mov eax, dword ptr [ebp + 20] // param4 (first stack param = high address) mov ecx, dword ptr [ebp + 12] // param5_lo mov edx, dword ptr [ebp + 16] // param5_hi - {$ELSE} - push ebx - push esi - push edi - mov ebx, eax - mov esi, edx - mov edi, ecx - mov eax, dword ptr [ebp + 20] - mov ecx, dword ptr [ebp + 12] - mov edx, dword ptr [ebp + 16] - {$ENDIF} {$ENDIF} diff --git a/HashLib/src/Include/Simd/Common/SimdProc6Begin_x86_64.inc b/HashLib/src/Include/Simd/Common/SimdProc6Begin_x86_64.inc index 2e3ac81..820486b 100644 --- a/HashLib/src/Include/Simd/Common/SimdProc6Begin_x86_64.inc +++ b/HashLib/src/Include/Simd/Common/SimdProc6Begin_x86_64.inc @@ -1,9 +1,9 @@ // Shared SIMD procedure prologue for 6-parameter assembly functions. // After inclusion: rcx = param1, rdx = param2, r8 = param3, r9 = param4, // r10 = param5, r11 = param6 (MS x64 ABI layout). -// On FPC non-Windows (Unix ABI), remaps rdi,rsi,rdx,rcx,r8,r9 -> -// rcx,rdx,r8,r9,r10,r11. Move order avoids register clobbering. -// On MS x64, param5/6 are loaded from [rsp+40]/[rsp+48] (after shadow space). +// When HASHLIB_SYSV_X64_ABI is defined, remaps rdi,rsi,rdx,rcx,r8,r9 -> +// rcx,rdx,r8,r9,r10,r11. +// On MS x64, param5/6 are loaded from [rsp+40]/[rsp+48] (above shadow space + return addr). // Usage: // procedure MyProc(P1, P2, P3: Pointer; P4: Int32; P5: UInt64; P6: UInt32); // {$I SimdProc6Begin_x86_64.inc} @@ -11,21 +11,19 @@ // end; {$IFDEF FPC} assembler; nostackframe; - asm - {$IFNDEF MSWINDOWS} +{$ENDIF} +asm +{$IFNDEF FPC} + .noframe +{$ENDIF} +{$IFDEF HASHLIB_SYSV_X64_ABI} mov r11, r9 mov r10, r8 mov r9, rcx mov r8, rdx mov rdx, rsi mov rcx, rdi - {$ELSE} - mov r10, [rsp + 40] - mov r11, [rsp + 48] - {$ENDIF} {$ELSE} - asm - .noframe mov r10, [rsp + 40] mov r11, [rsp + 48] {$ENDIF} diff --git a/HashLib/src/Include/Simd/CpuFeatures/CpuIdQuery.inc b/HashLib/src/Include/Simd/CpuFeatures/CpuIdQuery.inc index f633d77..7e0af22 100644 --- a/HashLib/src/Include/Simd/CpuFeatures/CpuIdQuery.inc +++ b/HashLib/src/Include/Simd/CpuFeatures/CpuIdQuery.inc @@ -1,8 +1,10 @@ // CPUID query: executes CPUID with leaf=ALeaf, subleaf=ASubLeaf, // stores EAX/EBX/ECX/EDX into the TCpuIdResult record at AResult. -// IA-32 (HASHLIB_I386_ASM): preserves EBX; ESI = AResult, ECX = ASubLeaf, EAX = ALeaf. -// x86-64: preserves RBX; MS ABI ecx=ALeaf, edx=ASubLeaf, r8=AResult; -// FPC non-Windows remaps edi, esi, rdx -> leaf, subleaf, result. +// IA-32 fastcall entry: EAX=ALeaf, EDX=ASubLeaf, ECX=AResult. +// x86-64 entry (per OS ABI): +// Windows (MS ABI): RCX=ALeaf, RDX=ASubLeaf, R8=AResult. +// Unix (SysV ABI): RDI=ALeaf, RSI=ASubLeaf, RDX=AResult. +// CPUID clobbers EBX, so RBX is preserved across the call. {$IFDEF FPC} assembler; nostackframe; {$ENDIF} @@ -10,9 +12,9 @@ asm {$IFDEF HASHLIB_I386_ASM} push ebx push esi - mov esi, ecx - mov ecx, edx - cpuid + mov esi, ecx // save AResult (was in ECX) across cpuid + mov ecx, edx // ECX := ASubLeaf + cpuid // EAX already holds ALeaf mov [esi], eax mov [esi + 4], ebx mov [esi + 8], ecx @@ -20,35 +22,29 @@ asm pop esi pop ebx {$ELSE} + // x86-64: normalize to EAX=ALeaf, ECX=ASubLeaf, R8=AResult, then cpuid. + // Preserve RBX (CPUID clobbers it). {$IFDEF FPC} push rbx - {$IFDEF MSWINDOWS} - mov eax, ecx - mov ecx, edx - cpuid - mov dword ptr [r8], eax - mov dword ptr [r8 + 4], ebx - mov dword ptr [r8 + 8], ecx - mov dword ptr [r8 + 12], edx {$ELSE} - mov eax, edi - mov ecx, esi - mov r8, rdx - cpuid - mov dword ptr [r8], eax - mov dword ptr [r8 + 4], ebx - mov dword ptr [r8 + 8], ecx - mov dword ptr [r8 + 12], edx + .PUSHNV RBX {$ENDIF} - pop rbx + {$IFDEF HASHLIB_SYSV_X64_ABI} + // SysV ABI entry: RDI=ALeaf, RSI=ASubLeaf, RDX=AResult. + mov eax, edi // EAX := ALeaf + mov ecx, esi // ECX := ASubLeaf + mov r8, rdx // R8 := AResult {$ELSE} - .PUSHNV RBX - mov eax, ecx - mov ecx, edx + // MS ABI entry: RCX=ALeaf, RDX=ASubLeaf, R8=AResult. + mov eax, ecx // EAX := ALeaf + mov ecx, edx // ECX := ASubLeaf + {$ENDIF} cpuid mov dword ptr [r8], eax mov dword ptr [r8 + 4], ebx mov dword ptr [r8 + 8], ecx mov dword ptr [r8 + 12], edx + {$IFDEF FPC} + pop rbx {$ENDIF} {$ENDIF} diff --git a/HashLib/src/Include/Simd/CpuFeatures/XGetBvQuery.inc b/HashLib/src/Include/Simd/CpuFeatures/XGetBvQuery.inc index 97ac9b0..726dace 100644 --- a/HashLib/src/Include/Simd/CpuFeatures/XGetBvQuery.inc +++ b/HashLib/src/Include/Simd/CpuFeatures/XGetBvQuery.inc @@ -1,36 +1,36 @@ // XGETBV query: executes XGETBV with ECX=0, stores EAX:EDX into the // UInt64 at AResult. -// IA-32: AResult in EAX at asm entry (Windows and Unix). -// x86-64: MS ABI rcx=AResult; FPC non-Windows remaps rdi -> r8. +// IA-32 fastcall entry: EAX=AResult. +// x86-64 entry (per OS ABI): +// Windows (MS ABI): RCX=AResult. +// Unix (SysV ABI): RDI=AResult. +// XGETBV does not clobber RBX, so no preservation is needed on x86-64. {$IFDEF FPC} assembler; nostackframe; {$ENDIF} asm {$IFDEF HASHLIB_I386_ASM} push ebx - mov ebx, eax - xor ecx, ecx - db $0F, $01, $D0 // xgetbv + mov ebx, eax // save AResult (was in EAX) across xgetbv + xor ecx, ecx // XCR index 0 + db $0F, $01, $D0 // xgetbv -> EDX:EAX mov [ebx], eax mov [ebx + 4], edx pop ebx {$ELSE} - {$IFDEF FPC} - {$IFDEF MSWINDOWS} - mov r8, rcx - {$ELSE} - mov r8, rdi + // x86-64: normalize to R8=AResult, then xgetbv. + {$IFNDEF FPC} + .noframe {$ENDIF} - xor ecx, ecx - db $0F, $01, $D0 // xgetbv - mov dword ptr [r8], eax - mov dword ptr [r8 + 4], edx + {$IFDEF HASHLIB_SYSV_X64_ABI} + // SysV ABI entry: RDI=AResult. + mov r8, rdi // R8 := AResult {$ELSE} - .noframe - mov r8, rcx - xor ecx, ecx - db $0F, $01, $D0 // xgetbv + // MS ABI entry: RCX=AResult. + mov r8, rcx // R8 := AResult + {$ENDIF} + xor ecx, ecx // XCR index 0 + db $0F, $01, $D0 // xgetbv -> EDX:EAX mov dword ptr [r8], eax mov dword ptr [r8 + 4], edx - {$ENDIF} {$ENDIF} diff --git a/HashLib/src/Include/Simd/SHA1/SHA1CompressAvx2_x86_64.inc b/HashLib/src/Include/Simd/SHA1/SHA1CompressAvx2_x86_64.inc index f50a38a..6f2e923 100644 --- a/HashLib/src/Include/Simd/SHA1/SHA1CompressAvx2_x86_64.inc +++ b/HashLib/src/Include/Simd/SHA1/SHA1CompressAvx2_x86_64.inc @@ -16,12 +16,10 @@ sub rsp, 440 -{$IFDEF MSWINDOWS} db $C5, $FA, $7F, $34, $24 // vmovdqu oword [rsp], xmm6 db $C5, $FA, $7F, $7C, $24, $10 // vmovdqu oword [rsp + $10], xmm7 mov [rsp + $30], rdi mov [rsp + $38], rsi -{$ENDIF} mov [rsp + $20], rbx mov [rsp + $28], rbp mov [rsp + $40], r12 @@ -439,12 +437,10 @@ mov r13, [rsp + $48] mov r14, [rsp + $50] mov r15, [rsp + $58] -{$IFDEF MSWINDOWS} mov rdi, [rsp + $30] mov rsi, [rsp + $38] db $C5, $FA, $6F, $34, $24 // vmovdqu xmm6, oword [rsp] db $C5, $FA, $6F, $7C, $24, $10 // vmovdqu xmm7, oword [rsp + $10] -{$ENDIF} add rsp, 440 db $C5, $F8, $77 // vzeroupper diff --git a/HashLib/src/Include/Simd/SHA1/SHA1CompressShaNi_x86_64.inc b/HashLib/src/Include/Simd/SHA1/SHA1CompressShaNi_x86_64.inc index 783e3b8..c0dc8b9 100644 --- a/HashLib/src/Include/Simd/SHA1/SHA1CompressShaNi_x86_64.inc +++ b/HashLib/src/Include/Simd/SHA1/SHA1CompressShaNi_x86_64.inc @@ -2,7 +2,7 @@ // Expects MS x64 ABI: rcx = state ptr, rdx = data ptr, r8d = numblocks, r9 = K_SHA1 ptr. // K_SHA1 layout: K_00_19 at 0, K_20_39 at 16, K_40_59 at 32, // K_60_79 at 48, BSWAP32 mask (16 bytes) at offset 64. -// Uses xmm0-xmm8. Non-volatile xmm6-xmm8 saved/restored on Windows. +// Uses xmm0-xmm8; xmm6-xmm8 are MS x64 non-volatile (saved/restored). // Reference: Intel SHA Extensions whitepaper. // // SHA-NI instruction db encodings (for assembler compatibility): @@ -22,12 +22,10 @@ // xmm7 = BSWAP mask, then E_SAVE // xmm8 = ABCD_SAVE -{$IFDEF MSWINDOWS} sub rsp, 56 movdqu oword [rsp], xmm6 movdqu oword [rsp + $10], xmm7 movdqu oword [rsp + $20], xmm8 -{$ENDIF} test r8d, r8d jz @sha1ni_done @@ -230,9 +228,7 @@ @sha1ni_done: -{$IFDEF MSWINDOWS} movdqu xmm6, oword [rsp] movdqu xmm7, oword [rsp + $10] movdqu xmm8, oword [rsp + $20] add rsp, 56 -{$ENDIF} diff --git a/HashLib/src/Include/Simd/SHA1/SHA1CompressSse2_i386.inc b/HashLib/src/Include/Simd/SHA1/SHA1CompressSse2_i386.inc index 0b5ecfc..a02d68a 100644 --- a/HashLib/src/Include/Simd/SHA1/SHA1CompressSse2_i386.inc +++ b/HashLib/src/Include/Simd/SHA1/SHA1CompressSse2_i386.inc @@ -4,7 +4,7 @@ // (pshuflw/pshufhw + psrlw/psllw/por) instead of SSSE3 pshufb. // IA-32: after SimdProc3Begin_i386 — ebx = state, esi = data, edi = numblocks // (parallel to MS x64 ABI: rcx, rdx, r8d; K constants are inlined here — x64 SSE2 passes K_SHA1 via r9). -// Uses xmm0–xmm6. Non-volatile xmm6 saved/restored on Windows (MSWINDOWS). +// Uses xmm0–xmm6; xmm6 saved/restored defensively (volatile on i386). // // Two phases per block: // Phase 1 (SIMD): SSE2 byte-swap W[0..15], then expand W[16..79] (same as SSSE3 schedule). @@ -16,9 +16,7 @@ sub esp, 424 -{$IFDEF MSWINDOWS} movdqu oword ptr [esp], xmm6 -{$ENDIF} mov [esp + $10], ebx mov [esp + $14], esi mov [esp + $18], edi @@ -460,9 +458,7 @@ @sse2_sha1_done: mov ebp, [esp + $38] -{$IFDEF MSWINDOWS} movdqu xmm6, oword ptr [esp] -{$ENDIF} add esp, 424 pop edi pop esi diff --git a/HashLib/src/Include/Simd/SHA1/SHA1CompressSse2_x86_64.inc b/HashLib/src/Include/Simd/SHA1/SHA1CompressSse2_x86_64.inc index 4fa5a00..2a15325 100644 --- a/HashLib/src/Include/Simd/SHA1/SHA1CompressSse2_x86_64.inc +++ b/HashLib/src/Include/Simd/SHA1/SHA1CompressSse2_x86_64.inc @@ -12,16 +12,14 @@ // using vectorized message schedule (same as SSSE3). // Phase 2 (GPR): 80 compression rounds with 5-round unrolling // -// Non-volatile register saves: -// Windows: xmm6, rbx, rbp, rdi, rsi, r12-r15 -// Unix: rbx, rbp, r12-r15 +// MS x64 non-volatile saves: xmm6, rbx, rbp, rdi, rsi, r12-r15. // // Stack layout (sub rsp, 424): -// [rsp + 0.. 15]: xmm6 save (Windows only) +// [rsp + 0.. 15]: xmm6 save // [rsp + 16.. 23]: rbx save // [rsp + 24.. 31]: rbp save -// [rsp + 32.. 39]: rdi save (Windows only) -// [rsp + 40.. 47]: rsi save (Windows only) +// [rsp + 32.. 39]: rdi save +// [rsp + 40.. 47]: rsi save // [rsp + 48.. 55]: r12 save // [rsp + 56.. 63]: r13 save // [rsp + 64.. 71]: r14 save @@ -32,11 +30,9 @@ sub rsp, 424 -{$IFDEF MSWINDOWS} movdqu oword [rsp], xmm6 mov [rsp + $20], rdi mov [rsp + $28], rsi -{$ENDIF} mov [rsp + $10], rbx mov [rsp + $18], rbp mov [rsp + $30], r12 @@ -471,9 +467,7 @@ mov r13, [rsp + $38] mov r14, [rsp + $40] mov r15, [rsp + $48] -{$IFDEF MSWINDOWS} mov rdi, [rsp + $20] mov rsi, [rsp + $28] movdqu xmm6, oword [rsp] -{$ENDIF} add rsp, 424 diff --git a/HashLib/src/Include/Simd/SHA1/SHA1CompressSsse3_i386.inc b/HashLib/src/Include/Simd/SHA1/SHA1CompressSsse3_i386.inc index ccd21aa..d57ade7 100644 --- a/HashLib/src/Include/Simd/SHA1/SHA1CompressSsse3_i386.inc +++ b/HashLib/src/Include/Simd/SHA1/SHA1CompressSsse3_i386.inc @@ -2,22 +2,21 @@ // After SimdProc4Begin_i386 — ebx = state, esi = data, edi = numblocks, eax = K_SHA1 ptr. // K_SHA1 layout matches x64: K_00_19..K_60_79 (64B), BSWAP32 mask at offset 64 ($40). // Phase 1 matches SHA1CompressSsse3_x86_64.inc (pshufb + same expand); Phase 2 matches SSE2 i386 GPR rounds. -// Uses xmm0–xmm7 in Phase 1; xmm7 = BSWAP mask. Non-volatile xmm6–xmm7 saved on Windows. +// Uses xmm0–xmm7 in Phase 1; xmm7 = BSWAP mask. +// xmm6–xmm7 saved/restored defensively (volatile on i386). // // Two phases per block: // Phase 1 (SIMD): pshufb W[0..15], then expand W[16..79] (same as x64 SSSE3 schedule). // Phase 2 (GPR): 80 compression rounds with 5-round unrolling // // Saved GPR / stack (sub esp, 424): [esp+$10] state, +$14 data, +$18 numblocks, +$34 K_SHA1, -// +$1C E, +$20 loop ctr, +$38 saved ebp; W buffer at +$60. Win: xmm6 at [esp], xmm7 at [esp+$24]. +// +$1C E, +$20 loop ctr, +$38 saved ebp; W buffer at +$60. xmm6 at [esp], xmm7 at [esp+$24]. // W-buffer uses movdqu (not movdqa): this frame does not keep ESP 16-byte aligned. sub esp, 424 -{$IFDEF MSWINDOWS} movdqu oword ptr [esp], xmm6 movdqu oword ptr [esp + $24], xmm7 -{$ENDIF} mov [esp + $10], ebx mov [esp + $14], esi mov [esp + $18], edi @@ -441,10 +440,8 @@ @ssse3_sha1_done: mov ebp, [esp + $38] -{$IFDEF MSWINDOWS} movdqu xmm7, oword ptr [esp + $24] movdqu xmm6, oword ptr [esp] -{$ENDIF} add esp, 424 pop edi pop esi diff --git a/HashLib/src/Include/Simd/SHA1/SHA1CompressSsse3_x86_64.inc b/HashLib/src/Include/Simd/SHA1/SHA1CompressSsse3_x86_64.inc index 8b0ba60..c645241 100644 --- a/HashLib/src/Include/Simd/SHA1/SHA1CompressSsse3_x86_64.inc +++ b/HashLib/src/Include/Simd/SHA1/SHA1CompressSsse3_x86_64.inc @@ -13,17 +13,15 @@ // W[32..79]: ROL2 expanded recurrence W[t]=ROL2(W[t-6]^W[t-16]^W[t-28]^W[t-32]). // Phase 2 (GPR): 80 compression rounds with 5-round unrolling // -// Non-volatile register saves: -// Windows: xmm6-xmm7, rbx, rbp, rdi, rsi, r12-r15 -// Unix: rbx, rbp, r12-r15 +// MS x64 non-volatile saves: xmm6-xmm7, rbx, rbp, rdi, rsi, r12-r15. // // Stack layout (sub rsp, 440): -// [rsp + 0.. 15]: xmm6 save (Windows only) -// [rsp + 16.. 31]: xmm7 save (Windows only) +// [rsp + 0.. 15]: xmm6 save +// [rsp + 16.. 31]: xmm7 save // [rsp + 32.. 39]: rbx save // [rsp + 40.. 47]: rbp save -// [rsp + 48.. 55]: rdi save (Windows only) -// [rsp + 56.. 63]: rsi save (Windows only) +// [rsp + 48.. 55]: rdi save +// [rsp + 56.. 63]: rsi save // [rsp + 64.. 71]: r12 save // [rsp + 72.. 79]: r13 save // [rsp + 80.. 87]: r14 save @@ -34,12 +32,10 @@ sub rsp, 440 -{$IFDEF MSWINDOWS} movdqu oword [rsp], xmm6 movdqu oword [rsp + $10], xmm7 mov [rsp + $30], rdi mov [rsp + $38], rsi -{$ENDIF} mov [rsp + $20], rbx mov [rsp + $28], rbp mov [rsp + $40], r12 @@ -460,10 +456,8 @@ mov r13, [rsp + $48] mov r14, [rsp + $50] mov r15, [rsp + $58] -{$IFDEF MSWINDOWS} mov rdi, [rsp + $30] mov rsi, [rsp + $38] movdqu xmm6, oword [rsp] movdqu xmm7, oword [rsp + $10] -{$ENDIF} add rsp, 440 diff --git a/HashLib/src/Include/Simd/SHA256/SHA256CompressAvx2_x86_64.inc b/HashLib/src/Include/Simd/SHA256/SHA256CompressAvx2_x86_64.inc index 30faad4..23d5323 100644 --- a/HashLib/src/Include/Simd/SHA256/SHA256CompressAvx2_x86_64.inc +++ b/HashLib/src/Include/Simd/SHA256/SHA256CompressAvx2_x86_64.inc @@ -9,13 +9,15 @@ // Phase 1 (VEX-128 SIMD): Compute W+K[0..63] using message schedule, store to stack // Phase 2 (GPR): Run 64 compression rounds reading W+K from stack // +// MS x64 non-volatile saves: xmm6-xmm7, rbx, rbp, rdi, rsi, r12-r15. +// // Stack layout (sub rsp, 376): same as SSSE3 version -// [rsp + 0.. 15]: xmm6 save (Windows only) -// [rsp + 16.. 31]: xmm7 save (Windows only) +// [rsp + 0.. 15]: xmm6 save +// [rsp + 16.. 31]: xmm7 save // [rsp + 32.. 39]: rbx save // [rsp + 40.. 47]: rbp save -// [rsp + 48.. 55]: rdi save (Windows only) -// [rsp + 56.. 63]: rsi save (Windows only) +// [rsp + 48.. 55]: rdi save +// [rsp + 56.. 63]: rsi save // [rsp + 64.. 71]: r12 save // [rsp + 72.. 79]: r13 save // [rsp + 80.. 87]: r14 save @@ -26,12 +28,10 @@ sub rsp, 376 -{$IFDEF MSWINDOWS} db $C5, $FA, $7F, $34, $24 // vmovdqu oword [rsp], xmm6 db $C5, $FA, $7F, $7C, $24, $10 // vmovdqu oword [rsp + $10], xmm7 mov [rsp + $30], rdi mov [rsp + $38], rsi -{$ENDIF} mov [rsp + $20], rbx mov [rsp + $28], rbp mov [rsp + $40], r12 @@ -445,12 +445,10 @@ mov r13, [rsp + $48] mov r14, [rsp + $50] mov r15, [rsp + $58] -{$IFDEF MSWINDOWS} mov rdi, [rsp + $30] mov rsi, [rsp + $38] db $C5, $FA, $6F, $34, $24 // vmovdqu xmm6, oword [rsp] db $C5, $FA, $6F, $7C, $24, $10 // vmovdqu xmm7, oword [rsp + $10] -{$ENDIF} add rsp, 376 db $C5, $F8, $77 // vzeroupper diff --git a/HashLib/src/Include/Simd/SHA256/SHA256CompressShaNi_x86_64.inc b/HashLib/src/Include/Simd/SHA256/SHA256CompressShaNi_x86_64.inc index 223a2ed..ee7791d 100644 --- a/HashLib/src/Include/Simd/SHA256/SHA256CompressShaNi_x86_64.inc +++ b/HashLib/src/Include/Simd/SHA256/SHA256CompressShaNi_x86_64.inc @@ -1,7 +1,7 @@ // SHA-256 SHA-NI implementation. // Expects MS x64 ABI: rcx = state ptr, rdx = data ptr, r8d = numblocks, r9 = K256 ptr. // K256 layout: 64 UInt32 round constants (256 bytes), then BSWAP mask (16 bytes) at offset 256. -// Uses xmm0-xmm9. Non-volatile xmm6-xmm9 saved/restored on Windows. +// Uses xmm0-xmm9; xmm6-xmm9 are MS x64 non-volatile (saved/restored). // Reference: Jeffrey Walton / Sean Gulley / Intel SHA Extensions whitepaper. // // SHA-NI instruction db encodings (for assembler compatibility): @@ -21,13 +21,11 @@ // xmm8 = ABEF_SAVE // xmm9 = CDGH_SAVE -{$IFDEF MSWINDOWS} sub rsp, 72 movdqu oword [rsp], xmm6 movdqu oword [rsp + $10], xmm7 movdqu oword [rsp + $20], xmm8 movdqu oword [rsp + $30], xmm9 -{$ENDIF} test r8d, r8d jz @shani_done @@ -252,10 +250,8 @@ @shani_done: -{$IFDEF MSWINDOWS} movdqu xmm6, oword [rsp] movdqu xmm7, oword [rsp + $10] movdqu xmm8, oword [rsp + $20] movdqu xmm9, oword [rsp + $30] add rsp, 72 -{$ENDIF} diff --git a/HashLib/src/Include/Simd/SHA256/SHA256CompressSse2_i386.inc b/HashLib/src/Include/Simd/SHA256/SHA256CompressSse2_i386.inc index bc1c0f4..cc8b976 100644 --- a/HashLib/src/Include/Simd/SHA256/SHA256CompressSse2_i386.inc +++ b/HashLib/src/Include/Simd/SHA256/SHA256CompressSse2_i386.inc @@ -6,7 +6,7 @@ // IA-32: after SimdProc4Begin_i386 — ebx = state, esi = data, edi = numblocks, eax = K256 ptr // (parallel to MS x64 ABI: rcx, rdx, r8d, r9). // K256 layout: 64 UInt32 round constants (256 bytes), then BSWAP mask (16 bytes) at offset 256. -// Uses xmm0–xmm7. Non-volatile xmm6–xmm7 saved/restored on Windows (MSWINDOWS). +// Uses xmm0–xmm7; xmm6–xmm7 saved/restored defensively (volatile on i386). // // Two-phase per block: // Phase 1 (SIMD): Compute W+K[0..63] using SSE2 message schedule, store to stack @@ -17,10 +17,8 @@ sub esp, 376 -{$IFDEF MSWINDOWS} movdqu oword ptr [esp], xmm6 movdqu oword ptr [esp + $10], xmm7 -{$ENDIF} mov [esp + $20], ebx mov [esp + $24], esi mov [esp + $28], edi @@ -506,10 +504,8 @@ @sse2_256_done: mov ebp, [esp + $30] -{$IFDEF MSWINDOWS} movdqu xmm6, oword ptr [esp] movdqu xmm7, oword ptr [esp + $10] -{$ENDIF} add esp, 376 pop edi pop esi diff --git a/HashLib/src/Include/Simd/SHA256/SHA256CompressSse2_x86_64.inc b/HashLib/src/Include/Simd/SHA256/SHA256CompressSse2_x86_64.inc index 2c0be2d..8fb37ce 100644 --- a/HashLib/src/Include/Simd/SHA256/SHA256CompressSse2_x86_64.inc +++ b/HashLib/src/Include/Simd/SHA256/SHA256CompressSse2_x86_64.inc @@ -11,20 +11,16 @@ // Phase 1 (SIMD): Compute W+K[0..63] using SSE2 message schedule, store to stack // Phase 2 (GPR): Run 64 compression rounds reading W+K from stack // -// Non-volatile register saves: -// Windows: xmm6-xmm7, rbx, rbp, rdi, rsi, r12-r15 -// Unix: rbx, rbp, r12-r15 +// MS x64 non-volatile saves: xmm6-xmm7, rbx, rbp, rdi, rsi, r12-r15. // // Stack layout (sub rsp, 376): same as SSSE3 version sub rsp, 376 -{$IFDEF MSWINDOWS} movdqu oword [rsp], xmm6 movdqu oword [rsp + $10], xmm7 mov [rsp + $30], rdi mov [rsp + $38], rsi -{$ENDIF} mov [rsp + $20], rbx mov [rsp + $28], rbp mov [rsp + $40], r12 @@ -481,10 +477,8 @@ mov r13, [rsp + $48] mov r14, [rsp + $50] mov r15, [rsp + $58] -{$IFDEF MSWINDOWS} mov rdi, [rsp + $30] mov rsi, [rsp + $38] movdqu xmm6, oword [rsp] movdqu xmm7, oword [rsp + $10] -{$ENDIF} add rsp, 376 diff --git a/HashLib/src/Include/Simd/SHA256/SHA256CompressSsse3_i386.inc b/HashLib/src/Include/Simd/SHA256/SHA256CompressSsse3_i386.inc index 3e615da..0100fc6 100644 --- a/HashLib/src/Include/Simd/SHA256/SHA256CompressSsse3_i386.inc +++ b/HashLib/src/Include/Simd/SHA256/SHA256CompressSsse3_i386.inc @@ -2,7 +2,7 @@ // After SimdProc4Begin_i386 — ebx = state, esi = data, edi = numblocks, eax = K256 ptr. // Phase 1 matches SHA256CompressSsse3_x86_64.inc (pshufb, palignr); Phase 2 same GPR rounds as SSE2 i386. // K256 layout: 64 UInt32 round constants (256 bytes), then BSWAP mask (16 bytes) at offset 256. -// Uses xmm0–xmm7. Non-volatile xmm6–xmm7 saved/restored on Windows (MSWINDOWS). +// Uses xmm0–xmm7; xmm6–xmm7 saved/restored defensively (volatile on i386). // // Two-phase per block: // Phase 1 (SIMD): Compute W+K[0..63] using SSSE3 message schedule, store to stack @@ -13,10 +13,8 @@ sub esp, 376 -{$IFDEF MSWINDOWS} movdqu oword ptr [esp], xmm6 movdqu oword ptr [esp + $10], xmm7 -{$ENDIF} mov [esp + $20], ebx mov [esp + $24], esi mov [esp + $28], edi @@ -468,10 +466,8 @@ @ssse3_256_done: mov ebp, [esp + $30] -{$IFDEF MSWINDOWS} movdqu xmm6, oword ptr [esp] movdqu xmm7, oword ptr [esp + $10] -{$ENDIF} add esp, 376 pop edi pop esi diff --git a/HashLib/src/Include/Simd/SHA256/SHA256CompressSsse3_x86_64.inc b/HashLib/src/Include/Simd/SHA256/SHA256CompressSsse3_x86_64.inc index ed6f13a..6bac847 100644 --- a/HashLib/src/Include/Simd/SHA256/SHA256CompressSsse3_x86_64.inc +++ b/HashLib/src/Include/Simd/SHA256/SHA256CompressSsse3_x86_64.inc @@ -7,17 +7,15 @@ // Phase 1 (SIMD): Compute W+K[0..63] using SSSE3 message schedule, store to stack // Phase 2 (GPR): Run 64 compression rounds reading W+K from stack // -// Non-volatile register saves: -// Windows: xmm6-xmm7, rbx, rbp, rdi, rsi, r12-r15 -// Unix: rbx, rbp, r12-r15 +// MS x64 non-volatile saves: xmm6-xmm7, rbx, rbp, rdi, rsi, r12-r15. // // Stack layout (sub rsp, 376): -// [rsp + 0.. 15]: xmm6 save (Windows only) -// [rsp + 16.. 31]: xmm7 save (Windows only) +// [rsp + 0.. 15]: xmm6 save +// [rsp + 16.. 31]: xmm7 save // [rsp + 32.. 39]: rbx save // [rsp + 40.. 47]: rbp save -// [rsp + 48.. 55]: rdi save (Windows only) -// [rsp + 56.. 63]: rsi save (Windows only) +// [rsp + 48.. 55]: rdi save +// [rsp + 56.. 63]: rsi save // [rsp + 64.. 71]: r12 save // [rsp + 72.. 79]: r13 save // [rsp + 80.. 87]: r14 save @@ -28,12 +26,10 @@ sub rsp, 376 -{$IFDEF MSWINDOWS} movdqu oword [rsp], xmm6 movdqu oword [rsp + $10], xmm7 mov [rsp + $30], rdi mov [rsp + $38], rsi -{$ENDIF} mov [rsp + $20], rbx mov [rsp + $28], rbp mov [rsp + $40], r12 @@ -463,10 +459,8 @@ mov r13, [rsp + $48] mov r14, [rsp + $50] mov r15, [rsp + $58] -{$IFDEF MSWINDOWS} mov rdi, [rsp + $30] mov rsi, [rsp + $38] movdqu xmm6, oword [rsp] movdqu xmm7, oword [rsp + $10] -{$ENDIF} add rsp, 376 diff --git a/HashLib/src/Include/Simd/SHA3/KeccakF1600Avx2Absorb_x86_64.inc b/HashLib/src/Include/Simd/SHA3/KeccakF1600Avx2Absorb_x86_64.inc index 6d815f3..25ec748 100644 --- a/HashLib/src/Include/Simd/SHA3/KeccakF1600Avx2Absorb_x86_64.inc +++ b/HashLib/src/Include/Simd/SHA3/KeccakF1600Avx2Absorb_x86_64.inc @@ -24,12 +24,15 @@ // ymm5 = { state[21], state[17], state[13], state[9] } // ymm6 = { state[6], state[12], state[18], state[24] } // +// MS x64 non-volatile saves: xmm6-xmm15, rbx, rbp, r12-r15, rdi, rsi. +// rdi/rsi are also pushed under non-MS ABIs because the absorb loop uses rsi +// as the A_JAGGED base pointer for the duration of the routine. +// // Stack layout (sub rsp, 424): -// [rsp+0..223] : jagged buffer (7 x 32 bytes) -// [rsp+224..255] : scratch -// [rsp+256..415] : save xmm6-xmm15 (Windows only) +// [rsp+ 0..223]: jagged buffer (7 x 32 bytes) +// [rsp+224..255]: scratch +// [rsp+256..415]: xmm6-xmm15 save area // -// Callee-saved GPRs (pushed): rbx, rbp, r12-r15, rdi, rsi (Win) / rbx, rbp, r12-r15 (Linux) // Register allocation across absorb loop: // r12 = state pointer (for final scatter) // r13 = data pointer (advances per block) @@ -37,7 +40,7 @@ // r15d = block size in bytes (constant) // rbx = constants pointer (constant) // ebp = block size in qwords (constant) -// rsi = A_JAGGED base pointer (constant, Windows callee-saved) +// rsi = A_JAGGED base pointer (constant) // ===== Prologue: save callee-saved GPRs ===== push rbx @@ -46,15 +49,12 @@ push r13 push r14 push r15 -{$IFDEF MSWINDOWS} push rdi push rsi -{$ENDIF} sub rsp, 424 - // Save XMM6-XMM15 (Windows only) -{$IFDEF MSWINDOWS} + // Save XMM6-XMM15 db $C5, $FA, $7F, $B4, $24, $00, $01, $00, $00 // vmovdqu [rsp+256], xmm6 db $C5, $FA, $7F, $BC, $24, $10, $01, $00, $00 // vmovdqu [rsp+272], xmm7 db $C4, $61, $7A, $7F, $84, $24, $20, $01, $00, $00 // vmovdqu [rsp+288], xmm8 @@ -65,7 +65,6 @@ db $C4, $61, $7A, $7F, $AC, $24, $70, $01, $00, $00 // vmovdqu [rsp+368], xmm13 db $C4, $61, $7A, $7F, $B4, $24, $80, $01, $00, $00 // vmovdqu [rsp+384], xmm14 db $C4, $61, $7A, $7F, $BC, $24, $90, $01, $00, $00 // vmovdqu [rsp+400], xmm15 -{$ENDIF} // Save parameters to callee-saved registers mov r12, rcx @@ -402,8 +401,7 @@ // ===== Epilogue ===== db $C5, $F8, $77 // vzeroupper - // Restore XMM6-XMM15 (Windows only) -{$IFDEF MSWINDOWS} + // Restore XMM6-XMM15 db $C5, $FA, $6F, $B4, $24, $00, $01, $00, $00 // vmovdqu xmm6, [rsp+256] db $C5, $FA, $6F, $BC, $24, $10, $01, $00, $00 // vmovdqu xmm7, [rsp+272] db $C4, $61, $7A, $6F, $84, $24, $20, $01, $00, $00 // vmovdqu xmm8, [rsp+288] @@ -414,14 +412,11 @@ db $C4, $61, $7A, $6F, $AC, $24, $70, $01, $00, $00 // vmovdqu xmm13, [rsp+368] db $C4, $61, $7A, $6F, $B4, $24, $80, $01, $00, $00 // vmovdqu xmm14, [rsp+384] db $C4, $61, $7A, $6F, $BC, $24, $90, $01, $00, $00 // vmovdqu xmm15, [rsp+400] -{$ENDIF} add rsp, 424 -{$IFDEF MSWINDOWS} pop rsi pop rdi -{$ENDIF} pop r15 pop r14 pop r13 diff --git a/HashLib/src/Include/Simd/SHA3/KeccakF1600Avx2_x86_64.inc b/HashLib/src/Include/Simd/SHA3/KeccakF1600Avx2_x86_64.inc index dcec508..80d7ec3 100644 --- a/HashLib/src/Include/Simd/SHA3/KeccakF1600Avx2_x86_64.inc +++ b/HashLib/src/Include/Simd/SHA3/KeccakF1600Avx2_x86_64.inc @@ -17,14 +17,15 @@ // ymm5 = { state[21], state[17], state[13], state[9] } // ymm6 = { state[6], state[12], state[18], state[24] } // +// MS x64 non-volatile saves: xmm6-xmm15. +// // Stack layout (sub rsp, 224): -// [rsp+0..31] : 32-byte temp buffer for gather/scatter -// [rsp+32..191] : save xmm6-xmm15 (Windows only) +// [rsp+ 0.. 31]: 32-byte temp buffer for gather/scatter +// [rsp+ 32..191]: xmm6-xmm15 save area // [rsp+192..223]: padding sub rsp, 224 -{$IFDEF MSWINDOWS} db $C5, $FA, $7F, $74, $24, $20 // vmovdqu [rsp+32], xmm6 db $C5, $FA, $7F, $7C, $24, $30 // vmovdqu [rsp+48], xmm7 db $C4, $61, $7A, $7F, $44, $24, $40 // vmovdqu [rsp+64], xmm8 @@ -35,7 +36,6 @@ db $C4, $61, $7A, $7F, $AC, $24, $90, $00, $00, $00 // vmovdqu [rsp+144], xmm13 db $C4, $61, $7A, $7F, $B4, $24, $A0, $00, $00, $00 // vmovdqu [rsp+160], xmm14 db $C4, $61, $7A, $7F, $BC, $24, $B0, $00, $00, $00 // vmovdqu [rsp+176], xmm15 -{$ENDIF} // Set up table pointers (biased by +96 for compact displacements) lea r8, [rdx + 96] @@ -281,7 +281,6 @@ db $C5, $F8, $77 // vzeroupper -{$IFDEF MSWINDOWS} db $C5, $FA, $6F, $74, $24, $20 // vmovdqu xmm6, [rsp+32] db $C5, $FA, $6F, $7C, $24, $30 // vmovdqu xmm7, [rsp+48] db $C4, $61, $7A, $6F, $44, $24, $40 // vmovdqu xmm8, [rsp+64] @@ -292,6 +291,5 @@ db $C4, $61, $7A, $6F, $AC, $24, $90, $00, $00, $00 // vmovdqu xmm13, [rsp+144] db $C4, $61, $7A, $6F, $B4, $24, $A0, $00, $00, $00 // vmovdqu xmm14, [rsp+160] db $C4, $61, $7A, $6F, $BC, $24, $B0, $00, $00, $00 // vmovdqu xmm15, [rsp+176] -{$ENDIF} add rsp, 224 diff --git a/HashLib/src/Include/Simd/SHA512/SHA512CompressAvx2_x86_64.inc b/HashLib/src/Include/Simd/SHA512/SHA512CompressAvx2_x86_64.inc index 3fe14db..f5f04dc 100644 --- a/HashLib/src/Include/Simd/SHA512/SHA512CompressAvx2_x86_64.inc +++ b/HashLib/src/Include/Simd/SHA512/SHA512CompressAvx2_x86_64.inc @@ -5,13 +5,15 @@ // Expects MS x64 ABI: rcx = state ptr, rdx = data ptr, r8d = numblocks, r9 = K512 ptr. // K512 layout: 80 UInt64 round constants (640 bytes), then BSWAP64 mask (16 bytes) at offset 640. // +// MS x64 non-volatile saves: xmm6-xmm7, rbx, rbp, rdi, rsi, r12-r15. +// // Stack layout (sub rsp, 1016): same as SSSE3 version -// [rsp + 0.. 15]: xmm6 save (Windows only) -// [rsp + 16.. 31]: xmm7 save (Windows only) +// [rsp + 0.. 15]: xmm6 save +// [rsp + 16.. 31]: xmm7 save // [rsp + 32.. 39]: rbx save // [rsp + 40.. 47]: rbp save -// [rsp + 48.. 55]: rdi save (Windows only) -// [rsp + 56.. 63]: rsi save (Windows only) +// [rsp + 48.. 55]: rdi save +// [rsp + 56.. 63]: rsi save // [rsp + 64.. 71]: r12 save // [rsp + 72.. 79]: r13 save // [rsp + 80.. 87]: r14 save @@ -23,12 +25,10 @@ sub rsp, 1016 -{$IFDEF MSWINDOWS} db $C5, $FA, $7F, $34, $24 // vmovdqu oword [rsp], xmm6 db $C5, $FA, $7F, $7C, $24, $10 // vmovdqu oword [rsp + $10], xmm7 mov [rsp + $30], rdi mov [rsp + $38], rsi -{$ENDIF} mov [rsp + $20], rbx mov [rsp + $28], rbp mov [rsp + $40], r12 @@ -467,12 +467,10 @@ mov r13, [rsp + $48] mov r14, [rsp + $50] mov r15, [rsp + $58] -{$IFDEF MSWINDOWS} mov rdi, [rsp + $30] mov rsi, [rsp + $38] db $C5, $FA, $6F, $34, $24 // vmovdqu xmm6, oword [rsp] db $C5, $FA, $6F, $7C, $24, $10 // vmovdqu xmm7, oword [rsp + $10] -{$ENDIF} add rsp, 1016 db $C5, $F8, $77 // vzeroupper diff --git a/HashLib/src/Include/Simd/SHA512/SHA512CompressSse2_i386.inc b/HashLib/src/Include/Simd/SHA512/SHA512CompressSse2_i386.inc index 399d029..1b8b413 100644 --- a/HashLib/src/Include/Simd/SHA512/SHA512CompressSse2_i386.inc +++ b/HashLib/src/Include/Simd/SHA512/SHA512CompressSse2_i386.inc @@ -11,10 +11,8 @@ sub esp, 1016 -{$IFDEF MSWINDOWS} movdqu oword ptr [esp], xmm6 movdqu oword ptr [esp + $10], xmm7 -{$ENDIF} mov dword ptr [esp + $20], ebx mov dword ptr [esp + $24], esi mov dword ptr [esp + $28], edi @@ -1331,10 +1329,8 @@ jnz @sse2_512_block_loop @sse2_512_done: -{$IFDEF MSWINDOWS} movdqu xmm6, oword ptr [esp] movdqu xmm7, oword ptr [esp + $10] -{$ENDIF} mov ebp, dword ptr [esp + $30] add esp, 1016 pop edi diff --git a/HashLib/src/Include/Simd/SHA512/SHA512CompressSse2_x86_64.inc b/HashLib/src/Include/Simd/SHA512/SHA512CompressSse2_x86_64.inc index 80e1229..a2b05b6 100644 --- a/HashLib/src/Include/Simd/SHA512/SHA512CompressSse2_x86_64.inc +++ b/HashLib/src/Include/Simd/SHA512/SHA512CompressSse2_x86_64.inc @@ -10,10 +10,8 @@ sub rsp, 1016 -{$IFDEF MSWINDOWS} mov [rsp + $30], rdi mov [rsp + $38], rsi -{$ENDIF} mov [rsp + $20], rbx mov [rsp + $28], rbp mov [rsp + $40], r12 @@ -486,8 +484,6 @@ mov r13, [rsp + $48] mov r14, [rsp + $50] mov r15, [rsp + $58] -{$IFDEF MSWINDOWS} mov rdi, [rsp + $30] mov rsi, [rsp + $38] -{$ENDIF} add rsp, 1016 diff --git a/HashLib/src/Include/Simd/SHA512/SHA512CompressSsse3_i386.inc b/HashLib/src/Include/Simd/SHA512/SHA512CompressSsse3_i386.inc index 5001b2d..4d4ccd7 100644 --- a/HashLib/src/Include/Simd/SHA512/SHA512CompressSsse3_i386.inc +++ b/HashLib/src/Include/Simd/SHA512/SHA512CompressSsse3_i386.inc @@ -11,10 +11,8 @@ sub esp, 1016 -{$IFDEF MSWINDOWS} movdqu oword ptr [esp], xmm6 movdqu oword ptr [esp + $10], xmm7 -{$ENDIF} mov dword ptr [esp + $20], ebx mov dword ptr [esp + $24], esi mov dword ptr [esp + $28], edi @@ -1292,10 +1290,8 @@ jnz @ssse3_512_block_loop @ssse3_512_done: -{$IFDEF MSWINDOWS} movdqu xmm6, oword ptr [esp] movdqu xmm7, oword ptr [esp + $10] -{$ENDIF} mov ebp, dword ptr [esp + $30] add esp, 1016 pop edi diff --git a/HashLib/src/Include/Simd/SHA512/SHA512CompressSsse3_x86_64.inc b/HashLib/src/Include/Simd/SHA512/SHA512CompressSsse3_x86_64.inc index bce3db4..5db35f4 100644 --- a/HashLib/src/Include/Simd/SHA512/SHA512CompressSsse3_x86_64.inc +++ b/HashLib/src/Include/Simd/SHA512/SHA512CompressSsse3_x86_64.inc @@ -7,13 +7,15 @@ // Message schedule expansion W[16..79] and compression rounds use GPR only (64-bit words). // W values are kept in a 16-entry circular buffer on the stack. // +// MS x64 non-volatile saves: xmm6-xmm7, rbx, rbp, rdi, rsi, r12-r15. +// // Stack layout (sub rsp, 1016): -// [rsp + 0.. 15]: xmm6 save (Windows only) -// [rsp + 16.. 31]: xmm7 save (Windows only) +// [rsp + 0.. 15]: xmm6 save +// [rsp + 16.. 31]: xmm7 save // [rsp + 32.. 39]: rbx save // [rsp + 40.. 47]: rbp save -// [rsp + 48.. 55]: rdi save (Windows only) -// [rsp + 56.. 63]: rsi save (Windows only) +// [rsp + 48.. 55]: rdi save +// [rsp + 56.. 63]: rsi save // [rsp + 64.. 71]: r12 save // [rsp + 72.. 79]: r13 save // [rsp + 80.. 87]: r14 save @@ -25,12 +27,10 @@ sub rsp, 1016 -{$IFDEF MSWINDOWS} movdqu oword [rsp], xmm6 movdqu oword [rsp + $10], xmm7 mov [rsp + $30], rdi mov [rsp + $38], rsi -{$ENDIF} mov [rsp + $20], rbx mov [rsp + $28], rbp mov [rsp + $40], r12 @@ -474,10 +474,8 @@ mov r13, [rsp + $48] mov r14, [rsp + $50] mov r15, [rsp + $58] -{$IFDEF MSWINDOWS} mov rdi, [rsp + $30] mov rsi, [rsp + $38] movdqu xmm6, oword [rsp] movdqu xmm7, oword [rsp + $10] -{$ENDIF} add rsp, 1016 diff --git a/HashLib/src/Include/Simd/Scrypt/ScryptSalsa8Sse2_x86_64.inc b/HashLib/src/Include/Simd/Scrypt/ScryptSalsa8Sse2_x86_64.inc index 5bfd695..50911a5 100644 --- a/HashLib/src/Include/Simd/Scrypt/ScryptSalsa8Sse2_x86_64.inc +++ b/HashLib/src/Include/Simd/Scrypt/ScryptSalsa8Sse2_x86_64.inc @@ -12,7 +12,7 @@ // xmm0 = A = {w0,w5,w10,w15}, xmm1 = B = {w4,w9,w14,w3}, // xmm2 = C = {w8,w13,w2,w7}, xmm3 = D = {w12,w1,w6,w11}. // Operation: State = Salsa20/8(State XOR Input) -// Uses xmm0-xmm5 (all volatile on Windows and Unix). No spills needed. +// Uses xmm0-xmm5 (volatile under both ABIs; no saves needed). // Stack: 72 bytes (64 for saved XOR'd state + 8 alignment padding). // // Column QR (lane-parallel): diff --git a/HashLib/src/Utils/HlpX86SimdFeatures.pas b/HashLib/src/Utils/HlpX86SimdFeatures.pas index fe72655..460a158 100644 --- a/HashLib/src/Utils/HlpX86SimdFeatures.pas +++ b/HashLib/src/Utils/HlpX86SimdFeatures.pas @@ -9,12 +9,6 @@ interface type TX86SimdFeatures = class sealed - strict private - type - TCpuIdResult = record - RegEAX, RegEBX, RegECX, RegEDX: UInt32; - end; - strict private class var FActiveSimdLevel: TX86SimdLevel; @@ -35,6 +29,11 @@ TCpuIdResult = record class function CPUHasVPCLMULQDQ(): Boolean; static; class function CPUHasAESNI(): Boolean; static; + // Clears all the "extra" CPU feature flags (SHA-NI, PCLMULQDQ, + // VPCLMULQDQ, AES-NI). Used by ApplyBuildOverrides to give every + // HASHLIB_FORCE_* branch a uniform "no accelerated extras" baseline. + class procedure DisableAllExtraFeatures(); static; + private class procedure ProbeHardwareAndCache(); static; class procedure ApplyBuildOverrides(); static; @@ -67,13 +66,18 @@ TCpuIdResult = record implementation +type + TCpuIdResult = record + RegEAX, RegEBX, RegECX, RegEDX: UInt32; + end; + {$IFDEF HASHLIB_X86_SIMD} -procedure CpuIdQuery(ALeaf, ASubLeaf: UInt32; AResult: Pointer); +procedure CpuIdQuery(ALeaf, ASubLeaf: UInt32; out AResult: TCpuIdResult); {$I ..\Include\Simd\CpuFeatures\CpuIdQuery.inc} end; -procedure XGetBvQuery(AResult: Pointer); +procedure XGetBvQuery(out AResult: UInt64); {$I ..\Include\Simd\CpuFeatures\XGetBvQuery.inc} end; @@ -88,7 +92,7 @@ class function TX86SimdFeatures.CPUHasSSE2(): Boolean; {$ENDIF} begin {$IFDEF HASHLIB_X86_SIMD} - CpuIdQuery(1, 0, @LCpuId); + CpuIdQuery(1, 0, LCpuId); Result := (LCpuId.RegEDX and (1 shl 26)) <> 0; {$ELSE} Result := False; @@ -102,7 +106,7 @@ class function TX86SimdFeatures.CPUHasSSE3(): Boolean; {$ENDIF} begin {$IFDEF HASHLIB_X86_SIMD} - CpuIdQuery(1, 0, @LCpuId); + CpuIdQuery(1, 0, LCpuId); // SSE3: ECX bit 0 Result := (LCpuId.RegECX and (1 shl 0)) <> 0; {$ELSE} @@ -117,7 +121,7 @@ class function TX86SimdFeatures.CPUHasSSSE3(): Boolean; {$ENDIF} begin {$IFDEF HASHLIB_X86_SIMD} - CpuIdQuery(1, 0, @LCpuId); + CpuIdQuery(1, 0, LCpuId); // SSSE3: ECX bit 9 Result := (LCpuId.RegECX and (1 shl 9)) <> 0; {$ELSE} @@ -132,7 +136,7 @@ class function TX86SimdFeatures.CPUHasSSE41(): Boolean; {$ENDIF} begin {$IFDEF HASHLIB_X86_SIMD} - CpuIdQuery(1, 0, @LCpuId); + CpuIdQuery(1, 0, LCpuId); // SSE4.1: ECX bit 19 Result := (LCpuId.RegECX and (1 shl 19)) <> 0; {$ELSE} @@ -147,7 +151,7 @@ class function TX86SimdFeatures.CPUHasSSE42(): Boolean; {$ENDIF} begin {$IFDEF HASHLIB_X86_SIMD} - CpuIdQuery(1, 0, @LCpuId); + CpuIdQuery(1, 0, LCpuId); // SSE4.2: ECX bit 20 Result := (LCpuId.RegECX and (1 shl 20)) <> 0; {$ELSE} @@ -163,7 +167,7 @@ class function TX86SimdFeatures.CPUHasAVX2(): Boolean; {$ENDIF} begin {$IFDEF HASHLIB_X86_SIMD} - CpuIdQuery(1, 0, @LCpuId); + CpuIdQuery(1, 0, LCpuId); // OSXSAVE: ECX bit 27 (required for OS AVX state saving) if (LCpuId.RegECX and (1 shl 27)) = 0 then @@ -171,11 +175,11 @@ class function TX86SimdFeatures.CPUHasAVX2(): Boolean; // XCR0 bits 1 and 2 must be set for AVX state support LXcr0 := 0; - XGetBvQuery(@LXcr0); + XGetBvQuery(LXcr0); if (UInt32(LXcr0) and $06) <> $06 then Exit(False); - CpuIdQuery(7, 0, @LCpuId); + CpuIdQuery(7, 0, LCpuId); // AVX2: EBX bit 5 Result := (LCpuId.RegEBX and (1 shl 5)) <> 0; {$ELSE} @@ -190,7 +194,7 @@ class function TX86SimdFeatures.CPUHasSHANI(): Boolean; {$ENDIF} begin {$IFDEF HASHLIB_X86_SIMD} - CpuIdQuery(7, 0, @LCpuId); + CpuIdQuery(7, 0, LCpuId); // SHA-NI: EBX bit 29 Result := (LCpuId.RegEBX and (1 shl 29)) <> 0; {$ELSE} @@ -205,7 +209,7 @@ class function TX86SimdFeatures.CPUHasPCLMULQDQ(): Boolean; {$ENDIF} begin {$IFDEF HASHLIB_X86_SIMD} - CpuIdQuery(1, 0, @LCpuId); + CpuIdQuery(1, 0, LCpuId); // PCLMULQDQ: ECX bit 1 Result := (LCpuId.RegECX and (1 shl 1)) <> 0; {$ELSE} @@ -220,7 +224,7 @@ class function TX86SimdFeatures.CPUHasVPCLMULQDQ(): Boolean; {$ENDIF} begin {$IFDEF HASHLIB_X86_SIMD} - CpuIdQuery(7, 0, @LCpuId); + CpuIdQuery(7, 0, LCpuId); // VPCLMULQDQ: ECX bit 10 Result := (LCpuId.RegECX and (1 shl 10)) <> 0; {$ELSE} @@ -235,7 +239,7 @@ class function TX86SimdFeatures.CPUHasAESNI(): Boolean; {$ENDIF} begin {$IFDEF HASHLIB_X86_SIMD} - CpuIdQuery(1, 0, @LCpuId); + CpuIdQuery(1, 0, LCpuId); // AES-NI: ECX bit 25 Result := (LCpuId.RegECX and (1 shl 25)) <> 0; {$ELSE} @@ -243,6 +247,14 @@ class function TX86SimdFeatures.CPUHasAESNI(): Boolean; {$ENDIF} end; +class procedure TX86SimdFeatures.DisableAllExtraFeatures(); +begin + FHasSHANI := False; + FHasPCLMULQDQ := False; + FHasVPCLMULQDQ := False; + FHasAESNI := False; +end; + class procedure TX86SimdFeatures.ProbeHardwareAndCache(); var LHasSSE2, LHasSSE3, LHasSSSE3, LHasSSE41, LHasSSE42, LHasAVX2: Boolean; @@ -282,45 +294,27 @@ class procedure TX86SimdFeatures.ApplyBuildOverrides(); begin {$IF DEFINED(HASHLIB_FORCE_SCALAR)} FActiveSimdLevel := TX86SimdLevel.Scalar; - FHasSHANI := False; - FHasPCLMULQDQ := False; - FHasVPCLMULQDQ := False; - FHasAESNI := False; + DisableAllExtraFeatures; {$ELSEIF DEFINED(HASHLIB_FORCE_SSE2)} if FActiveSimdLevel > TX86SimdLevel.SSE2 then FActiveSimdLevel := TX86SimdLevel.SSE2; - FHasSHANI := False; - FHasPCLMULQDQ := False; - FHasVPCLMULQDQ := False; - FHasAESNI := False; + DisableAllExtraFeatures; {$ELSEIF DEFINED(HASHLIB_FORCE_SSE3)} if FActiveSimdLevel > TX86SimdLevel.SSE3 then FActiveSimdLevel := TX86SimdLevel.SSE3; - FHasSHANI := False; - FHasPCLMULQDQ := False; - FHasVPCLMULQDQ := False; - FHasAESNI := False; + DisableAllExtraFeatures; {$ELSEIF DEFINED(HASHLIB_FORCE_SSSE3)} if FActiveSimdLevel > TX86SimdLevel.SSSE3 then FActiveSimdLevel := TX86SimdLevel.SSSE3; - FHasSHANI := False; - FHasPCLMULQDQ := False; - FHasVPCLMULQDQ := False; - FHasAESNI := False; + DisableAllExtraFeatures; {$ELSEIF DEFINED(HASHLIB_FORCE_SSE41)} if FActiveSimdLevel > TX86SimdLevel.SSE41 then FActiveSimdLevel := TX86SimdLevel.SSE41; - FHasSHANI := False; - FHasPCLMULQDQ := False; - FHasVPCLMULQDQ := False; - FHasAESNI := False; + DisableAllExtraFeatures; {$ELSEIF DEFINED(HASHLIB_FORCE_SSE42)} if FActiveSimdLevel > TX86SimdLevel.SSE42 then FActiveSimdLevel := TX86SimdLevel.SSE42; - FHasSHANI := False; - FHasPCLMULQDQ := False; - FHasVPCLMULQDQ := False; - FHasAESNI := False; + DisableAllExtraFeatures; {$IFEND} end;