diff --git a/Package.swift b/Package.swift index 38c2376..9f784bf 100644 --- a/Package.swift +++ b/Package.swift @@ -18,12 +18,19 @@ let package = Package( .target( name: "pocketFFT" ), + .target( + name: "CLAPACKHelper", + dependencies: [ + .product(name: "CLAPACK", package: "CLAPACK"), + ], + publicHeadersPath: "include" + ), .target( name: "Matft", dependencies: [ .product(name: "Collections", package: "swift-collections"), "pocketFFT", - .product(name: "CLAPACK", package: "CLAPACK", condition: .when(platforms: [.wasi])), + .target(name: "CLAPACKHelper", condition: .when(platforms: [.wasi])), ]), .testTarget( name: "MatftTests", diff --git a/Sources/CLAPACKHelper/clapack_helper.c b/Sources/CLAPACKHelper/clapack_helper.c new file mode 100644 index 0000000..48a44e9 --- /dev/null +++ b/Sources/CLAPACKHelper/clapack_helper.c @@ -0,0 +1,48 @@ +// clapack_helper.c +// Wrapper implementations that call CLAPACK functions with correct void signatures. + +#include "include/clapack_helper.h" + +// Declare the actual CLAPACK functions with correct void return type +// These are the real signatures from the f2c-generated code +extern void dgetrf_( + clapack_int *m, + clapack_int *n, + double *a, + clapack_int *lda, + clapack_int *ipiv, + clapack_int *info +); + +extern void dgetri_( + clapack_int *n, + double *a, + clapack_int *lda, + clapack_int *ipiv, + double *work, + clapack_int *lwork, + clapack_int *info +); + +void clapack_dgetrf_wrapper( + clapack_int *m, + clapack_int *n, + double *a, + clapack_int *lda, + clapack_int *ipiv, + clapack_int *info +) { + dgetrf_(m, n, a, lda, ipiv, info); +} + +void clapack_dgetri_wrapper( + clapack_int *n, + double *a, + clapack_int *lda, + clapack_int *ipiv, + double *work, + clapack_int *lwork, + clapack_int *info +) { + dgetri_(n, a, lda, ipiv, work, lwork, info); +} diff --git a/Sources/CLAPACKHelper/include/clapack_helper.h b/Sources/CLAPACKHelper/include/clapack_helper.h new file mode 100644 index 0000000..2eca68c --- /dev/null +++ b/Sources/CLAPACKHelper/include/clapack_helper.h @@ -0,0 +1,32 @@ +// clapack_helper.h +// This header provides correct function signatures for CLAPACK functions. +// The CLAPACK header incorrectly declares these as returning int, +// but the actual f2c-generated implementation returns void. + +#ifndef CLAPACK_HELPER_H +#define CLAPACK_HELPER_H + +// On wasm32, long is 32-bit +typedef long clapack_int; + +// Wrapper functions that call CLAPACK with correct void return type +void clapack_dgetrf_wrapper( + clapack_int *m, + clapack_int *n, + double *a, + clapack_int *lda, + clapack_int *ipiv, + clapack_int *info +); + +void clapack_dgetri_wrapper( + clapack_int *n, + double *a, + clapack_int *lda, + clapack_int *ipiv, + double *work, + clapack_int *lwork, + clapack_int *info +); + +#endif // CLAPACK_HELPER_H diff --git a/Sources/Matft/library/lapack.swift b/Sources/Matft/library/lapack.swift index 18c13fd..1f8c5bb 100644 --- a/Sources/Matft/library/lapack.swift +++ b/Sources/Matft/library/lapack.swift @@ -7,6 +7,525 @@ // import Foundation + +// MARK: - Pure Swift Eigenvalue Implementation +// This implementation is used on WASI where CLAPACK doesn't include dgeev. +// It's also available on other platforms for testing purposes. + +/// Computes the Euclidean norm of a vector segment using SIMD for performance +@usableFromInline +internal func _eigenVectorNorm(_ v: [Double], start: Int, count: Int) -> Double { + if count == 0 { return 0.0 } + + return v.withUnsafeBufferPointer { buffer in + let basePtr = buffer.baseAddress! + start + var sum = SIMD4.zero + + let simdCount = count / 4 + let remainder = count % 4 + + // Process 4 elements at a time with SIMD + for i in 0..( + basePtr[i * 4], + basePtr[i * 4 + 1], + basePtr[i * 4 + 2], + basePtr[i * 4 + 3] + ) + sum = sum.addingProduct(vec, vec) + } + + // Sum SIMD lanes and handle remainder + var scalarSum = sum[0] + sum[1] + sum[2] + sum[3] + let tailStart = simdCount * 4 + for i in 0..= 0 ? 1.0 : -1.0 + x[0] += sign * normX + + let normV = _eigenVectorNorm(x, start: 0, count: xCount) + if normV < 1e-14 { continue } + + let invNormV = 1.0 / normV + for i in 0..= 0 { + let sqrtDisc = sqrt(disc) + let eig1 = (trace + sqrtDisc) / 2.0 + let eig2 = (trace - sqrtDisc) / 2.0 + shift = abs(eig1 - a22) < abs(eig2 - a22) ? eig1 : eig2 + } else { + shift = trace / 2.0 + } + + // Apply shift and perform QR step using Givens rotations + for i in lo.. lo { + x = h[ii * n + (i - 1)] + y = h[jj * n + (i - 1)] + } + + let r = sqrt(x * x + y * y) + if r < 1e-14 { continue } + + let c = x / r + let s = y / r + let negS = -s + + // Apply Givens rotation from the left using SIMD + let colStart = (i > lo) ? (i - 1) : i + let iiBase = ii * n + let jjBase = jj * n + + h.withUnsafeMutableBufferPointer { buf in + let ptr = buf.baseAddress! + + // SIMD loop - process 4 columns at a time + var j = colStart + let simdEnd = colStart + ((n - colStart) / 4) * 4 + let cVec = SIMD4(repeating: c) + let sVec = SIMD4(repeating: s) + let negSVec = SIMD4(repeating: negS) + + while j < simdEnd { + let t1 = SIMD4(ptr[iiBase + j], ptr[iiBase + j + 1], ptr[iiBase + j + 2], ptr[iiBase + j + 3]) + let t2 = SIMD4(ptr[jjBase + j], ptr[jjBase + j + 1], ptr[jjBase + j + 2], ptr[jjBase + j + 3]) + + let r1 = cVec * t1 + sVec * t2 + let r2 = negSVec * t1 + cVec * t2 + + ptr[iiBase + j] = r1[0]; ptr[iiBase + j + 1] = r1[1] + ptr[iiBase + j + 2] = r1[2]; ptr[iiBase + j + 3] = r1[3] + ptr[jjBase + j] = r2[0]; ptr[jjBase + j + 1] = r2[1] + ptr[jjBase + j + 2] = r2[2]; ptr[jjBase + j + 3] = r2[3] + j += 4 + } + + // Handle remainder + while j < n { + let temp1 = ptr[iiBase + j] + let temp2 = ptr[jjBase + j] + ptr[iiBase + j] = c * temp1 + s * temp2 + ptr[jjBase + j] = negS * temp1 + c * temp2 + j += 1 + } + } + + // Apply Givens rotation from the right (small loop, no SIMD benefit) + let rowEnd = min(jj + 2, hi + 1) + for j in lo..(repeating: c) + let sVec = SIMD4(repeating: s) + let negSVec = SIMD4(repeating: negS) + + while j < simdEnd { + let idx1_0 = j * n + ii + let idx1_1 = (j + 1) * n + ii + let idx1_2 = (j + 2) * n + ii + let idx1_3 = (j + 3) * n + ii + let idx2_0 = j * n + jj + let idx2_1 = (j + 1) * n + jj + let idx2_2 = (j + 2) * n + jj + let idx2_3 = (j + 3) * n + jj + + let t1 = SIMD4(ptr[idx1_0], ptr[idx1_1], ptr[idx1_2], ptr[idx1_3]) + let t2 = SIMD4(ptr[idx2_0], ptr[idx2_1], ptr[idx2_2], ptr[idx2_3]) + + let r1 = cVec * t1 + sVec * t2 + let r2 = negSVec * t1 + cVec * t2 + + ptr[idx1_0] = r1[0]; ptr[idx1_1] = r1[1] + ptr[idx1_2] = r1[2]; ptr[idx1_3] = r1[3] + ptr[idx2_0] = r2[0]; ptr[idx2_1] = r2[1] + ptr[idx2_2] = r2[2]; ptr[idx2_3] = r2[3] + j += 4 + } + + // Handle remainder + while j < n { + let temp1 = ptr[j * n + ii] + let temp2 = ptr[j * n + jj] + ptr[j * n + ii] = c * temp1 + s * temp2 + ptr[j * n + jj] = negS * temp1 + c * temp2 + j += 1 + } + } + } +} + +/// Extracts eigenvalues from quasi-triangular (Schur) form +@usableFromInline +internal func _eigenExtractEigenvalues(_ t: [Double], _ n: Int, _ wr: inout [Double], _ wi: inout [Double]) { + var i = 0 + while i < n { + if i == n - 1 || abs(t[(i + 1) * n + i]) < 1e-10 * (abs(t[i * n + i]) + abs(t[(i + 1) * n + (i + 1)])) { + // Real eigenvalue + wr[i] = t[i * n + i] + wi[i] = 0.0 + i += 1 + } else { + // Complex conjugate pair from 2x2 block + let a11 = t[i * n + i] + let a12 = t[i * n + (i + 1)] + let a21 = t[(i + 1) * n + i] + let a22 = t[(i + 1) * n + (i + 1)] + + let trace = a11 + a22 + let det = a11 * a22 - a12 * a21 + let disc = trace * trace - 4.0 * det + + if disc < 0 { + wr[i] = trace / 2.0 + wr[i + 1] = trace / 2.0 + wi[i] = sqrt(-disc) / 2.0 + wi[i + 1] = -sqrt(-disc) / 2.0 + } else { + let sqrtDisc = sqrt(disc) + wr[i] = (trace + sqrtDisc) / 2.0 + wr[i + 1] = (trace - sqrtDisc) / 2.0 + wi[i] = 0.0 + wi[i + 1] = 0.0 + } + i += 2 + } + } +} + +/// Computes eigenvectors from Schur form using back-substitution +@usableFromInline +internal func _eigenComputeEigenvectors(_ t: [Double], _ z: [Double], _ n: Int, _ wr: [Double], _ wi: [Double], _ vr: inout [Double]) { + // Work array for triangular solve + var work = [Double](repeating: 0.0, count: n) + + var i = n - 1 + while i >= 0 { + if wi[i] == 0.0 { + // Real eigenvalue - solve (T - lambda*I) * x = 0 + let lambda = wr[i] + work[i] = 1.0 + + for j in stride(from: i - 1, through: 0, by: -1) { + var sum = 0.0 + for k in (j + 1)...i { + sum += t[j * n + k] * work[k] + } + let diag = t[j * n + j] - lambda + if abs(diag) > 1e-14 { + work[j] = -sum / diag + } else { + work[j] = -sum / 1e-14 + } + } + + // Normalize + var norm = 0.0 + for j in 0...i { + norm += work[j] * work[j] + } + norm = sqrt(norm) + if norm > 1e-14 { + for j in 0...i { + work[j] /= norm + } + } + + // Transform back: v = Z * work + for j in 0.. 0 && wi[i] < 0 { + // Complex conjugate pair - handle together + // For complex eigenvalues, we compute real and imaginary parts + // of the eigenvector separately + var xr = [Double](repeating: 0.0, count: n) + var xi = [Double](repeating: 0.0, count: n) + + xr[i] = 1.0 + xi[i] = 0.0 + xr[i - 1] = 0.0 + xi[i - 1] = 1.0 + + // Normalize + let norm = sqrt(xr[i] * xr[i] + xi[i] * xi[i] + xr[i-1] * xr[i-1] + xi[i-1] * xi[i-1]) + if norm > 1e-14 { + xr[i] /= norm + xi[i] /= norm + xr[i - 1] /= norm + xi[i - 1] /= norm + } + + // Transform back + for j in 0.. 1e-14 { + for k in 0.. Int32 { + if n == 0 { return 0 } + if n == 1 { + wr[0] = a[0] + wi[0] = 0.0 + if computeLeft { vl[0] = 1.0 } + if computeRight { vr[0] = 1.0 } + return 0 + } + + // Make a copy for Hessenberg reduction + var h = a + var z = [Double](repeating: 0.0, count: n * n) + + // Step 1: Reduce to upper Hessenberg form + _eigenHessenbergReduction(&h, n, &z) + + // Step 2: QR iteration to reach quasi-triangular (Schur) form + let maxIterations = 30 * n + var iter = 0 + var hi = n - 1 + + while hi > 0 && iter < maxIterations { + // Find the lowest subdiagonal element that is effectively zero + var lo = hi + while lo > 0 { + let threshold = 1e-14 * (abs(h[(lo - 1) * n + (lo - 1)]) + abs(h[lo * n + lo])) + if abs(h[lo * n + (lo - 1)]) < max(threshold, 1e-300) { + h[lo * n + (lo - 1)] = 0.0 + break + } + lo -= 1 + } + + if lo == hi { + // Single eigenvalue converged + hi -= 1 + } else if lo == hi - 1 { + // 2x2 block converged + hi -= 2 + } else { + // Perform QR step + _eigenQRStep(&h, n, lo, hi, &z) + iter += 1 + } + } + + if iter >= maxIterations { + return 1 // Failed to converge + } + + // Step 3: Extract eigenvalues from Schur form + _eigenExtractEigenvalues(h, n, &wr, &wi) + + // Step 4: Compute eigenvectors + if computeRight { + _eigenComputeEigenvectors(h, z, n, wr, wi, &vr) + } + + if computeLeft { + // Left eigenvectors: solve A^T * vl = lambda * vl + // For simplicity, we transpose and use the same algorithm + var at = [Double](repeating: 0.0, count: n * n) + for i in 0..(_ mfarray: MfArray, _ full_matrices: // MARK: - WASI Implementation using CLAPACK // Note: The CLAPACK eigen-support branch only provides dgetrf and dgetri (double-precision). // Other LAPACK operations will throw fatalError on WASI. -import CLAPACK +import CLAPACKHelper public typealias __CLPK_integer = Int32 @@ -733,33 +1252,12 @@ internal typealias lapack_LU = (UnsafeMutablePointer<__CLPK_integer>, UnsafeM internal typealias lapack_inv = (UnsafeMutablePointer<__CLPK_integer>, UnsafeMutablePointer, UnsafeMutablePointer<__CLPK_integer>, UnsafeMutablePointer<__CLPK_integer>, UnsafeMutablePointer, UnsafeMutablePointer<__CLPK_integer>, UnsafeMutablePointer<__CLPK_integer>) -> Int32 -// MARK: - CLAPACK Function Declarations with Correct Signatures -// The CLAPACK header incorrectly declares these as returning int, but the implementation returns void. -// We use @_silgen_name to declare them with the correct void return type to avoid ABI mismatch. - -@_silgen_name("dgetrf_") -private func clapack_dgetrf_( - _ m: UnsafeMutablePointer, - _ n: UnsafeMutablePointer, - _ a: UnsafeMutablePointer, - _ lda: UnsafeMutablePointer, - _ ipiv: UnsafeMutablePointer, - _ info: UnsafeMutablePointer -) -> Void - -@_silgen_name("dgetri_") -private func clapack_dgetri_( - _ n: UnsafeMutablePointer, - _ a: UnsafeMutablePointer, - _ lda: UnsafeMutablePointer, - _ ipiv: UnsafeMutablePointer, - _ work: UnsafeMutablePointer, - _ lwork: UnsafeMutablePointer, - _ info: UnsafeMutablePointer -) -> Void - // MARK: - CLAPACK Wrapper Functions for WASI -// Only dgetrf_ and dgetri_ are available in the CLAPACK eigen-support branch +// dgeev_ is implemented in pure Swift (swiftEigenDecomposition) above +// Note: CLAPACK uses "integer" = long int, which on wasm32 is 32-bit (CLong) +// Note: The CLAPACK header declares functions as returning int, but the f2c-generated +// implementation returns void. This causes linker warnings but doesn't affect correctness +// since we don't rely on the return value from the C function (we use info parameter instead). @inline(__always) internal func sgesv_(_ n: UnsafeMutablePointer<__CLPK_integer>, _ nrhs: UnsafeMutablePointer<__CLPK_integer>, _ a: UnsafeMutablePointer, _ lda: UnsafeMutablePointer<__CLPK_integer>, _ ipiv: UnsafeMutablePointer<__CLPK_integer>, _ b: UnsafeMutablePointer, _ ldb: UnsafeMutablePointer<__CLPK_integer>, _ info: UnsafeMutablePointer<__CLPK_integer>) -> Int32 { @@ -778,19 +1276,19 @@ internal func sgetrf_(_ m: UnsafeMutablePointer<__CLPK_integer>, _ n: UnsafeMuta @inline(__always) internal func dgetrf_(_ m: UnsafeMutablePointer<__CLPK_integer>, _ n: UnsafeMutablePointer<__CLPK_integer>, _ a: UnsafeMutablePointer, _ lda: UnsafeMutablePointer<__CLPK_integer>, _ ipiv: UnsafeMutablePointer<__CLPK_integer>, _ info: UnsafeMutablePointer<__CLPK_integer>) -> Int32 { - var mLong = CLong(m.pointee) - var nLong = CLong(n.pointee) - var ldaLong = CLong(lda.pointee) - var infoLong: CLong = 0 + // Call CLAPACK via helper wrapper that uses correct void return type + var mLong = clapack_int(m.pointee) + var nLong = clapack_int(n.pointee) + var ldaLong = clapack_int(lda.pointee) + var infoLong: clapack_int = 0 let minMN = min(Int(m.pointee), Int(n.pointee)) - var ipivLong = Array(repeating: 0, count: minMN) - // Use correctly-typed function to avoid ABI mismatch (CLAPACK returns void, not int) - clapack_dgetrf_(&mLong, &nLong, a, &ldaLong, &ipivLong, &infoLong) - - for i in 0.., _ a: UnsafeMuta @inline(__always) internal func dgetri_(_ n: UnsafeMutablePointer<__CLPK_integer>, _ a: UnsafeMutablePointer, _ lda: UnsafeMutablePointer<__CLPK_integer>, _ ipiv: UnsafeMutablePointer<__CLPK_integer>, _ work: UnsafeMutablePointer, _ lwork: UnsafeMutablePointer<__CLPK_integer>, _ info: UnsafeMutablePointer<__CLPK_integer>) -> Int32 { - var nLong = CLong(n.pointee) - var ldaLong = CLong(lda.pointee) - var lworkLong = CLong(lwork.pointee) - var infoLong: CLong = 0 - var ipivLong = Array(repeating: 0, count: Int(n.pointee)) - for i in 0.., _ jobvr: UnsafeMutable @inline(__always) internal func dgeev_(_ jobvl: UnsafeMutablePointer, _ jobvr: UnsafeMutablePointer, _ n: UnsafeMutablePointer<__CLPK_integer>, _ a: UnsafeMutablePointer, _ lda: UnsafeMutablePointer<__CLPK_integer>, _ wr: UnsafeMutablePointer, _ wi: UnsafeMutablePointer, _ vl: UnsafeMutablePointer, _ ldvl: UnsafeMutablePointer<__CLPK_integer>, _ vr: UnsafeMutablePointer, _ ldvr: UnsafeMutablePointer<__CLPK_integer>, _ work: UnsafeMutablePointer, _ lwork: UnsafeMutablePointer<__CLPK_integer>, _ info: UnsafeMutablePointer<__CLPK_integer>) -> Int32 { - fatalError("LAPACK dgeev_ is not available on WASI (double-precision eigenvalue decomposition not supported)") + let size = Int(n.pointee) + + // Handle workspace query + if lwork.pointee == -1 { + // Return optimal workspace size (not really used in pure Swift implementation) + work.pointee = Double(4 * size) + info.pointee = 0 + return 0 + } + + // Check job flags + let computeLeft = (jobvl.pointee == Int8(UInt8(ascii: "V"))) + let computeRight = (jobvr.pointee == Int8(UInt8(ascii: "V"))) + + // Copy input matrix (dgeev destroys the input) + var aCopy = [Double](repeating: 0.0, count: size * size) + for i in 0..<(size * size) { + aCopy[i] = a[i] + } + + // Prepare output arrays + var wrArr = [Double](repeating: 0.0, count: size) + var wiArr = [Double](repeating: 0.0, count: size) + var vlArr = [Double](repeating: 0.0, count: size * size) + var vrArr = [Double](repeating: 0.0, count: size * size) + + // Call pure Swift implementation + let result = swiftEigenDecomposition(size, &aCopy, &wrArr, &wiArr, &vlArr, &vrArr, + computeLeft: computeLeft, computeRight: computeRight) + + // Copy results back + for i in 0..