From 30b42392b1ae21cb5474d042f2cda61c6cf4db8f Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Wed, 15 Oct 2025 18:19:19 -0400 Subject: [PATCH 1/6] Add tests for image and null space for GPU --- src/implementations/truncation.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/implementations/truncation.jl b/src/implementations/truncation.jl index 883c7759..ad78f7fe 100644 --- a/src/implementations/truncation.jl +++ b/src/implementations/truncation.jl @@ -17,7 +17,10 @@ function truncate(::typeof(left_null!), (U, S), strategy::TruncationStrategy) # TODO: avoid allocation? extended_S = vcat(diagview(S), zeros(eltype(S), max(0, size(S, 1) - size(S, 2)))) ind = findtruncated(extended_S, strategy) - return U[:, ind], ind + trunc_cols = collect(1:size(U, 2))[ind] + Utrunc = similar(U, (size(U, 1), length(trunc_cols))) + Utrunc .= U[:, trunc_cols] + return Utrunc, ind end function truncate(::typeof(right_null!), (S, Vᴴ), strategy::TruncationStrategy) # TODO: avoid allocation? From 8f449d4daeec432160a2654a6be25c6c71ce431d Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Thu, 16 Oct 2025 14:58:02 +0200 Subject: [PATCH 2/6] Only use the scalar method for AMDGPU --- src/implementations/truncation.jl | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/implementations/truncation.jl b/src/implementations/truncation.jl index ad78f7fe..883c7759 100644 --- a/src/implementations/truncation.jl +++ b/src/implementations/truncation.jl @@ -17,10 +17,7 @@ function truncate(::typeof(left_null!), (U, S), strategy::TruncationStrategy) # TODO: avoid allocation? extended_S = vcat(diagview(S), zeros(eltype(S), max(0, size(S, 1) - size(S, 2)))) ind = findtruncated(extended_S, strategy) - trunc_cols = collect(1:size(U, 2))[ind] - Utrunc = similar(U, (size(U, 1), length(trunc_cols))) - Utrunc .= U[:, trunc_cols] - return Utrunc, ind + return U[:, ind], ind end function truncate(::typeof(right_null!), (S, Vᴴ), strategy::TruncationStrategy) # TODO: avoid allocation? From f2822ca8049b991b31d7403a8d6dd2d4254e909a Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Tue, 26 Aug 2025 04:28:39 -0400 Subject: [PATCH 3/6] Updates for TensorKit compatibility --- Project.toml | 2 ++ .../MatrixAlgebraKitAMDGPUExt.jl | 2 ++ .../MatrixAlgebraKitCUDAExt.jl | 31 +++++++++++++++---- ext/MatrixAlgebraKitCUDAExt/yacusolver.jl | 7 +++-- src/implementations/eig.jl | 5 +-- src/implementations/eigh.jl | 1 + src/implementations/svd.jl | 2 +- test/amd/orthnull.jl | 2 +- test/amd/polar.jl | 3 +- test/cuda/polar.jl | 2 -- test/cuda/projections.jl | 4 +-- 11 files changed, 43 insertions(+), 18 deletions(-) diff --git a/Project.toml b/Project.toml index 934c0ceb..d7044a33 100644 --- a/Project.toml +++ b/Project.toml @@ -38,8 +38,10 @@ Zygote = "0.7" julia = "1.10" [extras] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b" SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" diff --git a/ext/MatrixAlgebraKitAMDGPUExt/MatrixAlgebraKitAMDGPUExt.jl b/ext/MatrixAlgebraKitAMDGPUExt/MatrixAlgebraKitAMDGPUExt.jl index 04bee40e..f1f82b32 100644 --- a/ext/MatrixAlgebraKitAMDGPUExt/MatrixAlgebraKitAMDGPUExt.jl +++ b/ext/MatrixAlgebraKitAMDGPUExt/MatrixAlgebraKitAMDGPUExt.jl @@ -28,6 +28,8 @@ function MatrixAlgebraKit.default_eigh_algorithm(::Type{T}; kwargs...) where {T return ROCSOLVER_DivideAndConquer(; kwargs...) end +MatrixAlgebraKit.ishermitian_exact(A::StridedROCMatrix) = ishermitian(A) + _gpu_geqrf!(A::StridedROCMatrix) = YArocSOLVER.geqrf!(A) _gpu_ungqr!(A::StridedROCMatrix, τ::StridedROCVector) = YArocSOLVER.ungqr!(A, τ) _gpu_unmqr!(side::AbstractChar, trans::AbstractChar, A::StridedROCMatrix, τ::StridedROCVector, C::StridedROCVecOrMat) = diff --git a/ext/MatrixAlgebraKitCUDAExt/MatrixAlgebraKitCUDAExt.jl b/ext/MatrixAlgebraKitCUDAExt/MatrixAlgebraKitCUDAExt.jl index 68d25c36..aa155b73 100644 --- a/ext/MatrixAlgebraKitCUDAExt/MatrixAlgebraKitCUDAExt.jl +++ b/ext/MatrixAlgebraKitCUDAExt/MatrixAlgebraKitCUDAExt.jl @@ -8,30 +8,49 @@ using MatrixAlgebraKit: LQViaTransposedQR, TruncationByValue, AbstractAlgorithm using MatrixAlgebraKit: default_qr_algorithm, default_lq_algorithm, default_svd_algorithm, default_eig_algorithm, default_eigh_algorithm import MatrixAlgebraKit: _gpu_geqrf!, _gpu_ungqr!, _gpu_unmqr!, _gpu_gesvd!, _gpu_Xgesvdp!, _gpu_Xgesvdr!, _gpu_gesvdj!, _gpu_geev! import MatrixAlgebraKit: _gpu_heevj!, _gpu_heevd! -using CUDA +using CUDA, CUDA.CUBLAS using CUDA: i32 using LinearAlgebra using LinearAlgebra: BlasFloat +using CUDA: i32 + include("yacusolver.jl") -function MatrixAlgebraKit.default_qr_algorithm(::Type{T}; kwargs...) where {T <: StridedCuMatrix} +function MatrixAlgebraKit.default_qr_algorithm(::Type{T}; kwargs...) where {TT <: BlasFloat, T <: StridedCuMatrix{TT}} return CUSOLVER_HouseholderQR(; kwargs...) end -function MatrixAlgebraKit.default_lq_algorithm(::Type{T}; kwargs...) where {T <: StridedCuMatrix} +function MatrixAlgebraKit.default_lq_algorithm(::Type{T}; kwargs...) where {TT <: BlasFloat, T <: StridedCuMatrix{TT}} qr_alg = CUSOLVER_HouseholderQR(; kwargs...) return LQViaTransposedQR(qr_alg) end -function MatrixAlgebraKit.default_svd_algorithm(::Type{T}; kwargs...) where {T <: StridedCuMatrix} +function MatrixAlgebraKit.default_svd_algorithm(::Type{T}; kwargs...) where {TT <: BlasFloat, T <: StridedCuMatrix{TT}} return CUSOLVER_QRIteration(; kwargs...) end -function MatrixAlgebraKit.default_eig_algorithm(::Type{T}; kwargs...) where {T <: StridedCuMatrix} +function MatrixAlgebraKit.default_eig_algorithm(::Type{T}; kwargs...) where {TT <: BlasFloat, T <: StridedCuMatrix{TT}} return CUSOLVER_Simple(; kwargs...) end -function MatrixAlgebraKit.default_eigh_algorithm(::Type{T}; kwargs...) where {T <: StridedCuMatrix} +function MatrixAlgebraKit.default_eigh_algorithm(::Type{T}; kwargs...) where {TT <: BlasFloat, T <: StridedCuMatrix{TT}} return CUSOLVER_DivideAndConquer(; kwargs...) end +# include for block sector support +function MatrixAlgebraKit.default_qr_algorithm(::Type{Base.ReshapedArray{T, 2, SubArray{T, 1, A, Tuple{UnitRange{Int}}, true}, Tuple{}}}; kwargs...) where {T <: BlasFloat, A <: CuVecOrMat{T}} + return CUSOLVER_HouseholderQR(; kwargs...) +end +function MatrixAlgebraKit.default_lq_algorithm(::Type{Base.ReshapedArray{T, 2, SubArray{T, 1, A, Tuple{UnitRange{Int}}, true}, Tuple{}}}; kwargs...) where {T <: BlasFloat, A <: CuVecOrMat{T}} + qr_alg = CUSOLVER_HouseholderQR(; kwargs...) + return LQViaTransposedQR(qr_alg) +end +function MatrixAlgebraKit.default_svd_algorithm(::Type{Base.ReshapedArray{T, 2, SubArray{T, 1, A, Tuple{UnitRange{Int}}, true}, Tuple{}}}; kwargs...) where {T <: BlasFloat, A <: CuVecOrMat{T}} + return CUSOLVER_Jacobi(; kwargs...) +end +function MatrixAlgebraKit.default_eig_algorithm(::Type{Base.ReshapedArray{T, 2, SubArray{T, 1, A, Tuple{UnitRange{Int}}, true}, Tuple{}}}; kwargs...) where {T <: BlasFloat, A <: CuVecOrMat{T}} + return CUSOLVER_Simple(; kwargs...) +end +function MatrixAlgebraKit.default_eigh_algorithm(::Type{Base.ReshapedArray{T, 2, SubArray{T, 1, A, Tuple{UnitRange{Int}}, true}, Tuple{}}}; kwargs...) where {T <: BlasFloat, A <: CuVecOrMat{T}} + return CUSOLVER_DivideAndConquer(; kwargs...) +end _gpu_geev!(A::StridedCuMatrix, D::StridedCuVector, V::StridedCuMatrix) = YACUSOLVER.Xgeev!(A, D, V) diff --git a/ext/MatrixAlgebraKitCUDAExt/yacusolver.jl b/ext/MatrixAlgebraKitCUDAExt/yacusolver.jl index 62f0f224..aba01fe6 100644 --- a/ext/MatrixAlgebraKitCUDAExt/yacusolver.jl +++ b/ext/MatrixAlgebraKitCUDAExt/yacusolver.jl @@ -30,7 +30,7 @@ for (bname, fname, elty, relty) in ) chkstride1(A, U, Vᴴ, S) m, n = size(A) - (m < n) && throw(ArgumentError("CUSOLVER's gesvd requires m ≥ n")) + (m < n) && throw(ArgumentError(lazy"CUSOLVER's gesvd requires m ($m) ≥ n ($n)")) minmn = min(m, n) if length(U) == 0 jobu = 'N' @@ -191,14 +191,17 @@ for (bname, fname, elty, relty) in (:cusolverDnZgesvdj_bufferSize, :cusolverDnZgesvdj, :ComplexF64, :Float64), ) @eval begin + #! format: off function gesvdj!( A::StridedCuMatrix{$elty}, S::StridedCuVector{$relty} = similar(A, $relty, min(size(A)...)), U::StridedCuMatrix{$elty} = similar(A, $elty, size(A, 1), min(size(A)...)), Vᴴ::StridedCuMatrix{$elty} = similar(A, $elty, min(size(A)...), size(A, 2)); tol::$relty = eps($relty), - max_sweeps::Int = 100 + max_sweeps::Int = 100, + kwargs... ) + #! format: on chkstride1(A, U, Vᴴ, S) m, n = size(A) minmn = min(m, n) diff --git a/src/implementations/eig.jl b/src/implementations/eig.jl index d491ee94..f76f2b41 100644 --- a/src/implementations/eig.jl +++ b/src/implementations/eig.jl @@ -149,8 +149,9 @@ function eig_vals!(A::AbstractMatrix, D, alg::GPU_EigAlgorithm) check_input(eig_vals!, A, D, alg) V = similar(A, complex(eltype(A)), (size(A, 1), 0)) if alg isa GPU_Simple - isempty(alg.kwargs) || - throw(ArgumentError("LAPACK_Simple (geev) does not accept any keyword arguments")) + # TODO filter out nothing kwargs + #isempty(alg.kwargs) || + # throw(ArgumentError("GPU_Simple (geev) does not accept any keyword arguments")) _gpu_geev!(A, D, V) end return D diff --git a/src/implementations/eigh.jl b/src/implementations/eigh.jl index 0cfa6db0..13d0b9d3 100644 --- a/src/implementations/eigh.jl +++ b/src/implementations/eigh.jl @@ -50,6 +50,7 @@ function check_input(::typeof(eigh_full!), A::AbstractMatrix, DV, alg::DiagonalA @check_scalar(V, A) return nothing end + function check_input(::typeof(eigh_vals!), A::AbstractMatrix, D, alg::DiagonalAlgorithm) check_hermitian(A, alg) @assert isdiag(A) diff --git a/src/implementations/svd.jl b/src/implementations/svd.jl index fed36cd1..ba54f233 100644 --- a/src/implementations/svd.jl +++ b/src/implementations/svd.jl @@ -418,7 +418,7 @@ end _argmaxabs(x) = reduce(_largest, x; init = zero(eltype(x))) _largest(x, y) = abs(x) < abs(y) ? y : x -function MatrixAlgebraKit.svd_vals!(A::AbstractMatrix, S, alg::GPU_SVDAlgorithm) +function svd_vals!(A::AbstractMatrix, S, alg::GPU_SVDAlgorithm) check_input(svd_vals!, A, S, alg) U, Vᴴ = similar(A, (0, 0)), similar(A, (0, 0)) if alg isa GPU_QRIteration diff --git a/test/amd/orthnull.jl b/test/amd/orthnull.jl index 6ed44228..3223979c 100644 --- a/test/amd/orthnull.jl +++ b/test/amd/orthnull.jl @@ -25,7 +25,7 @@ eltypes = (Float32, Float64, ComplexF32, ComplexF64) @test N isa ROCMatrix{T} && size(N) == (m, m - minmn) @test V * C ≈ A @test isisometric(V) - @test norm(A' * N) ≈ 0 atol = MatrixAlgebraKit.defaulttol(T) + @test LinearAlgebra.norm(A' * N) ≈ 0 atol = MatrixAlgebraKit.defaulttol(T) @test isisometric(N) hV = collect(V) hN = collect(N) diff --git a/test/amd/polar.jl b/test/amd/polar.jl index 066c0f78..cab16b24 100644 --- a/test/amd/polar.jl +++ b/test/amd/polar.jl @@ -13,7 +13,6 @@ using AMDGPU k = min(m, n) svd_algs = (ROCSOLVER_QRIteration(), ROCSOLVER_Jacobi()) @testset "algorithm $svd_alg" for svd_alg in svd_algs - n < m && svd_alg isa ROCSOLVER_QRIteration && continue A = ROCArray(randn(rng, T, m, n)) alg = PolarViaSVD(svd_alg) W, P = left_polar(A; alg) @@ -23,6 +22,7 @@ using AMDGPU @test isisometric(W) # work around extremely strict Julia criteria for Hermiticity @test ishermitian(P; rtol = MatrixAlgebraKit.defaulttol(P)) && isposdef(Hermitian(P)) + @test isposdef(P) Ac = similar(A) W2, P2 = @constinferred left_polar!(copy!(Ac, A), (W, P), alg) @@ -52,7 +52,6 @@ end k = min(m, n) svd_algs = (ROCSOLVER_QRIteration(), ROCSOLVER_Jacobi()) @testset "algorithm $svd_alg" for svd_alg in svd_algs - n > m && svd_alg isa ROCSOLVER_QRIteration && continue A = ROCArray(randn(rng, T, m, n)) alg = PolarViaSVD(svd_alg) P, Wᴴ = right_polar(A; alg) diff --git a/test/cuda/polar.jl b/test/cuda/polar.jl index a7a76f27..1f512367 100644 --- a/test/cuda/polar.jl +++ b/test/cuda/polar.jl @@ -13,7 +13,6 @@ using CUDA k = min(m, n) svd_algs = (CUSOLVER_QRIteration(), CUSOLVER_Jacobi()) @testset "algorithm $svd_alg" for svd_alg in svd_algs - n < m && svd_alg isa CUSOLVER_QRIteration && continue A = CuArray(randn(rng, T, m, n)) alg = PolarViaSVD(svd_alg) W, P = left_polar(A; alg) @@ -52,7 +51,6 @@ end k = min(m, n) svd_algs = (CUSOLVER_QRIteration(), CUSOLVER_Jacobi()) @testset "algorithm $svd_alg" for svd_alg in svd_algs - n > m && svd_alg isa CUSOLVER_QRIteration && continue A = CuArray(randn(rng, T, m, n)) alg = PolarViaSVD(svd_alg) P, Wᴴ = right_polar(A; alg) diff --git a/test/cuda/projections.jl b/test/cuda/projections.jl index d9f7bd7f..d3b92e08 100644 --- a/test/cuda/projections.jl +++ b/test/cuda/projections.jl @@ -12,7 +12,7 @@ const BLASFloats = (Float32, Float64, ComplexF32, ComplexF64) m = 54 noisefactor = eps(real(T))^(3 / 4) for alg in (NativeBlocked(blocksize = 16), NativeBlocked(blocksize = 32), NativeBlocked(blocksize = 64)) - A = CuArray(randn(rng, T, m, m)) + A = CuArray(randn(rng, T, m, m)) Ah = (A + A') / 2 Aa = (A - A') / 2 Ac = copy(A) @@ -69,7 +69,7 @@ end # test that W is closer to A then any other isometry for k in 1:10 δA = CuArray(randn(rng, T, m, n)) - W = project_isometric(A, alg) + W = project_isometric(A, alg) W2 = project_isometric(A + δA / 100, alg) @test norm(A - W2) > norm(A - W) end From fb3abd04f4eb54d03ad136efebffbe004a344e39 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Mon, 10 Nov 2025 05:29:10 -0500 Subject: [PATCH 4/6] Fix AMDGPU duplication --- ext/MatrixAlgebraKitAMDGPUExt/MatrixAlgebraKitAMDGPUExt.jl | 2 -- 1 file changed, 2 deletions(-) diff --git a/ext/MatrixAlgebraKitAMDGPUExt/MatrixAlgebraKitAMDGPUExt.jl b/ext/MatrixAlgebraKitAMDGPUExt/MatrixAlgebraKitAMDGPUExt.jl index f1f82b32..04bee40e 100644 --- a/ext/MatrixAlgebraKitAMDGPUExt/MatrixAlgebraKitAMDGPUExt.jl +++ b/ext/MatrixAlgebraKitAMDGPUExt/MatrixAlgebraKitAMDGPUExt.jl @@ -28,8 +28,6 @@ function MatrixAlgebraKit.default_eigh_algorithm(::Type{T}; kwargs...) where {T return ROCSOLVER_DivideAndConquer(; kwargs...) end -MatrixAlgebraKit.ishermitian_exact(A::StridedROCMatrix) = ishermitian(A) - _gpu_geqrf!(A::StridedROCMatrix) = YArocSOLVER.geqrf!(A) _gpu_ungqr!(A::StridedROCMatrix, τ::StridedROCVector) = YArocSOLVER.ungqr!(A, τ) _gpu_unmqr!(side::AbstractChar, trans::AbstractChar, A::StridedROCMatrix, τ::StridedROCVector, C::StridedROCVecOrMat) = From 44d3cc88f283f263b14667e402288fb212e2606c Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Mon, 10 Nov 2025 07:47:52 -0500 Subject: [PATCH 5/6] Fix AMD polar test --- test/amd/polar.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/test/amd/polar.jl b/test/amd/polar.jl index cab16b24..4040b674 100644 --- a/test/amd/polar.jl +++ b/test/amd/polar.jl @@ -22,7 +22,6 @@ using AMDGPU @test isisometric(W) # work around extremely strict Julia criteria for Hermiticity @test ishermitian(P; rtol = MatrixAlgebraKit.defaulttol(P)) && isposdef(Hermitian(P)) - @test isposdef(P) Ac = similar(A) W2, P2 = @constinferred left_polar!(copy!(Ac, A), (W, P), alg) From ffb0119881ef6ccf763eca516d941c1e7eba7143 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Mon, 10 Nov 2025 13:13:24 -0500 Subject: [PATCH 6/6] Comments --- ext/MatrixAlgebraKitCUDAExt/MatrixAlgebraKitCUDAExt.jl | 2 -- src/implementations/eig.jl | 7 +++---- test/cuda/projections.jl | 4 ++-- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/ext/MatrixAlgebraKitCUDAExt/MatrixAlgebraKitCUDAExt.jl b/ext/MatrixAlgebraKitCUDAExt/MatrixAlgebraKitCUDAExt.jl index aa155b73..e4c77453 100644 --- a/ext/MatrixAlgebraKitCUDAExt/MatrixAlgebraKitCUDAExt.jl +++ b/ext/MatrixAlgebraKitCUDAExt/MatrixAlgebraKitCUDAExt.jl @@ -13,8 +13,6 @@ using CUDA: i32 using LinearAlgebra using LinearAlgebra: BlasFloat -using CUDA: i32 - include("yacusolver.jl") function MatrixAlgebraKit.default_qr_algorithm(::Type{T}; kwargs...) where {TT <: BlasFloat, T <: StridedCuMatrix{TT}} diff --git a/src/implementations/eig.jl b/src/implementations/eig.jl index f76f2b41..8532c29a 100644 --- a/src/implementations/eig.jl +++ b/src/implementations/eig.jl @@ -137,7 +137,7 @@ function eig_full!(A::AbstractMatrix, DV, alg::GPU_EigAlgorithm) D, V = DV if alg isa GPU_Simple isempty(alg.kwargs) || - throw(ArgumentError("GPU_Simple (geev) does not accept any keyword arguments")) + @warn "GPU_Simple (geev) does not accept any keyword arguments" _gpu_geev!(A, D.diag, V) end # TODO: make this controllable using a `gaugefix` keyword argument @@ -149,9 +149,8 @@ function eig_vals!(A::AbstractMatrix, D, alg::GPU_EigAlgorithm) check_input(eig_vals!, A, D, alg) V = similar(A, complex(eltype(A)), (size(A, 1), 0)) if alg isa GPU_Simple - # TODO filter out nothing kwargs - #isempty(alg.kwargs) || - # throw(ArgumentError("GPU_Simple (geev) does not accept any keyword arguments")) + isempty(alg.kwargs) || + @warn "GPU_Simple (geev) does not accept any keyword arguments" _gpu_geev!(A, D, V) end return D diff --git a/test/cuda/projections.jl b/test/cuda/projections.jl index d3b92e08..d9f7bd7f 100644 --- a/test/cuda/projections.jl +++ b/test/cuda/projections.jl @@ -12,7 +12,7 @@ const BLASFloats = (Float32, Float64, ComplexF32, ComplexF64) m = 54 noisefactor = eps(real(T))^(3 / 4) for alg in (NativeBlocked(blocksize = 16), NativeBlocked(blocksize = 32), NativeBlocked(blocksize = 64)) - A = CuArray(randn(rng, T, m, m)) + A = CuArray(randn(rng, T, m, m)) Ah = (A + A') / 2 Aa = (A - A') / 2 Ac = copy(A) @@ -69,7 +69,7 @@ end # test that W is closer to A then any other isometry for k in 1:10 δA = CuArray(randn(rng, T, m, n)) - W = project_isometric(A, alg) + W = project_isometric(A, alg) W2 = project_isometric(A + δA / 100, alg) @test norm(A - W2) > norm(A - W) end