
Commit 677f4af

Start on CUDA extension
1 parent d60855e commit 677f4af

9 files changed: +902, -28 lines

.buildkite/pipeline.yml

Lines changed: 4 additions & 4 deletions
@@ -15,12 +15,12 @@ steps:
       queue: "juliagpu"
       cuda: "*"
     if: build.message !~ /\[skip tests\]/
-    timeout_in_minutes: 30
+    timeout_in_minutes: 60
     matrix:
       setup:
         julia:
           - "1.10"
-          - "1.11"
+          - "1.12"
 
   - label: "Julia {{matrix.julia}} -- AMDGPU"
     plugins:
@@ -36,9 +36,9 @@ steps:
       rocm: "*"
       rocmgpu: "*"
     if: build.message !~ /\[skip tests\]/
-    timeout_in_minutes: 30
+    timeout_in_minutes: 60
     matrix:
       setup:
         julia:
           - "1.10"
-          - "1.11"
+          - "1.12"

Project.toml

Lines changed: 17 additions & 2 deletions
@@ -18,20 +18,30 @@ TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6"
 VectorInterface = "409d34a3-91d5-4945-b6ec-7529ddf182d8"
 
 [weakdeps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
+cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1"
+
+[sources]
+GPUArrays = {rev = "master", url = "https://github.com/JuliaGPU/GPUArrays.jl"}
+MatrixAlgebraKit = {rev = "main", url = "https://github.com/QuantumKitHub/MatrixAlgebraKit.jl"}
 
 [extensions]
+TensorKitCUDAExt = ["CUDA", "cuTENSOR"]
 TensorKitChainRulesCoreExt = "ChainRulesCore"
 TensorKitFiniteDifferencesExt = "FiniteDifferences"
 
 [compat]
+Adapt = "4"
 Aqua = "0.6, 0.7, 0.8"
 ArgParse = "1.2.0"
+CUDA = "5.9"
 ChainRulesCore = "1"
 ChainRulesTestUtils = "1"
 Combinatorics = "1"
 FiniteDifferences = "0.12"
+GPUArrays = "11.3.1"
 LRUCache = "1.0.2"
 LinearAlgebra = "1"
 MatrixAlgebraKit = "0.6.0"
@@ -48,21 +58,26 @@ TestExtras = "0.2,0.3"
 TupleTools = "1.1"
 VectorInterface = "0.4.8, 0.5"
 Zygote = "0.7"
+cuTENSOR = "2"
 julia = "1.10"
 
 [extras]
-ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
+Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
+ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a"
 Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
 FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
+GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
 TensorOperations = "6aa20fa7-93e2-5fca-9bc0-fbd0db3c71a2"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 TestExtras = "5ed8adda-3752-4e41-b88a-e8b09835ee3a"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
+cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1"
 
 [targets]
-test = ["ArgParse", "Aqua", "Combinatorics", "LinearAlgebra", "TensorOperations", "Test", "TestExtras", "SafeTestsets", "ChainRulesCore", "ChainRulesTestUtils", "FiniteDifferences", "Zygote"]
+test = ["ArgParse", "Adapt", "Aqua", "Combinatorics", "CUDA", "cuTENSOR", "GPUArrays", "LinearAlgebra", "SafeTestsets", "TensorOperations", "Test", "TestExtras", "ChainRulesCore", "ChainRulesTestUtils", "FiniteDifferences", "Zygote"]
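
Because CUDA and cuTENSOR are declared as weak dependencies, the new TensorKitCUDAExt extension is only compiled and loaded once both packages are present in the active environment. A rough sketch of what that looks like from the user side (assumes a CUDA-capable machine; `Base.get_extension` is the standard Julia ≥ 1.9 query, everything else is from this commit):

using TensorKit
using CUDA, cuTENSOR   # loading both weak dependencies triggers TensorKitCUDAExt

ext = Base.get_extension(TensorKit, :TensorKitCUDAExt)
ext === nothing && error("TensorKitCUDAExt did not load; check that CUDA and cuTENSOR are installed")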

TensorKitCUDAExt.jl

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+module TensorKitCUDAExt
+
+using CUDA, CUDA.CUBLAS, CUDA.CUSOLVER, LinearAlgebra
+using CUDA: @allowscalar
+using cuTENSOR: cuTENSOR
+import CUDA: rand as curand, rand! as curand!, randn as curandn, randn! as curandn!
+
+using TensorKit
+using TensorKit.Factorizations
+using TensorKit.Strided
+using TensorKit.Factorizations: AbstractAlgorithm
+using TensorKit: SectorDict, tensormaptype, scalar, similarstoragetype, AdjointTensorMap, scalartype, _project_symmetric_and_check
+import TensorKit: randisometry
+
+using TensorKit.MatrixAlgebraKit
+
+using Random
+
+include("cutensormap.jl")
+
+# TODO
+# add VectorInterface extensions for proper CUDA promotion
+function TensorKit.VectorInterface.promote_add(TA::Type{<:CUDA.StridedCuMatrix{Tx}}, TB::Type{<:CUDA.StridedCuMatrix{Ty}}, α::Tα = TensorKit.VectorInterface.One(), β::Tβ = TensorKit.VectorInterface.One()) where {Tx, Ty, Tα, Tβ}
+    return Base.promote_op(TensorKit.VectorInterface.add, Tx, Ty, Tα, Tβ)
+end
+
+end
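
For context on the promotion machinery this TODO is extending: VectorInterface's `promote_add` computes the scalar type produced by `β * y + α * x`, and the `Base.promote_rule` for `CuTensorMap` in cutensormap.jl below calls it on plain scalar types. A hedged illustration at the scalar level, assuming only the two-argument form that this commit itself uses:

using TensorKit
const VI = TensorKit.VectorInterface

VI.promote_add(Float32, Float64)      # Float64
VI.promote_add(Float64, ComplexF32)   # ComplexF64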

cutensormap.jl

Lines changed: 225 additions & 0 deletions
@@ -0,0 +1,225 @@
+const CuTensorMap{T, S, N₁, N₂} = TensorMap{T, S, N₁, N₂, CuVector{T, CUDA.DeviceMemory}}
+const CuTensor{T, S, N} = CuTensorMap{T, S, N, 0}
+
+const AdjointCuTensorMap{T, S, N₁, N₂} = AdjointTensorMap{T, S, N₁, N₂, CuTensorMap{T, S, N₁, N₂}}
+
+function CuTensorMap{T, S, N₁, N₂}(t::TensorMap{T, S, N₁, N₂, A}) where {T, S, N₁, N₂, A}
+    return CuTensorMap{T, S, N₁, N₂}(CuArray(t.data), t.space)
+end
+
+# project_symmetric! doesn't yet work for GPU types, so do this on the host, then copy
+function TensorKit._project_symmetric_and_check(::Type{T}, ::Type{A}, data::AbstractArray, V::TensorMapSpace; tol = sqrt(eps(real(float(eltype(data)))))) where {T, A <: CuVector{T}}
+    h_t = TensorKit.TensorMapWithStorage{T, Vector{T}}(undef, V)
+    h_t = TensorKit.project_symmetric!(h_t, Array(data))
+    # verify result
+    isapprox(Array(reshape(data, dims(h_t))), convert(Array, h_t); atol = tol) ||
+        throw(ArgumentError("Data has non-zero elements at incompatible positions"))
+    return TensorKit.TensorMapWithStorage{T, A}(A(h_t.data), V)
+end
+
+for (fname, felt) in ((:zeros, :zero), (:ones, :one))
+    @eval begin
+        function CUDA.$fname(
+                codomain::TensorSpace{S},
+                domain::TensorSpace{S} = one(codomain)
+            ) where {S <: IndexSpace}
+            return CUDA.$fname(codomain ← domain)
+        end
+        function CUDA.$fname(
+                ::Type{T}, codomain::TensorSpace{S},
+                domain::TensorSpace{S} = one(codomain)
+            ) where {T, S <: IndexSpace}
+            return CUDA.$fname(T, codomain ← domain)
+        end
+        CUDA.$fname(V::TensorMapSpace) = CUDA.$fname(Float64, V)
+        function CUDA.$fname(::Type{T}, V::TensorMapSpace) where {T}
+            t = CuTensorMap{T}(undef, V)
+            fill!(t, $felt(T))
+            return t
+        end
+    end
+end
+
+for randfun in (:curand, :curandn)
+    randfun! = Symbol(randfun, :!)
+    @eval begin
+        # converting `codomain` and `domain` into `HomSpace`
+        function $randfun(
+                codomain::TensorSpace{S},
+                domain::TensorSpace{S} = one(codomain),
+            ) where {S <: IndexSpace}
+            return $randfun(codomain ← domain)
+        end
+        function $randfun(
+                ::Type{T}, codomain::TensorSpace{S},
+                domain::TensorSpace{S} = one(codomain),
+            ) where {T, S <: IndexSpace}
+            return $randfun(T, codomain ← domain)
+        end
+        function $randfun(
+                rng::Random.AbstractRNG, ::Type{T},
+                codomain::TensorSpace{S},
+                domain::TensorSpace{S} = one(codomain),
+            ) where {T, S <: IndexSpace}
+            return $randfun(rng, T, codomain ← domain)
+        end
+
+        # filling in default eltype
+        $randfun(V::TensorMapSpace) = $randfun(Float64, V)
+        function $randfun(rng::Random.AbstractRNG, V::TensorMapSpace)
+            return $randfun(rng, Float64, V)
+        end
+
+        # filling in default rng
+        function $randfun(::Type{T}, V::TensorMapSpace) where {T}
+            return $randfun(Random.default_rng(), T, V)
+        end
+
+        # implementation
+        function $randfun(
+                rng::Random.AbstractRNG, ::Type{T},
+                V::TensorMapSpace
+            ) where {T}
+            t = CuTensorMap{T}(undef, V)
+            $randfun!(rng, t)
+            return t
+        end
+    end
+end
+
+for randfun in (:rand, :randn, :randisometry)
+    randfun! = Symbol(randfun, :!)
+    @eval begin
+        # converting `codomain` and `domain` into `HomSpace`
+        function $randfun(
+                ::Type{A}, codomain::TensorSpace{S},
+                domain::TensorSpace{S}
+            ) where {A <: CuArray, S <: IndexSpace}
+            return $randfun(A, codomain ← domain)
+        end
+        function $randfun(
+                ::Type{T}, ::Type{A}, codomain::TensorSpace{S},
+                domain::TensorSpace{S}
+            ) where {T, S <: IndexSpace, A <: CuArray{T}}
+            return $randfun(T, A, codomain ← domain)
+        end
+        function $randfun(
+                rng::Random.AbstractRNG, ::Type{T}, ::Type{A},
+                codomain::TensorSpace{S},
+                domain::TensorSpace{S}
+            ) where {T, S <: IndexSpace, A <: CuArray{T}}
+            return $randfun(rng, T, A, codomain ← domain)
+        end
+
+        # accepting single `TensorSpace`
+        $randfun(::Type{A}, codomain::TensorSpace) where {A <: CuArray} = $randfun(A, codomain ← one(codomain))
+        function $randfun(::Type{T}, ::Type{A}, codomain::TensorSpace) where {T, A <: CuArray{T}}
+            return $randfun(T, A, codomain ← one(codomain))
+        end
+        function $randfun(
+                rng::Random.AbstractRNG, ::Type{T},
+                ::Type{A}, codomain::TensorSpace
+            ) where {T, A <: CuArray{T}}
+            return $randfun(rng, T, A, codomain ← one(codomain))
+        end
+
+        # filling in default eltype
+        $randfun(::Type{A}, V::TensorMapSpace) where {A <: CuArray} = $randfun(eltype(A), A, V)
+        function $randfun(rng::Random.AbstractRNG, ::Type{A}, V::TensorMapSpace) where {A <: CuArray}
+            return $randfun(rng, eltype(A), A, V)
+        end
+
+        # filling in default rng
+        function $randfun(::Type{T}, ::Type{A}, V::TensorMapSpace) where {T, A <: CuArray{T}}
+            return $randfun(Random.default_rng(), T, A, V)
+        end
+
+        # implementation
+        function $randfun(
+                rng::Random.AbstractRNG, ::Type{T},
+                ::Type{A}, V::TensorMapSpace
+            ) where {T, A <: CuArray{T}}
+            t = CuTensorMap{T}(undef, V)
+            $randfun!(rng, t)
+            return t
+        end
+    end
+end
+
+function Base.convert(::Type{CuTensorMap}, t::AbstractTensorMap)
+    return copy!(CuTensorMap{scalartype(t)}(undef, space(t)), t)
+end
+
+# Scalar implementation
+#-----------------------
+function TensorKit.scalar(t::CuTensorMap)
+    # TODO: should scalar only work if N₁ == N₂ == 0?
+    return @allowscalar dim(codomain(t)) == dim(domain(t)) == 1 ?
+        first(blocks(t))[2][1, 1] : throw(DimensionMismatch())
+end
+
+TensorKit.scalartype(A::StridedCuArray{T}) where {T} = T
+TensorKit.scalartype(::Type{<:CuTensorMap{T}}) where {T} = T
+TensorKit.scalartype(::Type{<:CuArray{T}}) where {T} = T
+
+function Base.convert(
+        TT::Type{CuTensorMap{T, S, N₁, N₂}},
+        t::AbstractTensorMap{<:Any, S, N₁, N₂}
+    ) where {T, S, N₁, N₂}
+    if typeof(t) === TT
+        return t
+    else
+        tnew = TT(undef, space(t))
+        return copy!(tnew, t)
+    end
+end
+
+function LinearAlgebra.isposdef(t::CuTensorMap)
+    domain(t) == codomain(t) ||
+        throw(SpaceMismatch("`isposdef` requires domain and codomain to be the same"))
+    InnerProductStyle(spacetype(t)) === EuclideanInnerProduct() || return false
+    for (c, b) in blocks(t)
+        # do our own hermitian check
+        isherm = TensorKit.MatrixAlgebraKit.ishermitian(b; atol = eps(real(eltype(b))), rtol = eps(real(eltype(b))))
+        isherm || return false
+        isposdef(Hermitian(b)) || return false
+    end
+    return true
+end
+
+function Base.promote_rule(
+        ::Type{<:TT₁},
+        ::Type{<:TT₂}
+    ) where {
+        S, N₁, N₂, TTT₁, TTT₂,
+        TT₁ <: CuTensorMap{TTT₁, S, N₁, N₂},
+        TT₂ <: CuTensorMap{TTT₂, S, N₁, N₂},
+    }
+    T = TensorKit.VectorInterface.promote_add(TTT₁, TTT₂)
+    return CuTensorMap{T, S, N₁, N₂}
+end
+
+# CuTensorMap exponentiation:
+function TensorKit.exp!(t::CuTensorMap)
+    domain(t) == codomain(t) ||
+        error("Exponential of a tensor only exists when domain == codomain.")
+    for (c, b) in blocks(t)
+        copy!(b, parent(Base.exp(Hermitian(b))))
+    end
+    return t
+end
+
+# functions that don't map ℝ to (a subset of) ℝ
+for f in (:sqrt, :log, :asin, :acos, :acosh, :atanh, :acoth)
+    sf = string(f)
+    @eval function Base.$f(t::CuTensorMap)
+        domain(t) == codomain(t) ||
+            throw(SpaceMismatch("`$($sf)` of a tensor only exists when domain == codomain"))
+        T = complex(float(scalartype(t)))
+        tf = similar(t, T)
+        for (c, b) in blocks(t)
+            copy!(block(tf, c), parent($f(Hermitian(b))))
+        end
+        return tf
+    end
+end
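
Taken together, these methods let GPU-backed tensor maps be created directly on TensorKit spaces and converted from host tensors. A minimal usage sketch based only on the methods added above (requires a functional CUDA device; the spaces and values are arbitrary, and `CuTensorMap` is reached through the extension module since it is not exported):

using TensorKit, CUDA, cuTENSOR
using LinearAlgebra

V = ℂ^2 ⊗ ℂ^3

t = CUDA.randn(Float32, V, ℂ^2)      # sampled on the device via the curandn methods above
z = CUDA.zeros(ComplexF64, V, ℂ^2)   # zero-filled CuTensorMap

tc = randn(ComplexF64, V, ℂ^2)       # ordinary host TensorMap from TensorKit itself
ext = Base.get_extension(TensorKit, :TensorKitCUDAExt)
tg = convert(ext.CuTensorMap, tc)    # copy to device storage via the convert method above

norm(tg)                             # exercises the GPU-friendly norm path in linalg.jl below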

src/tensors/diagonal.jl

Lines changed: 1 addition & 1 deletion
@@ -273,7 +273,7 @@ function LinearAlgebra.mul!(
         dC::DiagonalTensorMap, dA::DiagonalTensorMap, dB::DiagonalTensorMap, α::Number, β::Number
     )
     dC.domain == dA.domain == dB.domain || throw(SpaceMismatch())
-    mul!(Diagonal(dC.data), Diagonal(dA.data), Diagonal(dB.data), α, β)
+    @. dC.data = (α * dA.data * dB.data) + β * dC.data
     return dC
 end
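
The rewrite replaces the `Diagonal`-wrapped `mul!` with a plain element-wise broadcast over the stored diagonal data, which GPU arrays handle natively without scalar indexing. A quick host-side check of the equivalence, with plain `Vector`s standing in for `DiagonalTensorMap` data (toy values):

using LinearAlgebra

a, b = [1.0, 2.0, 3.0], [4.0, 5.0, 6.0]
c1 = [0.5, 0.5, 0.5]; c2 = copy(c1)
α, β = 2.0, 3.0

mul!(Diagonal(c1), Diagonal(a), Diagonal(b), α, β)   # previous formulation
@. c2 = (α * a * b) + β * c2                         # new broadcast formulation
c1 ≈ c2                                              # true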

src/tensors/linalg.jl

Lines changed: 5 additions & 5 deletions
@@ -272,7 +272,7 @@ function _norm(blockiter, p::Real, init::Real)
         end
     elseif p == 2
         n² = mapreduce(+, blockiter; init = init) do (c, b)
-            return isempty(b) ? init : oftype(init, dim(c) * LinearAlgebra.norm2(b)^2)
+            return isempty(b) ? init : oftype(init, dim(c) * LinearAlgebra.norm(b, 2)^2)
         end
         return sqrt(n²)
     elseif p == 1
@@ -281,7 +281,7 @@ function _norm(blockiter, p::Real, init::Real)
         end
     elseif p > 0
         nᵖ = mapreduce(+, blockiter; init = init) do (c, b)
-            return isempty(b) ? init : oftype(init, dim(c) * LinearAlgebra.normp(b, p)^p)
+            return isempty(b) ? init : oftype(init, dim(c) * LinearAlgebra.norm(b, p)^p)
         end
        return (nᵖ)^inv(oftype(nᵖ, p))
     else
@@ -299,7 +299,7 @@ function LinearAlgebra.rank(
     r = 0 * dim(first(allunits(sectortype(t))))
     dim(t) == 0 && return r
     S = LinearAlgebra.svdvals(t)
-    tol = max(atol, rtol * maximum(first, values(S)))
+    tol = max(atol, rtol * mapreduce(maximum, max, values(S)))
     for (c, b) in pairs(S)
         if !isempty(b)
             r += dim(c) * count(>(tol), b)
@@ -317,8 +317,8 @@ function LinearAlgebra.cond(t::AbstractTensorMap, p::Real = 2)
             return zero(real(float(scalartype(t))))
         end
         S = LinearAlgebra.svdvals(t)
-        maxS = maximum(first, values(S))
-        minS = minimum(last, values(S))
+        maxS = mapreduce(maximum, max, values(S))
+        minS = mapreduce(minimum, min, values(S))
         return iszero(maxS) ? oftype(maxS, Inf) : (maxS / minS)
     else
         throw(ArgumentError("cond currently only defined for p=2"))
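
The old `maximum(first, values(S))` / `minimum(last, values(S))` forms relied on each block of singular values being sorted and on scalar indexing, which GPU arrays disallow by default; the `mapreduce` forms reduce over whole blocks instead. A CPU-side illustration with a plain `Dict` standing in for the `SectorDict` returned by `svdvals` (toy values):

S = Dict(:a => [3.0, 1.0, 0.5], :b => [2.0, 0.1])

maximum(first, values(S))           # 3.0 -- assumes sorted blocks, scalar indexing
mapreduce(maximum, max, values(S))  # 3.0 -- full reduction per block, GPU-friendly
mapreduce(minimum, min, values(S))  # 0.1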
