From 3d1ba4e018085a1d8719ce655990482e31f3df41 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Tue, 7 Oct 2025 10:45:42 -0400 Subject: [PATCH 01/19] Start on CUDA extension --- .buildkite/pipeline.yml | 8 +- Project.toml | 15 +- ext/TensorKitCUDAExt/TensorKitCUDAExt.jl | 21 + ext/TensorKitCUDAExt/cutensormap.jl | 161 ++++++ src/tensors/diagonal.jl | 10 +- src/tensors/linalg.jl | 23 +- src/tensors/tensor.jl | 73 +-- test/cuda/tensors.jl | 604 +++++++++++++++++++++++ test/runtests.jl | 8 +- 9 files changed, 860 insertions(+), 63 deletions(-) create mode 100644 ext/TensorKitCUDAExt/TensorKitCUDAExt.jl create mode 100644 ext/TensorKitCUDAExt/cutensormap.jl create mode 100644 test/cuda/tensors.jl diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 6c414a31b..2b7905fc4 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -15,12 +15,12 @@ steps: queue: "juliagpu" cuda: "*" if: build.message !~ /\[skip tests\]/ - timeout_in_minutes: 30 + timeout_in_minutes: 60 matrix: setup: julia: - "1.10" - - "1.11" + - "1.12" - label: "Julia {{matrix.julia}} -- AMDGPU" plugins: @@ -36,9 +36,9 @@ steps: rocm: "*" rocmgpu: "*" if: build.message !~ /\[skip tests\]/ - timeout_in_minutes: 30 + timeout_in_minutes: 60 matrix: setup: julia: - "1.10" - - "1.11" + - "1.12" diff --git a/Project.toml b/Project.toml index 6c22acc60..c0c05603f 100644 --- a/Project.toml +++ b/Project.toml @@ -18,20 +18,26 @@ TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6" VectorInterface = "409d34a3-91d5-4945-b6ec-7529ddf182d8" [weakdeps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" +cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1" [extensions] +TensorKitCUDAExt = ["CUDA", "cuTENSOR"] TensorKitChainRulesCoreExt = "ChainRulesCore" TensorKitFiniteDifferencesExt = "FiniteDifferences" [compat] +Adapt = "4" Aqua = "0.6, 0.7, 0.8" ArgParse = "1.2.0" +CUDA = "5.9" ChainRulesCore = "1" ChainRulesTestUtils = "1" Combinatorics = "1" FiniteDifferences = "0.12" +GPUArrays = "11.3.1" LRUCache = "1.0.2" LinearAlgebra = "1" MatrixAlgebraKit = "0.6.0" @@ -48,21 +54,26 @@ TestExtras = "0.2,0.3" TupleTools = "1.1" VectorInterface = "0.4.8, 0.5" Zygote = "0.7" +cuTENSOR = "2" julia = "1.10" [extras] -ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" +ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a" Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" +GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" TensorOperations = "6aa20fa7-93e2-5fca-9bc0-fbd0db3c71a2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TestExtras = "5ed8adda-3752-4e41-b88a-e8b09835ee3a" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" +cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1" [targets] -test = ["ArgParse", "Aqua", "Combinatorics", "LinearAlgebra", "TensorOperations", "Test", "TestExtras", "SafeTestsets", "ChainRulesCore", "ChainRulesTestUtils", "FiniteDifferences", "Zygote"] +test = ["ArgParse", "Adapt", "Aqua", "Combinatorics", "CUDA", "cuTENSOR", "GPUArrays", "LinearAlgebra", "SafeTestsets", 
"TensorOperations", "Test", "TestExtras", "ChainRulesCore", "ChainRulesTestUtils", "FiniteDifferences", "Zygote"] diff --git a/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl new file mode 100644 index 000000000..0562acf61 --- /dev/null +++ b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl @@ -0,0 +1,21 @@ +module TensorKitCUDAExt + +using CUDA, CUDA.CUBLAS, CUDA.CUSOLVER, LinearAlgebra +using CUDA: @allowscalar +using cuTENSOR: cuTENSOR +import CUDA: rand as curand, rand! as curand!, randn as curandn, randn! as curandn! + +using TensorKit +using TensorKit.Factorizations +using TensorKit.Strided +using TensorKit.Factorizations: AbstractAlgorithm +using TensorKit: SectorDict, tensormaptype, scalar, similarstoragetype, AdjointTensorMap, scalartype, project_symmetric_and_check +import TensorKit: randisometry, rand, randn + +using TensorKit.MatrixAlgebraKit + +using Random + +include("cutensormap.jl") + +end diff --git a/ext/TensorKitCUDAExt/cutensormap.jl b/ext/TensorKitCUDAExt/cutensormap.jl new file mode 100644 index 000000000..5ae602cfa --- /dev/null +++ b/ext/TensorKitCUDAExt/cutensormap.jl @@ -0,0 +1,161 @@ +const CuTensorMap{T, S, N₁, N₂} = TensorMap{T, S, N₁, N₂, CuVector{T, CUDA.DeviceMemory}} +const CuTensor{T, S, N} = CuTensorMap{T, S, N, 0} + +const AdjointCuTensorMap{T, S, N₁, N₂} = AdjointTensorMap{T, S, N₁, N₂, CuTensorMap{T, S, N₁, N₂}} + +function CuTensorMap(t::TensorMap{T, S, N₁, N₂, A}) where {T, S, N₁, N₂, A} + return CuTensorMap{T, S, N₁, N₂}(CuArray{T}(t.data), space(t)) +end + +function Base.collect(t::CuTensorMap{T}) where {T} + return convert(TensorKit.TensorMapWithStorage{T, Vector{T}}, t) +end + +# project_symmetric! doesn't yet work for GPU types, so do this on the host, then copy +function TensorKit.project_symmetric_and_check(::Type{T}, ::Type{A}, data::AbstractArray, V::TensorMapSpace; tol = sqrt(eps(real(float(eltype(data)))))) where {T, A <: CuVector{T}} + h_t = TensorKit.TensorMapWithStorage{T, Vector{T}}(undef, V) + h_t = TensorKit.project_symmetric!(h_t, Array(data)) + # verify result + isapprox(Array(reshape(data, dims(h_t))), convert(Array, h_t); atol = tol) || + throw(ArgumentError("Data has non-zero elements at incompatible positions")) + return TensorKit.TensorMapWithStorage{T, A}(A(h_t.data), V) +end + +for (fname, felt) in ((:zeros, :zero), (:ones, :one)) + @eval begin + function CUDA.$fname( + codomain::TensorSpace{S}, + domain::TensorSpace{S} = one(codomain) + ) where {S <: IndexSpace} + return CUDA.$fname(codomain ← domain) + end + function CUDA.$fname( + ::Type{T}, codomain::TensorSpace{S}, + domain::TensorSpace{S} = one(codomain) + ) where {T, S <: IndexSpace} + return CUDA.$fname(T, codomain ← domain) + end + CUDA.$fname(V::TensorMapSpace) = CUDA.$fname(Float64, V) + function CUDA.$fname(::Type{T}, V::TensorMapSpace) where {T} + t = CuTensorMap{T}(undef, V) + fill!(t, $felt(T)) + return t + end + end +end + +for randfun in (:curand, :curandn) + randfun! = Symbol(randfun, :!) 
+ @eval begin + # converting `codomain` and `domain` into `HomSpace` + function $randfun( + codomain::TensorSpace{S}, + domain::TensorSpace{S} = one(codomain), + ) where {S <: IndexSpace} + return $randfun(codomain ← domain) + end + function $randfun( + ::Type{T}, codomain::TensorSpace{S}, + domain::TensorSpace{S} = one(codomain), + ) where {T, S <: IndexSpace} + return $randfun(T, codomain ← domain) + end + function $randfun( + rng::Random.AbstractRNG, ::Type{T}, + codomain::TensorSpace{S}, + domain::TensorSpace{S} = one(codomain), + ) where {T, S <: IndexSpace} + return $randfun(rng, T, codomain ← domain) + end + + # filling in default eltype + $randfun(V::TensorMapSpace) = $randfun(Float64, V) + function $randfun(rng::Random.AbstractRNG, V::TensorMapSpace) + return $randfun(rng, Float64, V) + end + + # filling in default rng + function $randfun(::Type{T}, V::TensorMapSpace) where {T} + return $randfun(Random.default_rng(), T, V) + end + + # implementation + function $randfun( + rng::Random.AbstractRNG, ::Type{T}, + V::TensorMapSpace + ) where {T} + t = CuTensorMap{T}(undef, V) + $randfun!(rng, t) + return t + end + end +end + +# Scalar implementation +#----------------------- +function TensorKit.scalar(t::CuTensorMap{T, S, 0, 0}) where {T, S} + inds = findall(!iszero, t.data) + return isempty(inds) ? zero(scalartype(t)) : @allowscalar @inbounds t.data[only(inds)] +end + +function Base.convert( + TT::Type{CuTensorMap{T, S, N₁, N₂}}, + t::AbstractTensorMap{<:Any, S, N₁, N₂} + ) where {T, S, N₁, N₂} + if typeof(t) === TT + return t + else + tnew = TT(undef, space(t)) + return copy!(tnew, t) + end +end + +function LinearAlgebra.isposdef(t::CuTensorMap) + domain(t) == codomain(t) || + throw(SpaceMismatch("`isposdef` requires domain and codomain to be the same")) + InnerProductStyle(spacetype(t)) === EuclideanInnerProduct() || return false + for (c, b) in blocks(t) + # do our own hermitian check + isherm = TensorKit.MatrixAlgebraKit.ishermitian(b; atol = eps(real(eltype(b))), rtol = eps(real(eltype(b)))) + isherm || return false + isposdef(Hermitian(b)) || return false + end + return true +end + +function Base.promote_rule( + ::Type{<:TT₁}, + ::Type{<:TT₂} + ) where { + S, N₁, N₂, TTT₁, TTT₂, + TT₁ <: CuTensorMap{TTT₁, S, N₁, N₂}, + TT₂ <: CuTensorMap{TTT₂, S, N₁, N₂}, + } + T = TensorKit.VectorInterface.promote_add(TTT₁, TTT₂) + return CuTensorMap{T, S, N₁, N₂} +end + +# CuTensorMap exponentation: +function TensorKit.exp!(t::CuTensorMap) + domain(t) == codomain(t) || + error("Exponential of a tensor only exist when domain == codomain.") + for (c, b) in blocks(t) + copy!(b, parent(Base.exp(Hermitian(b)))) + end + return t +end + +# functions that don't map ℝ to (a subset of) ℝ +for f in (:sqrt, :log, :asin, :acos, :acosh, :atanh, :acoth) + sf = string(f) + @eval function Base.$f(t::CuTensorMap) + domain(t) == codomain(t) || + throw(SpaceMismatch("`$($sf)` of a tensor only exist when domain == codomain")) + T = complex(float(scalartype(t))) + tf = similar(t, T) + for (c, b) in blocks(t) + copy!(block(tf, c), parent($f(Hermitian(b)))) + end + return tf + end +end diff --git a/src/tensors/diagonal.jl b/src/tensors/diagonal.jl index e35c2ffdd..356f5dbf4 100644 --- a/src/tensors/diagonal.jl +++ b/src/tensors/diagonal.jl @@ -78,12 +78,10 @@ function DiagonalTensorMap(t::AbstractTensorMap{T, S, 1, 1}) where {T, S} return d end -Base.similar(d::DiagonalTensorMap) = similar_diagonal(d) -Base.similar(d::DiagonalTensorMap, ::Type{T}) where {T} = similar_diagonal(d, T) - 
-similar_diagonal(d::DiagonalTensorMap) = DiagonalTensorMap(similar(d.data), d.domain) -similar_diagonal(d::DiagonalTensorMap, ::Type{T}) where {T <: Number} = - DiagonalTensorMap(similar(d.data, T), d.domain) +Base.similar(d::DiagonalTensorMap) = DiagonalTensorMap(similar(d.data), d.domain) +function Base.similar(d::DiagonalTensorMap, ::Type{T}) where {T <: Number} + return DiagonalTensorMap(similar(d.data, T), d.domain) +end # TODO: more constructors needed? diff --git a/src/tensors/linalg.jl b/src/tensors/linalg.jl index 44e7be002..caae1d488 100644 --- a/src/tensors/linalg.jl +++ b/src/tensors/linalg.jl @@ -270,20 +270,11 @@ function _norm(blockiter, p::Real, init::Real) return mapreduce(max, blockiter; init = init) do (c, b) return isempty(b) ? init : oftype(init, LinearAlgebra.normInf(b)) end - elseif p == 2 - n² = mapreduce(+, blockiter; init = init) do (c, b) - return isempty(b) ? init : oftype(init, dim(c) * LinearAlgebra.norm2(b)^2) + elseif p > 0 # finite positive p + np = sum(blockiter; init) do (c, b) + return oftype(init, dim(c) * norm(b, p)^p) end - return sqrt(n²) - elseif p == 1 - return mapreduce(+, blockiter; init = init) do (c, b) - return isempty(b) ? init : oftype(init, dim(c) * sum(abs, b)) - end - elseif p > 0 - nᵖ = mapreduce(+, blockiter; init = init) do (c, b) - return isempty(b) ? init : oftype(init, dim(c) * LinearAlgebra.normp(b, p)^p) - end - return (nᵖ)^inv(oftype(nᵖ, p)) + return np^(inv(oftype(np, p))) else msg = "Norm with non-positive p is not defined for `AbstractTensorMap`" throw(ArgumentError(msg)) @@ -299,7 +290,7 @@ function LinearAlgebra.rank( r = 0 * dim(first(allunits(sectortype(t)))) dim(t) == 0 && return r S = LinearAlgebra.svdvals(t) - tol = max(atol, rtol * maximum(first, values(S))) + tol = max(atol, rtol * maximum(parent(S))) for (c, b) in pairs(S) if !isempty(b) r += dim(c) * count(>(tol), b) @@ -317,8 +308,8 @@ function LinearAlgebra.cond(t::AbstractTensorMap, p::Real = 2) return zero(real(float(scalartype(t)))) end S = LinearAlgebra.svdvals(t) - maxS = maximum(first, values(S)) - minS = minimum(last, values(S)) + maxS = maximum(parent(S)) + minS = minimum(parent(S)) return iszero(maxS) ? oftype(maxS, Inf) : (maxS / minS) else throw(ArgumentError("cond currently only defined for p=2")) diff --git a/src/tensors/tensor.jl b/src/tensors/tensor.jl index 452072a87..c6a441a12 100644 --- a/src/tensors/tensor.jl +++ b/src/tensors/tensor.jl @@ -198,6 +198,15 @@ TensorMap(data::AbstractArray, codom::TensorSpace, dom::TensorSpace; kwargs...) Tensor(data::AbstractArray, codom::TensorSpace; kwargs...) = TensorMap(data, codom ← one(codom); kwargs...) 
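+# Factored out of the `TensorMapWithStorage` constructor below so that package
+# extensions can overload the projection per storage type; the CUDA extension
+# provides a `CuVector` method that projects on the host and copies back.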
+function project_symmetric_and_check(::Type{T}, ::Type{A}, data::AbstractArray, V::TensorMapSpace; tol = sqrt(eps(real(float(eltype(data)))))) where {T, A <: DenseVector{T}} + t = TensorMapWithStorage{T, A}(undef, V) + t = project_symmetric!(t, data) + # verify result + isapprox(reshape(data, dims(t)), convert(Array, t); atol = tol) || + throw(ArgumentError("Data has non-zero elements at incompatible positions")) + return t +end + function TensorMapWithStorage{T, A}( data::AbstractArray, V::TensorMapSpace; tol = sqrt(eps(real(float(eltype(data))))) ) where {T, A} @@ -209,15 +218,7 @@ function TensorMapWithStorage{T, A}( sectortype(V) === Trivial && return tensormaptype(spacetype(V), numout(V), numin(V), A)(reshape(data, length(data)), V) - # do projection - t = TensorMapWithStorage{T, A}(undef, V) - t = project_symmetric!(t, data) - - # verify result - isapprox(reshape(data, dims(t)), convert(Array, t); atol = tol) || - throw(ArgumentError("Data has non-zero elements at incompatible positions")) - - return t + return project_symmetric_and_check(T, A, data, V; tol) end TensorMapWithStorage{T, A}(data::AbstractArray, codom::TensorSpace, dom::TensorSpace; kwargs...) where {T, A} = TensorMapWithStorage{T, A}(data, codom ← dom; kwargs...) @@ -315,11 +316,14 @@ end for randf in (:rand, :randn, :randexp, :randisometry) _docstr = """ - $randf([rng=default_rng()], [T=Float64], codomain::ProductSpace{S,N₁}, + $randf([rng=default_rng()], [TorA=Float64], codomain::ProductSpace{S,N₁}, domain::ProductSpace{S,N₂}) where {S,N₁,N₂,T} -> t - $randf([rng=default_rng()], [T=Float64], codomain ← domain) -> t + $randf([rng=default_rng()], [TorA=Float64], codomain ← domain) -> t Generate a tensor `t` with entries generated by `$randf`. + The type `TorA` can be used to control the element type and + data type generated. For example, if `TorA` is a `SparseVector{ComplexF32}`, + then the final output `TensorMap` will have that as its storage type. See also [`Random.$(randf)!`](@ref). 
""" @@ -348,25 +352,25 @@ for randf in (:rand, :randn, :randexp, :randisometry) return $randfun(codomain ← domain) end function $randfun( - ::Type{T}, codomain::TensorSpace{S}, domain::TensorSpace{S} - ) where {T, S <: IndexSpace} - return $randfun(T, codomain ← domain) + ::Type{TorA}, codomain::TensorSpace{S}, domain::TensorSpace{S} + ) where {TorA, S <: IndexSpace} + return $randfun(TorA, codomain ← domain) end function $randfun( - rng::Random.AbstractRNG, ::Type{T}, codomain::TensorSpace{S}, domain::TensorSpace{S} - ) where {T, S <: IndexSpace} - return $randfun(rng, T, codomain ← domain) + rng::Random.AbstractRNG, ::Type{TorA}, codomain::TensorSpace{S}, domain::TensorSpace{S} + ) where {TorA, S <: IndexSpace} + return $randfun(rng, TorA, codomain ← domain) end # accepting single `TensorSpace` $randfun(codomain::TensorSpace) = $randfun(codomain ← one(codomain)) - function $randfun(::Type{T}, codomain::TensorSpace) where {T} - return $randfun(T, codomain ← one(codomain)) + function $randfun(::Type{TorA}, codomain::TensorSpace) where {TorA} + return $randfun(TorA, codomain ← one(codomain)) end function $randfun( - rng::Random.AbstractRNG, ::Type{T}, codomain::TensorSpace - ) where {T} - return $randfun(rng, T, codomain ← one(domain)) + rng::Random.AbstractRNG, ::Type{TorA}, codomain::TensorSpace + ) where {TorA} + return $randfun(rng, TorA, codomain ← one(domain)) end # filling in default eltype @@ -376,16 +380,16 @@ for randf in (:rand, :randn, :randexp, :randisometry) end # filling in default rng - function $randfun(::Type{T}, V::TensorMapSpace) where {T} - return $randfun(Random.default_rng(), T, V) + function $randfun(::Type{TorA}, V::TensorMapSpace) where {TorA} + return $randfun(Random.default_rng(), TorA, V) end $randfun!(t::AbstractTensorMap) = $randfun!(Random.default_rng(), t) # implementation function $randfun( - rng::Random.AbstractRNG, ::Type{T}, V::TensorMapSpace - ) where {T} - t = TensorMap{T}(undef, V) + rng::Random.AbstractRNG, ::Type{TorA}, V::TensorMapSpace + ) where {TorA} + t = tensormaptype(spacetype(V), numout(V), numin(V), TorA)(undef, V) $randfun!(rng, t) return t end @@ -405,6 +409,10 @@ Base.copy(t::TensorMap) = typeof(t)(copy(t.data), t.space) # Conversion between TensorMap and Dict, for read and write purpose #------------------------------------------------------------------ +# We want to store the block data using simple data types, +# rather tha reshaped views or some other wrapped array type. 
+# Since this method is meant for storing data on disk, we can +# freely collect data to the CPU function Base.convert(::Type{Dict}, t::AbstractTensorMap) d = Dict{Symbol, Any}() d[:codomain] = repr(codomain(t)) @@ -521,6 +529,11 @@ function Base.convert(::Type{TensorMap}, t::AbstractTensorMap) return copy!(TensorMap{scalartype(t)}(undef, space(t)), t) end +function Base.convert(::Type{TensorMapWithStorage{T, A}}, t::TensorMap) where {T, A} + d_data = convert(A, t.data) + return TensorMapWithStorage{T, A}(d_data, space(t)) +end + function Base.convert( TT::Type{TensorMap{T, S, N₁, N₂, A}}, t::AbstractTensorMap{<:Any, S, N₁, N₂} ) where {T, S, N₁, N₂, A} @@ -535,7 +548,7 @@ end function Base.promote_rule( ::Type{<:TT₁}, ::Type{<:TT₂} ) where {S, N₁, N₂, TT₁ <: TensorMap{<:Any, S, N₁, N₂}, TT₂ <: TensorMap{<:Any, S, N₁, N₂}} - A = VectorInterface.promote_add(storagetype(TT₁), storagetype(TT₂)) - T = scalartype(A) - return TensorMap{T, S, N₁, N₂, A} + T = VectorInterface.promote_add(scalartype(TT₁), scalartype(TT₂)) + A = promote_storagetype(similarstoragetype(TT₁, T), similarstoragetype(TT₂, T)) + return tensormaptype(S, N₁, N₂, A) end diff --git a/test/cuda/tensors.jl b/test/cuda/tensors.jl new file mode 100644 index 000000000..678d7fc2a --- /dev/null +++ b/test/cuda/tensors.jl @@ -0,0 +1,604 @@ +using Adapt, CUDA, cuTENSOR +using Test, TestExtras +using TensorKit, Combinatorics +ad = adapt(Array) +const CUDAExt = Base.get_extension(TensorKit, :TensorKitCUDAExt) +@assert !isnothing(CUDAExt) +const CuTensorMap = getglobal(CUDAExt, :CuTensorMap) +const curand = getglobal(CUDAExt, :curand) +const curandn = getglobal(CUDAExt, :curandn) + +@isdefined(TestSetup) || include("../setup.jl") +using .TestSetup + +for V in (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VfSU₂) #, VSU₃) + V1, V2, V3, V4, V5 = V + @assert V3 * V4 * V2 ≿ V1' * V5' # necessary for leftorth tests + @assert V3 * V4 ≾ V1' * V2' * V5' # necessary for rightorth tests +end + +spacelist = try + if ENV["CI"] == "true" + println("Detected running on CI") + if Sys.iswindows() + (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂) + elseif Sys.isapple() + (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VfU₁, VfSU₂) #, VSU₃) + else + (Vtr, Vℤ₂, Vfℤ₂, VU₁, VCU₁, VSU₂, VfSU₂) #, VSU₃) + end + else + (Vtr, VU₁, VSU₂, Vfℤ₂) + end +catch + (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VfSU₂) #, VSU₃) +end + +for V in spacelist + I = sectortype(first(V)) + Istr = TensorKit.type_repr(I) + println("---------------------------------------") + println("CUDA Tensors with symmetry: $Istr") + println("---------------------------------------") + @timedtestset "Tensors with symmetry: $Istr" verbose = true begin + V1, V2, V3, V4, V5 = V + @timedtestset "Basic tensor properties" begin + W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5 + # test default pass-throughs + for f in (CUDA.zeros, CUDA.ones, curand, curandn) + t = @constinferred f(W) + @test scalartype(t) == Float64 + @test codomain(t) == W + @test space(t) == (W ← one(W)) + @test domain(t) == one(W) + @test typeof(t) == TensorMap{Float64, spacetype(t), 5, 0, CuVector{Float64, CUDA.DeviceMemory}} + end + for f in (rand, randn) + t = @constinferred f(CuVector{Float64, CUDA.DeviceMemory}, W) + @test scalartype(t) == Float64 + @test codomain(t) == W + @test space(t) == (W ← one(W)) + @test domain(t) == one(W) + @test typeof(t) == TensorMap{Float64, spacetype(t), 5, 0, CuVector{Float64, CUDA.DeviceMemory}} + end + for T in (Int, Float32, Float64, ComplexF32, ComplexF64) + t = @constinferred CUDA.zeros(T, W) + CUDA.@allowscalar begin + @test 
@constinferred(hash(t)) == hash(deepcopy(t)) + end + @test scalartype(t) == T + @test norm(t) == 0 + @test codomain(t) == W + @test space(t) == (W ← one(W)) + @test domain(t) == one(W) + @test typeof(t) == TensorMap{T, spacetype(t), 5, 0, CuVector{T, CUDA.DeviceMemory}} + # blocks + bs = @constinferred blocks(t) + (c, b1), state = @constinferred Nothing iterate(bs) + @test c == first(blocksectors(W)) + next = @constinferred Nothing iterate(bs, state) + b2 = @constinferred block(t, first(blocksectors(t))) + @test b1 == b2 + @test_broken eltype(bs) === Pair{typeof(c), typeof(b1)} + @test_broken typeof(b1) === TensorKit.blocktype(t) + @test typeof(c) === sectortype(t) + end + end + @timedtestset "Conversion to/from host" begin + W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5 + for T in (Int, Float32, ComplexF64) + h_t = @constinferred rand(T, W) + t1 = convert(CuTensorMap{T}, h_t) + @test collect(t1.data) == h_t.data + @test space(t1) == space(h_t) + @test scalartype(t1) == T + @test codomain(t1) == W + @test space(t1) == (W ← one(W)) + @test domain(t1) == one(W) + t2 = CuTensorMap(h_t) + @test collect(t2.data) == h_t.data + @test space(t2) == space(h_t) + @test scalartype(t2) == T + @test codomain(t2) == W + @test space(t2) == (W ← one(W)) + @test domain(t2) == one(W) + end + end + @timedtestset "Tensor Dict conversion" begin + W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5 + for T in (Int, Float32, ComplexF64) + t = @constinferred CUDA.rand(T, W) + d = convert(Dict, t) + @test collect(t) == convert(TensorMap, d) + end + end + @timedtestset "Basic linear algebra" begin + W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5 + for T in (Float32, ComplexF64) + t = @constinferred CUDA.rand(T, W) + @test scalartype(t) == T + @test space(t) == W + @test space(t') == W' + @test dim(t) == dim(space(t)) + @test codomain(t) == codomain(W) + @test domain(t) == domain(W) + # blocks for adjoint + bs = @constinferred blocks(t') + (c, b1), state = @constinferred Nothing iterate(bs) + @test c == first(blocksectors(W')) + next = @constinferred Nothing iterate(bs, state) + b2 = @constinferred block(t', first(blocksectors(t'))) + @test b1 == b2 + @test_broken eltype(bs) === Pair{typeof(c), typeof(b1)} + @test_broken typeof(b1) === TensorKit.blocktype(t') + @test typeof(c) === sectortype(t) + # linear algebra + @test isa(@constinferred(norm(t)), real(T)) + @test norm(t)^2 ≈ dot(t, t) + α = rand(T) + @test norm(α * t) ≈ abs(α) * norm(t) + @test norm(t + t, 2) ≈ 2 * norm(t, 2) + @test norm(t + t, 1) ≈ 2 * norm(t, 1) + @test norm(t + t, Inf) ≈ 2 * norm(t, Inf) + p = 3 * rand(Float64) + @test norm(t + t, p) ≈ 2 * norm(t, p) + @test norm(t) ≈ norm(t') + + t2 = @constinferred rand!(similar(t)) + β = rand(T) + #@test @constinferred(dot(β * t2, α * t)) ≈ conj(β) * α * conj(dot(t, t2)) # broken for Irrep[CU₁] + @test dot(β * t2, α * t) ≈ conj(β) * α * conj(dot(t, t2)) + @test dot(t2, t) ≈ conj(dot(t, t2)) + @test dot(t2, t) ≈ conj(dot(t2', t')) + @test dot(t2, t) ≈ dot(t', t2') + + i1 = @constinferred(isomorphism(T, V1 ⊗ V2, V2 ⊗ V1)) + i2 = @constinferred(isomorphism(CuVector{T}, V2 ⊗ V1, V1 ⊗ V2)) + CUDA.@allowscalar begin + @test i1 * i2 == @constinferred(id(T, V1 ⊗ V2)) + @test i2 * i1 == @constinferred(id(CuVector{T, CUDA.DeviceMemory}, V2 ⊗ V1)) + w = @constinferred(isometry(T, V1 ⊗ (oneunit(V1) ⊕ oneunit(V1)), V1)) + @test dim(w) == 2 * dim(V1 ← V1) + @test w' * w == id(CuVector{T, CUDA.DeviceMemory}, V1) + @test w * w' == (w * w')^2 + end + end + end + @timedtestset "Trivial space insertion and removal" begin + W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5 + for T in (Float32, ComplexF64) + t = 
@constinferred CUDA.rand(T, W) + t2 = @constinferred insertleftunit(t) + @test t2 == @constinferred insertrightunit(t) + @test numind(t2) == numind(t) + 1 + @test space(t2) == insertleftunit(space(t)) + @test scalartype(t2) === T + @test t.data === t2.data + @test @constinferred(removeunit(t2, $(numind(t2)))) == t + t3 = @constinferred insertleftunit(t; copy = true) + @test t3 == @constinferred insertrightunit(t; copy = true) + @test t.data !== t3.data + for (c, b) in blocks(t) + @test b == block(t3, c) + end + @test @constinferred(removeunit(t3, $(numind(t3)))) == t + t4 = @constinferred insertrightunit(t, 3; dual = true) + @test numin(t4) == numin(t) && numout(t4) == numout(t) + 1 + for (c, b) in blocks(t) + @test b == block(t4, c) + end + @test @constinferred(removeunit(t4, 4)) == t + t5 = @constinferred insertleftunit(t, 4; dual = true) + @test numin(t5) == numin(t) + 1 && numout(t5) == numout(t) + for (c, b) in blocks(t) + @test b == block(t5, c) + end + @test @constinferred(removeunit(t5, 4)) == t + end + end + if hasfusiontensor(I) + @timedtestset "Basic linear algebra: test via CPU" begin + W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5 + for T in (Float32, ComplexF64) + t = CUDA.rand(T, W) + t2 = @constinferred CUDA.rand!(similar(t)) + α = rand(T) + @test norm(t, 2) ≈ norm(collect(t), 2) + @test dot(t2, t) ≈ dot(collect(t2), collect(t)) + @test collect(α * t) ≈ α * collect(t) + @test collect(t + t) ≈ 2 * collect(t) + end + end + @timedtestset "Real and imaginary parts" begin + W = V1 ⊗ V2 + for T in (Float64, ComplexF64, ComplexF32) + t = @constinferred CUDA.randn(T, W, W) + + tr = @constinferred real(t) + @test scalartype(tr) <: Real + @test real(collect(t)) == collect(tr) + @test storagetype(tr) == CuVector{real(T), CUDA.DeviceMemory} + + ti = @constinferred imag(t) + @test scalartype(ti) <: Real + @test imag(collect(t)) == collect(ti) + @test storagetype(ti) == CuVector{real(T), CUDA.DeviceMemory} + + tc = @inferred complex(t) + @test scalartype(tc) <: Complex + @test complex(collect(t)) == collect(tc) + @test storagetype(tc) == CuVector{complex(T), CUDA.DeviceMemory} + + tc2 = @inferred complex(tr, ti) + @test tc2 ≈ tc + @test storagetype(tc2) == CuVector{complex(T), CUDA.DeviceMemory} + end + end + end + @timedtestset "Tensor conversion" begin # TODO adjoint conversion methods don't work yet + W = V1 ⊗ V2 + t = @constinferred CUDA.randn(W ← W) + @test typeof(convert(TensorMap, t')) == typeof(t) + tc = complex(t) + @test convert(typeof(tc), t) == tc + @test typeof(convert(typeof(tc), t)) == typeof(tc) + @test typeof(convert(typeof(tc), t')) == typeof(tc) + @test Base.promote_typeof(t, tc) == typeof(tc) + @test Base.promote_typeof(tc, t) == typeof(tc + t) + end + #=@timedtestset "diag/diagm" begin + W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5 + t = CUDA.randn(ComplexF64, W) + d = LinearAlgebra.diag(t) + # TODO find a way to use CUDA here + D = LinearAlgebra.diagm(codomain(t), domain(t), d) + @test LinearAlgebra.isdiag(D) + @test LinearAlgebra.diag(D) == d + end=# + @timedtestset "Permutations: test via inner product invariance" begin + W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5 + t = CUDA.rand(ComplexF64, W) + t′ = CUDA.randn!(similar(t)) + for k in 0:5 + for p in permutations(1:5) + p1 = ntuple(n -> p[n], k) + p2 = ntuple(n -> p[k + n], 5 - k) + CUDA.@allowscalar begin + t2 = @constinferred permute(t, (p1, p2)) + t2 = permute(t, (p1, p2)) + @test norm(t2) ≈ norm(t) + t2′ = permute(t′, (p1, p2)) + @test dot(t2′, t2) ≈ dot(t′, t) ≈ dot(transpose(t2′), transpose(t2)) + end + end + + CUDA.@allowscalar begin + t3 = @constinferred 
repartition(t, $k) + t3 = repartition(t, k) + @test norm(t3) ≈ norm(t) + t3′ = @constinferred repartition!(similar(t3), t′) + @test norm(t3′) ≈ norm(t′) + @test dot(t′, t) ≈ dot(t3′, t3) + end + end + end + if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) + @timedtestset "Permutations: test via CPU" begin + W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5 + t = CUDA.rand(ComplexF64, W) + a = convert(Array, collect(t)) + for k in 0:5 + for p in permutations(1:5) + p1 = ntuple(n -> p[n], k) + p2 = ntuple(n -> p[k + n], 5 - k) + t2 = CUDA.@allowscalar permute(t, (p1, p2)) + a2 = convert(Array, collect(t2)) + @test a2 ≈ permutedims(a, (p1..., p2...)) + @test convert(Array, collect(transpose(t2))) ≈ + permutedims(a2, (5, 4, 3, 2, 1)) + end + + t3 = CUDA.@allowscalar repartition(t, k) + a3 = convert(Array, collect(t3)) + @test a3 ≈ permutedims( + a, (ntuple(identity, k)..., reverse(ntuple(i -> i + k, 5 - k))...) + ) + end + end + end + @timedtestset "Full trace: test self-consistency" begin + t = CUDA.rand(ComplexF64, V1 ⊗ V2' ⊗ V2 ⊗ V1') + CUDA.@allowscalar begin + t2 = permute(t, ((1, 2), (4, 3))) + s = @constinferred tr(t2) + @test conj(s) ≈ tr(t2') + if !isdual(V1) + t2 = twist!(t2, 1) + end + if isdual(V2) + t2 = twist!(t2, 2) + end + ss = tr(t2) + @tensor s2 = t[a, b, b, a] + @tensor t3[a, b] := t[a, c, c, b] + @tensor s3 = t3[a, a] + end + @test ss ≈ s2 + @test ss ≈ s3 + end + @timedtestset "Partial trace: test self-consistency" begin + t = CUDA.rand(ComplexF64, V1 ⊗ V2' ⊗ V3 ⊗ V2 ⊗ V1' ⊗ V3') + @tensor t2[a, b] := t[c, d, b, d, c, a] + @tensor t4[a, b, c, d] := t[d, e, b, e, c, a] + @tensor t5[a, b] := t4[a, b, c, c] + @test t2 ≈ t5 + end + if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) + @timedtestset "Trace: test via conversion" begin + t = CUDA.rand(ComplexF64, V1 ⊗ V2' ⊗ V3 ⊗ V2 ⊗ V1' ⊗ V3') + CUDA.@allowscalar begin + @tensor t2[a, b] := t[c, d, b, d, c, a] + @tensor t3[a, b] := ad(t)[c, d, b, d, c, a] + end + @test t3 ≈ ad(t2) + end + end + @timedtestset "Trace and contraction" begin + t1 = CUDA.rand(ComplexF64, V1 ⊗ V2 ⊗ V3) + t2 = CUDA.rand(ComplexF64, V2' ⊗ V4 ⊗ V1') + CUDA.@allowscalar begin + t3 = t1 ⊗ t2 + @tensor ta[a, b] := t1[x, y, a] * t2[y, b, x] + @tensor tb[a, b] := t3[x, y, a, y, b, x] + end + @test ta ≈ tb + end + if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) + @timedtestset "Tensor contraction: test via CPU" begin + dA1 = CUDA.randn(ComplexF64, V1' * V2', V3') + dA2 = CUDA.randn(ComplexF64, V3 * V4, V5) + drhoL = CUDA.randn(ComplexF64, V1, V1) + drhoR = CUDA.randn(ComplexF64, V5, V5)' # test adjoint tensor + dH = CUDA.randn(ComplexF64, V2 * V4, V2 * V4) + @tensor dHrA12[a, s1, s2, c] := drhoL[a, a'] * conj(dA1[a', t1, b]) * + dA2[b, t2, c'] * drhoR[c', c] * + dH[s1, s2, t1, t2] + @tensor hHrA12[a, s1, s2, c] := collect(drhoL)[a, a'] * conj(collect(dA1)[a', t1, b]) * + collect(dA2)[b, t2, c'] * collect(drhoR)[c', c] * + collect(dH)[s1, s2, t1, t2] + @test collect(dHrA12) ≈ hHrA12 + end + end + @timedtestset "Index flipping: test flipping inverse" begin + t = CUDA.rand(ComplexF64, V1 ⊗ V1' ← V1' ⊗ V1) + for i in 1:4 + CUDA.@allowscalar begin + @test t ≈ flip(flip(t, i), i; inv = true) + @test t ≈ flip(flip(t, i; inv = true), i) + end + end + end + #=@timedtestset "Index flipping: test via explicit flip" begin + t = CUDA.rand(ComplexF64, V1 ⊗ V1' ← V1' ⊗ V1) + F1 = unitary(flip(V1), V1) + + CUDA.@allowscalar begin + @tensor tf[a, b; c, d] := F1[a, a'] * t[a', b; c, d] + @test flip(t, 1) ≈ tf + @tensor tf[a, b; c, d] := conj(F1[b, b']) * t[a, b'; c, d] + @test 
twist!(flip(t, 2), 2) ≈ tf + @tensor tf[a, b; c, d] := F1[c, c'] * t[a, b; c', d] + @test flip(t, 3) ≈ tf + @tensor tf[a, b; c, d] := conj(F1[d, d']) * t[a, b; c, d'] + @test twist!(flip(t, 4), 4) ≈ tf + end + end + @timedtestset "Index flipping: test via contraction" begin + t1 = CUDA.rand(ComplexF64, V1 ⊗ V2 ⊗ V3 ← V4) + t2 = CUDA.rand(ComplexF64, V2' ⊗ V5 ← V4' ⊗ V1) + CUDA.@allowscalar begin + @tensor ta[a, b] := t1[x, y, a, z] * t2[y, b, z, x] + @tensor tb[a, b] := flip(t1, 1)[x, y, a, z] * flip(t2, 4)[y, b, z, x] + @test ta ≈ tb + @tensor tb[a, b] := flip(t1, (2, 4))[x, y, a, z] * flip(t2, (1, 3))[y, b, z, x] + @test ta ≈ tb + @tensor tb[a, b] := flip(t1, (1, 2, 4))[x, y, a, z] * flip(t2, (1, 3, 4))[y, b, z, x] + @tensor tb[a, b] := flip(t1, (1, 3))[x, y, a, z] * flip(t2, (2, 4))[y, b, z, x] + @test flip(ta, (1, 2)) ≈ tb + end + end=# # TODO + @timedtestset "Multiplication of isometries: test properties" begin + W2 = V4 ⊗ V5 + W1 = W2 ⊗ (oneunit(V1) ⊕ oneunit(V1)) + for T in (Float64, ComplexF64) + t1 = randisometry(CuMatrix{T}, W1, W2) + t2 = randisometry(CuMatrix{T}, W2 ← W2) + @test isisometric(t1) + @test isunitary(t2) + P = t1 * t1' + @test P * P ≈ P + end + end + @timedtestset "Multiplication and inverse: test compatibility" begin + W1 = V1 ⊗ V2 ⊗ V3 + W2 = V4 ⊗ V5 + for T in (Float64, ComplexF64) + t1 = CUDA.rand(T, W1, W1) + t2 = CUDA.rand(T, W2, W2) + t = CUDA.rand(T, W1, W2) + @test t1 * (t1 \ t) ≈ t + @test (t / t2) * t2 ≈ t + @test t1 \ one(t1) ≈ inv(t1) + @test one(t1) / t1 ≈ pinv(t1) + @test_throws SpaceMismatch inv(t) + @test_throws SpaceMismatch t2 \ t + @test_throws SpaceMismatch t / t1 + tp = pinv(t) * t + @test tp ≈ tp * tp + end + end + if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) + @timedtestset "Multiplication and inverse: test via CPU" begin + W1 = V1 ⊗ V2 ⊗ V3 + W2 = V4 ⊗ V5 + for T in (Float32, Float64, ComplexF32, ComplexF64) + t1 = CUDA.rand(T, W1, W1) + t2 = CUDA.rand(T, W2, W2) + t = CUDA.rand(T, W1, W2) + ht1 = collect(t1) + ht2 = collect(t2) + ht = collect(t) + @test collect(t1 * t) ≈ ht1 * ht + @test collect(t1' * t) ≈ ht1' * ht + @test collect(t2 * t') ≈ ht2 * ht' + @test collect(t2' * t') ≈ ht2' * ht' + + @test collect(inv(t1)) ≈ inv(ht1) + @test collect(pinv(t)) ≈ pinv(ht) + + if T == Float32 || T == ComplexF32 + continue + end + + @test collect(t1 \ t) ≈ ht1 \ ht + @test collect(t1' \ t) ≈ ht1' \ ht + @test collect(t2 \ t') ≈ ht2 \ ht' + @test collect(t2' \ t') ≈ ht2' \ ht' + + @test collect(t2 / t) ≈ ht2 / ht + @test collect(t2' / t) ≈ ht2' / ht + @test collect(t1 / t') ≈ ht1 / ht' + @test collect(t1' / t') ≈ ht1' / ht' + end + end + end + if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) + @timedtestset "Tensor functions" begin + W = V1 ⊗ V2 + for T in (Float64, ComplexF64) + t = CUDA.randn(T, W, W) + s = dim(W) + @test (@constinferred sqrt(t))^2 ≈ t + #@test collect(sqrt(t)) ≈ sqrt(collect(t)) # schur not supported for CuArray + + expt = @constinferred exp(t) + @test collect(expt) ≈ exp(collect(t)) + + # log doesn't work on CUDA yet + @test exp(@constinferred log(expt)) ≈ expt + @test collect(log(expt)) ≈ log(collect(expt)) + + #=@test (@constinferred cos(t))^2 + (@constinferred sin(t))^2 ≈ + id(storagetype(t), W) + @test (@constinferred tan(t)) ≈ sin(t) / cos(t) + @test (@constinferred cot(t)) ≈ cos(t) / sin(t) + @test (@constinferred cosh(t))^2 - (@constinferred sinh(t))^2 ≈ + id(storagetype(t), W) + @test (@constinferred tanh(t)) ≈ sinh(t) / cosh(t) + @test (@constinferred coth(t)) ≈ cosh(t) / sinh(t)=# # TODO in CUDA + + #=t1 = 
sin(t) + @test sin(@constinferred asin(t1)) ≈ t1 + t2 = cos(t) + @test cos(@constinferred acos(t2)) ≈ t2 + t3 = sinh(t) + @test sinh(@constinferred asinh(t3)) ≈ t3 + t4 = cosh(t) + @test cosh(@constinferred acosh(t4)) ≈ t4 + t5 = tan(t) + @test tan(@constinferred atan(t5)) ≈ t5 + t6 = cot(t) + @test cot(@constinferred acot(t6)) ≈ t6 + t7 = tanh(t) + @test tanh(@constinferred atanh(t7)) ≈ t7 + t8 = coth(t) + @test coth(@constinferred acoth(t8)) ≈ t8=# + # TODO in CUDA + end + end + end + # Sylvester not defined for CUDA + # @timedtestset "Sylvester equation" begin + # for T in (Float32, ComplexF64) + # tA = CUDA.rand(T, V1 ⊗ V3, V1 ⊗ V3) + # tB = CUDA.rand(T, V2 ⊗ V4, V2 ⊗ V4) + # tA = 3 // 2 * leftorth(tA; alg=Polar())[1] + # tB = 1 // 5 * leftorth(tB; alg=Polar())[1] + # tC = CUDA.rand(T, V1 ⊗ V3, V2 ⊗ V4) + # t = @constinferred sylvester(tA, tB, tC) + # @test codomain(t) == V1 ⊗ V3 + # @test domain(t) == V2 ⊗ V4 + # @test norm(tA * t + t * tB + tC) < + # (norm(tA) + norm(tB) + norm(tC)) * eps(real(T))^(2 / 3) + # if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) + # matrix(x) = reshape(convert(Array, x), dim(codomain(x)), dim(domain(x))) + # @test matrix(t) ≈ sylvester(matrix(tA), matrix(tB), matrix(tC)) + # end + # end + # end + # + # TODO + @timedtestset "Tensor product: test via norm preservation" begin + for T in (Float32, ComplexF64) + t1 = CUDA.rand(T, V2 ⊗ V3 ⊗ V1, V1 ⊗ V2) + t2 = CUDA.rand(T, V2 ⊗ V1 ⊗ V3, V1 ⊗ V1) + CUDA.@allowscalar begin + t = @constinferred (t1 ⊗ t2) + end + @test norm(t) ≈ norm(t1) * norm(t2) + end + end + if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) + @timedtestset "Tensor product: test via conversion" begin + for T in (Float32, ComplexF64) + t1 = CUDA.rand(T, V2 ⊗ V3 ⊗ V1, V1) + t2 = CUDA.rand(T, V2 ⊗ V1 ⊗ V3, V2) + d1 = dim(codomain(t1)) + d2 = dim(codomain(t2)) + d3 = dim(domain(t1)) + d4 = dim(domain(t2)) + CUDA.@allowscalar begin + t = @constinferred (t1 ⊗ t2) + At = ad(t) + @test ad(t) ≈ ad(t1) ⊗ ad(t2) + end + end + end + end + @timedtestset "Tensor product: test via tensor contraction" begin + for T in (Float32, ComplexF64) + t1 = CUDA.rand(T, V2 ⊗ V3 ⊗ V1) + t2 = CUDA.rand(T, V2 ⊗ V1 ⊗ V3) + CUDA.@allowscalar begin + t = @constinferred (t1 ⊗ t2) + @tensor t′[1, 2, 3, 4, 5, 6] := t1[1, 2, 3] * t2[4, 5, 6] + # @test t ≈ t′ # TODO broken for symmetry: Irrep[ℤ₃] + end + end + end + end + TensorKit.empty_globalcaches!() +end + +@timedtestset "Deligne tensor product: test via conversion" begin + Vlists1 = (Vtr,) # VSU₂) + Vlists2 = (Vtr,) # Vℤ₂) + @testset for Vlist1 in Vlists1, Vlist2 in Vlists2 + V1, V2, V3, V4, V5 = Vlist1 + W1, W2, W3, W4, W5 = Vlist2 + for T in (Float32, ComplexF64) + t1 = CUDA.rand(T, V1 ⊗ V2, V3' ⊗ V4) + t2 = CUDA.rand(T, W2, W1 ⊗ W1') + CUDA.@allowscalar begin + t = @constinferred (t1 ⊠ t2) + end + d1 = dim(codomain(t1)) + d2 = dim(codomain(t2)) + d3 = dim(domain(t1)) + d4 = dim(domain(t2)) + CUDA.@allowscalar begin + @test ad(t1) ⊠ ad(t2) ≈ ad(t1 ⊠ t2) + end + end + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 854991a7f..9513378db 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,8 +2,6 @@ # ------------ using ArgParse: ArgParse using SafeTestsets: @safetestset -# using CUDA: CUDA -# using AMDGPU: AMDGPU function parse_commandline(args = ARGS) s = ArgParse.ArgParseSettings() @@ -49,9 +47,9 @@ istestfile(fn) = endswith(fn, ".jl") && !contains(fn, "setup") # handle GPU cases separately if group == "cuda" - # CUDA.functional() || continue - elseif group == "amd" - # AMDGPU.functional() || 
continue + using CUDA + CUDA.functional() || continue + @time include("cuda/tensors.jl") elseif is_buildkite continue end From b174787b3f2dade736ecddd45bd7c1d86f287a20 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Thu, 18 Dec 2025 06:09:00 -0500 Subject: [PATCH 02/19] More small fixes --- test/cuda/tensors.jl | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/test/cuda/tensors.jl b/test/cuda/tensors.jl index 678d7fc2a..3a133c9cf 100644 --- a/test/cuda/tensors.jl +++ b/test/cuda/tensors.jl @@ -155,9 +155,9 @@ for V in spacelist i1 = @constinferred(isomorphism(T, V1 ⊗ V2, V2 ⊗ V1)) i2 = @constinferred(isomorphism(CuVector{T}, V2 ⊗ V1, V1 ⊗ V2)) CUDA.@allowscalar begin - @test i1 * i2 == @constinferred(id(T, V1 ⊗ V2)) + @test i1 * i2 == @constinferred(id(CuVector{T, CUDA.DeviceMemory}, V1 ⊗ V2)) @test i2 * i1 == @constinferred(id(CuVector{T, CUDA.DeviceMemory}, V2 ⊗ V1)) - w = @constinferred(isometry(T, V1 ⊗ (oneunit(V1) ⊕ oneunit(V1)), V1)) + w = @constinferred(isometry(CuVector{T, CUDA.DeviceMemory}, V1 ⊗ (oneunit(V1) ⊕ oneunit(V1)), V1)) @test dim(w) == 2 * dim(V1 ← V1) @test w' * w == id(CuVector{T, CUDA.DeviceMemory}, V1) @test w * w' == (w * w')^2 @@ -353,7 +353,7 @@ for V in spacelist end @test ta ≈ tb end - if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) + #=if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) @timedtestset "Tensor contraction: test via CPU" begin dA1 = CUDA.randn(ComplexF64, V1' * V2', V3') dA2 = CUDA.randn(ComplexF64, V3 * V4, V5) @@ -368,7 +368,7 @@ for V in spacelist collect(dH)[s1, s2, t1, t2] @test collect(dHrA12) ≈ hHrA12 end - end + end=# # doesn't yet work because of AdjointTensor @timedtestset "Index flipping: test flipping inverse" begin t = CUDA.rand(ComplexF64, V1 ⊗ V1' ← V1' ⊗ V1) for i in 1:4 @@ -478,15 +478,15 @@ for V in spacelist for T in (Float64, ComplexF64) t = CUDA.randn(T, W, W) s = dim(W) - @test (@constinferred sqrt(t))^2 ≈ t - #@test collect(sqrt(t)) ≈ sqrt(collect(t)) # schur not supported for CuArray + @test_broken (@constinferred sqrt(t))^2 ≈ t + @test_broken collect(sqrt(t)) ≈ sqrt(collect(t)) expt = @constinferred exp(t) - @test collect(expt) ≈ exp(collect(t)) + @test_broken collect(expt) ≈ exp(collect(t)) - # log doesn't work on CUDA yet - @test exp(@constinferred log(expt)) ≈ expt - @test collect(log(expt)) ≈ log(collect(expt)) + # log doesn't work on CUDA yet (scalar indexing) + #@test exp(@constinferred log(expt)) ≈ expt + #@test collect(log(expt)) ≈ log(collect(expt)) #=@test (@constinferred cos(t))^2 + (@constinferred sin(t))^2 ≈ id(storagetype(t), W) From 7da48358a184667ecc4e1768f5025b55aba50bee Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Thu, 18 Dec 2025 08:59:01 -0500 Subject: [PATCH 03/19] move collect in --- ext/TensorKitCUDAExt/cutensormap.jl | 4 ---- src/tensors/tensor.jl | 4 ++++ test/cuda/tensors.jl | 28 ++++++++++++++-------------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/ext/TensorKitCUDAExt/cutensormap.jl b/ext/TensorKitCUDAExt/cutensormap.jl index 5ae602cfa..9496a74c1 100644 --- a/ext/TensorKitCUDAExt/cutensormap.jl +++ b/ext/TensorKitCUDAExt/cutensormap.jl @@ -7,10 +7,6 @@ function CuTensorMap(t::TensorMap{T, S, N₁, N₂, A}) where {T, S, N₁, N₂, return CuTensorMap{T, S, N₁, N₂}(CuArray{T}(t.data), space(t)) end -function Base.collect(t::CuTensorMap{T}) where {T} - return convert(TensorKit.TensorMapWithStorage{T, Vector{T}}, t) -end - # project_symmetric! 
doesn't yet work for GPU types, so do this on the host, then copy function TensorKit.project_symmetric_and_check(::Type{T}, ::Type{A}, data::AbstractArray, V::TensorMapSpace; tol = sqrt(eps(real(float(eltype(data)))))) where {T, A <: CuVector{T}} h_t = TensorKit.TensorMapWithStorage{T, Vector{T}}(undef, V) diff --git a/src/tensors/tensor.jl b/src/tensors/tensor.jl index c6a441a12..0f6b2f75d 100644 --- a/src/tensors/tensor.jl +++ b/src/tensors/tensor.jl @@ -403,6 +403,10 @@ for randf in (:rand, :randn, :randexp, :randisometry) end end +# Collecting arbitrary TensorMaps +#----------------------------- +Base.collect(t::TensorMap) = convert(TensorMapWithStorage{scalartype(t), similarstoragetype(scalartype(t))}, t) + # Efficient copy constructors #----------------------------- Base.copy(t::TensorMap) = typeof(t)(copy(t.data), t.space) diff --git a/test/cuda/tensors.jl b/test/cuda/tensors.jl index 3a133c9cf..9ce69fdfe 100644 --- a/test/cuda/tensors.jl +++ b/test/cuda/tensors.jl @@ -152,16 +152,14 @@ for V in spacelist @test dot(t2, t) ≈ conj(dot(t2', t')) @test dot(t2, t) ≈ dot(t', t2') - i1 = @constinferred(isomorphism(T, V1 ⊗ V2, V2 ⊗ V1)) - i2 = @constinferred(isomorphism(CuVector{T}, V2 ⊗ V1, V1 ⊗ V2)) - CUDA.@allowscalar begin - @test i1 * i2 == @constinferred(id(CuVector{T, CUDA.DeviceMemory}, V1 ⊗ V2)) - @test i2 * i1 == @constinferred(id(CuVector{T, CUDA.DeviceMemory}, V2 ⊗ V1)) - w = @constinferred(isometry(CuVector{T, CUDA.DeviceMemory}, V1 ⊗ (oneunit(V1) ⊕ oneunit(V1)), V1)) - @test dim(w) == 2 * dim(V1 ← V1) - @test w' * w == id(CuVector{T, CUDA.DeviceMemory}, V1) - @test w * w' == (w * w')^2 - end + i1 = @constinferred(isomorphism(CuVector{T, CUDA.DeviceMemory}, V1 ⊗ V2, V2 ⊗ V1)) + i2 = @constinferred(isomorphism(CuVector{T, CUDA.DeviceMemory}, V2 ⊗ V1, V1 ⊗ V2)) + @test i1 * i2 == @constinferred(id(CuVector{T, CUDA.DeviceMemory}, V1 ⊗ V2)) + @test i2 * i1 == @constinferred(id(CuVector{T, CUDA.DeviceMemory}, V2 ⊗ V1)) + w = @constinferred(isometry(CuVector{T, CUDA.DeviceMemory}, V1 ⊗ (oneunit(V1) ⊕ oneunit(V1)), V1)) + @test dim(w) == 2 * dim(V1 ← V1) + @test w' * w == id(CuVector{T, CUDA.DeviceMemory}, V1) + @test w * w' == (w * w')^2 end end @timedtestset "Trivial space insertion and removal" begin @@ -238,11 +236,11 @@ for V in spacelist @timedtestset "Tensor conversion" begin # TODO adjoint conversion methods don't work yet W = V1 ⊗ V2 t = @constinferred CUDA.randn(W ← W) - @test typeof(convert(TensorMap, t')) == typeof(t) + #@test typeof(convert(TensorMap, t')) == typeof(t) # TODO Adjoint not supported yet tc = complex(t) @test convert(typeof(tc), t) == tc @test typeof(convert(typeof(tc), t)) == typeof(tc) - @test typeof(convert(typeof(tc), t')) == typeof(tc) + # @test typeof(convert(typeof(tc), t')) == typeof(tc) # TODO Adjoint not supported yet @test Base.promote_typeof(t, tc) == typeof(tc) @test Base.promote_typeof(tc, t) == typeof(tc + t) end @@ -294,8 +292,10 @@ for V in spacelist t2 = CUDA.@allowscalar permute(t, (p1, p2)) a2 = convert(Array, collect(t2)) @test a2 ≈ permutedims(a, (p1..., p2...)) - @test convert(Array, collect(transpose(t2))) ≈ - permutedims(a2, (5, 4, 3, 2, 1)) + CUDA.@allowscalar begin + @test convert(Array, collect(transpose(t2))) ≈ + permutedims(a2, (5, 4, 3, 2, 1)) + end end t3 = CUDA.@allowscalar repartition(t, k) From e4f6d824907147157488ff4f4ee14ec5e50294e7 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Thu, 18 Dec 2025 09:47:33 -0500 Subject: [PATCH 04/19] More test via CPU --- src/tensors/tensor.jl | 2 +- test/cuda/tensors.jl | 18 
++++++------------ 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/src/tensors/tensor.jl b/src/tensors/tensor.jl index 0f6b2f75d..a7364435d 100644 --- a/src/tensors/tensor.jl +++ b/src/tensors/tensor.jl @@ -403,7 +403,7 @@ for randf in (:rand, :randn, :randexp, :randisometry) end end -# Collecting arbitrary TensorMaps +# Collecting arbitrary TensorMaps #----------------------------- Base.collect(t::TensorMap) = convert(TensorMapWithStorage{scalartype(t), similarstoragetype(scalartype(t))}, t) diff --git a/test/cuda/tensors.jl b/test/cuda/tensors.jl index 9ce69fdfe..ac8fe36d1 100644 --- a/test/cuda/tensors.jl +++ b/test/cuda/tensors.jl @@ -289,20 +289,14 @@ for V in spacelist for p in permutations(1:5) p1 = ntuple(n -> p[n], k) p2 = ntuple(n -> p[k + n], 5 - k) - t2 = CUDA.@allowscalar permute(t, (p1, p2)) - a2 = convert(Array, collect(t2)) - @test a2 ≈ permutedims(a, (p1..., p2...)) - CUDA.@allowscalar begin - @test convert(Array, collect(transpose(t2))) ≈ - permutedims(a2, (5, 4, 3, 2, 1)) - end + dt2 = CUDA.@allowscalar permute(t, (p1, p2)) + ht2 = permute(collect(t), (p1, p2)) + @test ht2 == collect(dt2) end - t3 = CUDA.@allowscalar repartition(t, k) - a3 = convert(Array, collect(t3)) - @test a3 ≈ permutedims( - a, (ntuple(identity, k)..., reverse(ntuple(i -> i + k, 5 - k))...) - ) + dt3 = CUDA.@allowscalar repartition(t, k) + ht3 = repartition(collect(t), k) + @test ht3 == collect(dt3) end end end From a384cdc03582e922bc41bd1cef412678ebbd3594 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Fri, 19 Dec 2025 12:06:51 +0100 Subject: [PATCH 05/19] Update example --- src/tensors/tensor.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/tensors/tensor.jl b/src/tensors/tensor.jl index a7364435d..8bd110cc4 100644 --- a/src/tensors/tensor.jl +++ b/src/tensors/tensor.jl @@ -322,8 +322,9 @@ for randf in (:rand, :randn, :randexp, :randisometry) Generate a tensor `t` with entries generated by `$randf`. The type `TorA` can be used to control the element type and - data type generated. For example, if `TorA` is a `SparseVector{ComplexF32}`, - then the final output `TensorMap` will have that as its storage type. + data type generated. For example, if `TorA` is a `CuVector{ComplexF32}` + or `ROCVector{Float64}`, then the final output `TensorMap` will have + that as its storage type. See also [`Random.$(randf)!`](@ref). 
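+
+    For instance, `$randf(CuVector{ComplexF32}, ℂ^2 ⊗ ℂ^3)` would then return a
+    `TensorMap` with GPU storage, provided the corresponding package extension
+    (here TensorKitCUDAExt) is loaded.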
""" From 084f2ee04588a85f104297261c274400cf93b1a5 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Fri, 19 Dec 2025 06:57:14 -0500 Subject: [PATCH 06/19] Comments --- ext/TensorKitCUDAExt/TensorKitCUDAExt.jl | 2 +- ext/TensorKitCUDAExt/cutensormap.jl | 13 +++++++++++-- src/tensors/linalg.jl | 4 ++-- test/cuda/tensors.jl | 11 +++++++++++ 4 files changed, 25 insertions(+), 5 deletions(-) diff --git a/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl index 0562acf61..417970a02 100644 --- a/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl +++ b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl @@ -12,7 +12,7 @@ using TensorKit.Factorizations: AbstractAlgorithm using TensorKit: SectorDict, tensormaptype, scalar, similarstoragetype, AdjointTensorMap, scalartype, project_symmetric_and_check import TensorKit: randisometry, rand, randn -using TensorKit.MatrixAlgebraKit +using TensorKit: MatrixAlgebraKit using Random diff --git a/ext/TensorKitCUDAExt/cutensormap.jl b/ext/TensorKitCUDAExt/cutensormap.jl index 9496a74c1..e17547789 100644 --- a/ext/TensorKitCUDAExt/cutensormap.jl +++ b/ext/TensorKitCUDAExt/cutensormap.jl @@ -84,6 +84,13 @@ for randfun in (:curand, :curandn) $randfun!(rng, t) return t end + + function $randfun!(rng::Random.AbstractRNG, t::CuTensorMap) + for (_, b) in blocks(t) + $randfun!(rng, b) + end + return t + end end end @@ -112,7 +119,7 @@ function LinearAlgebra.isposdef(t::CuTensorMap) InnerProductStyle(spacetype(t)) === EuclideanInnerProduct() || return false for (c, b) in blocks(t) # do our own hermitian check - isherm = TensorKit.MatrixAlgebraKit.ishermitian(b; atol = eps(real(eltype(b))), rtol = eps(real(eltype(b)))) + isherm = MatrixAlgebraKit.ishermitian(b; atol = eps(real(eltype(b))), rtol = eps(real(eltype(b)))) isherm || return false isposdef(Hermitian(b)) || return false end @@ -135,6 +142,7 @@ end function TensorKit.exp!(t::CuTensorMap) domain(t) == codomain(t) || error("Exponential of a tensor only exist when domain == codomain.") + !MatrixAlgebraKit.ishermitian(t) && throw(ArgumentError("`exp!` is only supported on hermitian CUDA tensors")) for (c, b) in blocks(t) copy!(b, parent(Base.exp(Hermitian(b)))) end @@ -146,7 +154,8 @@ for f in (:sqrt, :log, :asin, :acos, :acosh, :atanh, :acoth) sf = string(f) @eval function Base.$f(t::CuTensorMap) domain(t) == codomain(t) || - throw(SpaceMismatch("`$($sf)` of a tensor only exist when domain == codomain")) + throw(SpaceMismatch("`$($sf)` of a tensor only exists when domain == codomain")) + !MatrixAlgebraKit.ishermitian(t) && throw(ArgumentError("`$($sf)` is only supported on hermitian CUDA tensors")) T = complex(float(scalartype(t))) tf = similar(t, T) for (c, b) in blocks(t) diff --git a/src/tensors/linalg.jl b/src/tensors/linalg.jl index caae1d488..cc19ceaef 100644 --- a/src/tensors/linalg.jl +++ b/src/tensors/linalg.jl @@ -289,7 +289,7 @@ function LinearAlgebra.rank( ) r = 0 * dim(first(allunits(sectortype(t)))) dim(t) == 0 && return r - S = LinearAlgebra.svdvals(t) + S = svd_vals(t) tol = max(atol, rtol * maximum(parent(S))) for (c, b) in pairs(S) if !isempty(b) @@ -307,7 +307,7 @@ function LinearAlgebra.cond(t::AbstractTensorMap, p::Real = 2) throw(SpaceMismatch("`cond` requires domain and codomain to be the same")) return zero(real(float(scalartype(t)))) end - S = LinearAlgebra.svdvals(t) + S = svd_vals(t) maxS = maximum(parent(S)) minS = minimum(parent(S)) return iszero(maxS) ? 
oftype(maxS, Inf) : (maxS / minS) diff --git a/test/cuda/tensors.jl b/test/cuda/tensors.jl index ac8fe36d1..67981b5ae 100644 --- a/test/cuda/tensors.jl +++ b/test/cuda/tensors.jl @@ -7,6 +7,8 @@ const CUDAExt = Base.get_extension(TensorKit, :TensorKitCUDAExt) const CuTensorMap = getglobal(CUDAExt, :CuTensorMap) const curand = getglobal(CUDAExt, :curand) const curandn = getglobal(CUDAExt, :curandn) +const curand! = getglobal(CUDAExt, :curand!) +const curandn! = getglobal(CUDAExt, :curandn!) @isdefined(TestSetup) || include("../setup.jl") using .TestSetup @@ -61,6 +63,15 @@ for V in spacelist @test domain(t) == one(W) @test typeof(t) == TensorMap{Float64, spacetype(t), 5, 0, CuVector{Float64, CUDA.DeviceMemory}} end + for f! in (curand!, curandn!) + t = @constinferred CUDA.zeros(W) + f!(t) + @test scalartype(t) == Float64 + @test codomain(t) == W + @test space(t) == (W ← one(W)) + @test domain(t) == one(W) + @test typeof(t) == TensorMap{Float64, spacetype(t), 5, 0, CuVector{Float64, CUDA.DeviceMemory}} + end for T in (Int, Float32, Float64, ComplexF32, ComplexF64) t = @constinferred CUDA.zeros(T, W) CUDA.@allowscalar begin From bd1966c0ff93b3dd318887e6eff0e84457bc84fd Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Fri, 19 Dec 2025 07:02:31 -0500 Subject: [PATCH 07/19] Fixup diagonal --- src/tensors/diagonal.jl | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/tensors/diagonal.jl b/src/tensors/diagonal.jl index 356f5dbf4..e35c2ffdd 100644 --- a/src/tensors/diagonal.jl +++ b/src/tensors/diagonal.jl @@ -78,10 +78,12 @@ function DiagonalTensorMap(t::AbstractTensorMap{T, S, 1, 1}) where {T, S} return d end -Base.similar(d::DiagonalTensorMap) = DiagonalTensorMap(similar(d.data), d.domain) -function Base.similar(d::DiagonalTensorMap, ::Type{T}) where {T <: Number} - return DiagonalTensorMap(similar(d.data, T), d.domain) -end +Base.similar(d::DiagonalTensorMap) = similar_diagonal(d) +Base.similar(d::DiagonalTensorMap, ::Type{T}) where {T} = similar_diagonal(d, T) + +similar_diagonal(d::DiagonalTensorMap) = DiagonalTensorMap(similar(d.data), d.domain) +similar_diagonal(d::DiagonalTensorMap, ::Type{T}) where {T <: Number} = + DiagonalTensorMap(similar(d.data, T), d.domain) # TODO: more constructors needed? 
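
[Editorial usage sketch, not part of the patch series: the in-place fill methods
from PATCH 06 as exercised in test/cuda/tensors.jl. Assumes a functional CUDA
device; `W` is a placeholder space.]

    using TensorKit, CUDA, cuTENSOR
    using CUDA: randn! as curandn!

    W = ℂ^2 ⊗ ℂ^3
    t = CUDA.zeros(Float64, W)   # CuTensorMap with all blocks zeroed (PATCH 01)
    curandn!(t)                  # refill every block in place on the device (PATCH 06)
    @assert norm(t) > 0
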
From 832ed15681fdba4c384c2fe22cda253182ec323f Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Fri, 19 Dec 2025 10:08:59 -0500 Subject: [PATCH 08/19] More comments --- ext/TensorKitCUDAExt/cutensormap.jl | 4 +- src/tensors/linalg.jl | 4 +- src/tensors/tensor.jl | 5 +- test/cuda/tensors.jl | 78 ++++++++++++++--------------- 4 files changed, 46 insertions(+), 45 deletions(-) diff --git a/ext/TensorKitCUDAExt/cutensormap.jl b/ext/TensorKitCUDAExt/cutensormap.jl index e17547789..b4aa7f7a2 100644 --- a/ext/TensorKitCUDAExt/cutensormap.jl +++ b/ext/TensorKitCUDAExt/cutensormap.jl @@ -142,7 +142,7 @@ end function TensorKit.exp!(t::CuTensorMap) domain(t) == codomain(t) || error("Exponential of a tensor only exist when domain == codomain.") - !MatrixAlgebraKit.ishermitian(t) && throw(ArgumentError("`exp!` is only supported on hermitian CUDA tensors")) + !MatrixAlgebraKit.ishermitian(t) && throw(ArgumentError("`exp!` is currently only supported on hermitian CUDA tensors")) for (c, b) in blocks(t) copy!(b, parent(Base.exp(Hermitian(b)))) end @@ -155,7 +155,7 @@ for f in (:sqrt, :log, :asin, :acos, :acosh, :atanh, :acoth) @eval function Base.$f(t::CuTensorMap) domain(t) == codomain(t) || throw(SpaceMismatch("`$($sf)` of a tensor only exists when domain == codomain")) - !MatrixAlgebraKit.ishermitian(t) && throw(ArgumentError("`$($sf)` is only supported on hermitian CUDA tensors")) + !MatrixAlgebraKit.ishermitian(t) && throw(ArgumentError("`$($sf)` is currently only supported on hermitian CUDA tensors")) T = complex(float(scalartype(t))) tf = similar(t, T) for (c, b) in blocks(t) diff --git a/src/tensors/linalg.jl b/src/tensors/linalg.jl index cc19ceaef..4b2c7d4ee 100644 --- a/src/tensors/linalg.jl +++ b/src/tensors/linalg.jl @@ -289,7 +289,7 @@ function LinearAlgebra.rank( ) r = 0 * dim(first(allunits(sectortype(t)))) dim(t) == 0 && return r - S = svd_vals(t) + S = MatrixAlgebraKit.svd_vals(t) tol = max(atol, rtol * maximum(parent(S))) for (c, b) in pairs(S) if !isempty(b) @@ -307,7 +307,7 @@ function LinearAlgebra.cond(t::AbstractTensorMap, p::Real = 2) throw(SpaceMismatch("`cond` requires domain and codomain to be the same")) return zero(real(float(scalartype(t)))) end - S = svd_vals(t) + S = MatrixAlgebraKit.svd_vals(t) maxS = maximum(parent(S)) minS = minimum(parent(S)) return iszero(maxS) ? 
oftype(maxS, Inf) : (maxS / minS) diff --git a/src/tensors/tensor.jl b/src/tensors/tensor.jl index 8bd110cc4..d8118bcd7 100644 --- a/src/tensors/tensor.jl +++ b/src/tensors/tensor.jl @@ -404,9 +404,10 @@ for randf in (:rand, :randn, :randexp, :randisometry) end end -# Collecting arbitrary TensorMaps +# Moving arbitrary TensorMaps to CPU #----------------------------- -Base.collect(t::TensorMap) = convert(TensorMapWithStorage{scalartype(t), similarstoragetype(scalartype(t))}, t) +to_cpu(t::TensorMapWithStorage{T, Vector{T}}) where {T} = t # no op +to_cpu(t::TensorMap) = convert(TensorMapWithStorage{scalartype(t), similarstoragetype(scalartype(t))}, t) # Efficient copy constructors #----------------------------- diff --git a/test/cuda/tensors.jl b/test/cuda/tensors.jl index 67981b5ae..f453bcda2 100644 --- a/test/cuda/tensors.jl +++ b/test/cuda/tensors.jl @@ -120,7 +120,7 @@ for V in spacelist for T in (Int, Float32, ComplexF64) t = @constinferred CUDA.rand(T, W) d = convert(Dict, t) - @test collect(t) == convert(TensorMap, d) + @test TensorKit.to_cpu(t) == convert(TensorMap, d) end end @timedtestset "Basic linear algebra" begin @@ -212,10 +212,10 @@ for V in spacelist t = CUDA.rand(T, W) t2 = @constinferred CUDA.rand!(similar(t)) α = rand(T) - @test norm(t, 2) ≈ norm(collect(t), 2) - @test dot(t2, t) ≈ dot(collect(t2), collect(t)) - @test collect(α * t) ≈ α * collect(t) - @test collect(t + t) ≈ 2 * collect(t) + @test norm(t, 2) ≈ norm(TensorKit.to_cpu(t), 2) + @test dot(t2, t) ≈ dot(TensorKit.to_cpu(t2), TensorKit.to_cpu(t)) + @test TensorKit.to_cpu(α * t) ≈ α * TensorKit.to_cpu(t) + @test TensorKit.to_cpu(t + t) ≈ 2 * TensorKit.to_cpu(t) end end @timedtestset "Real and imaginary parts" begin @@ -225,17 +225,17 @@ for V in spacelist tr = @constinferred real(t) @test scalartype(tr) <: Real - @test real(collect(t)) == collect(tr) + @test real(TensorKit.to_cpu(t)) == TensorKit.to_cpu(tr) @test storagetype(tr) == CuVector{real(T), CUDA.DeviceMemory} ti = @constinferred imag(t) @test scalartype(ti) <: Real - @test imag(collect(t)) == collect(ti) + @test imag(TensorKit.to_cpu(t)) == TensorKit.to_cpu(ti) @test storagetype(ti) == CuVector{real(T), CUDA.DeviceMemory} tc = @inferred complex(t) @test scalartype(tc) <: Complex - @test complex(collect(t)) == collect(tc) + @test complex(TensorKit.to_cpu(t)) == TensorKit.to_cpu(tc) @test storagetype(tc) == CuVector{complex(T), CUDA.DeviceMemory} tc2 = @inferred complex(tr, ti) @@ -295,19 +295,19 @@ for V in spacelist @timedtestset "Permutations: test via CPU" begin W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5 t = CUDA.rand(ComplexF64, W) - a = convert(Array, collect(t)) + a = convert(Array, TensorKit.to_cpu(t)) for k in 0:5 for p in permutations(1:5) p1 = ntuple(n -> p[n], k) p2 = ntuple(n -> p[k + n], 5 - k) dt2 = CUDA.@allowscalar permute(t, (p1, p2)) - ht2 = permute(collect(t), (p1, p2)) - @test ht2 == collect(dt2) + ht2 = permute(TensorKit.to_cpu(t), (p1, p2)) + @test ht2 == TensorKit.to_cpu(dt2) end dt3 = CUDA.@allowscalar repartition(t, k) - ht3 = repartition(collect(t), k) - @test ht3 == collect(dt3) + ht3 = repartition(TensorKit.to_cpu(t), k) + @test ht3 == TensorKit.to_cpu(dt3) end end end @@ -368,10 +368,10 @@ for V in spacelist @tensor dHrA12[a, s1, s2, c] := drhoL[a, a'] * conj(dA1[a', t1, b]) * dA2[b, t2, c'] * drhoR[c', c] * dH[s1, s2, t1, t2] - @tensor hHrA12[a, s1, s2, c] := collect(drhoL)[a, a'] * conj(collect(dA1)[a', t1, b]) * - collect(dA2)[b, t2, c'] * collect(drhoR)[c', c] * - collect(dH)[s1, s2, t1, t2] - @test collect(dHrA12) ≈ hHrA12 + @tensor 
hHrA12[a, s1, s2, c] := TensorKit.to_cpu(drhoL)[a, a'] * conj(TensorKit.to_cpu(dA1)[a', t1, b]) * + TensorKit.to_cpu(dA2)[b, t2, c'] * TensorKit.to_cpu(drhoR)[c', c] * + TensorKit.to_cpu(dH)[s1, s2, t1, t2] + @test TensorKit.to_cpu(dHrA12) ≈ hHrA12 end end=# # doesn't yet work because of AdjointTensor @timedtestset "Index flipping: test flipping inverse" begin @@ -450,48 +450,48 @@ for V in spacelist t1 = CUDA.rand(T, W1, W1) t2 = CUDA.rand(T, W2, W2) t = CUDA.rand(T, W1, W2) - ht1 = collect(t1) - ht2 = collect(t2) - ht = collect(t) - @test collect(t1 * t) ≈ ht1 * ht - @test collect(t1' * t) ≈ ht1' * ht - @test collect(t2 * t') ≈ ht2 * ht' - @test collect(t2' * t') ≈ ht2' * ht' + ht1 = TensorKit.to_cpu(t1) + ht2 = TensorKit.to_cpu(t2) + ht = TensorKit.to_cpu(t) + @test TensorKit.to_cpu(t1 * t) ≈ ht1 * ht + @test TensorKit.to_cpu(t1' * t) ≈ ht1' * ht + @test TensorKit.to_cpu(t2 * t') ≈ ht2 * ht' + @test TensorKit.to_cpu(t2' * t') ≈ ht2' * ht' - @test collect(inv(t1)) ≈ inv(ht1) - @test collect(pinv(t)) ≈ pinv(ht) + @test TensorKit.to_cpu(inv(t1)) ≈ inv(ht1) + @test TensorKit.to_cpu(pinv(t)) ≈ pinv(ht) if T == Float32 || T == ComplexF32 continue end - @test collect(t1 \ t) ≈ ht1 \ ht - @test collect(t1' \ t) ≈ ht1' \ ht - @test collect(t2 \ t') ≈ ht2 \ ht' - @test collect(t2' \ t') ≈ ht2' \ ht' + @test TensorKit.to_cpu(t1 \ t) ≈ ht1 \ ht + @test TensorKit.to_cpu(t1' \ t) ≈ ht1' \ ht + @test TensorKit.to_cpu(t2 \ t') ≈ ht2 \ ht' + @test TensorKit.to_cpu(t2' \ t') ≈ ht2' \ ht' - @test collect(t2 / t) ≈ ht2 / ht - @test collect(t2' / t) ≈ ht2' / ht - @test collect(t1 / t') ≈ ht1 / ht' - @test collect(t1' / t') ≈ ht1' / ht' + @test TensorKit.to_cpu(t2 / t) ≈ ht2 / ht + @test TensorKit.to_cpu(t2' / t) ≈ ht2' / ht + @test TensorKit.to_cpu(t1 / t') ≈ ht1 / ht' + @test TensorKit.to_cpu(t1' / t') ≈ ht1' / ht' end end end if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) - @timedtestset "Tensor functions" begin + #=@timedtestset "Tensor functions" begin W = V1 ⊗ V2 for T in (Float64, ComplexF64) t = CUDA.randn(T, W, W) s = dim(W) @test_broken (@constinferred sqrt(t))^2 ≈ t - @test_broken collect(sqrt(t)) ≈ sqrt(collect(t)) + @test_broken TensorKit.to_cpu(sqrt(t)) ≈ sqrt(TensorKit.to_cpu(t)) expt = @constinferred exp(t) - @test_broken collect(expt) ≈ exp(collect(t)) + @test_broken TensorKit.to_cpu(expt) ≈ exp(TensorKit.to_cpu(t)) # log doesn't work on CUDA yet (scalar indexing) #@test exp(@constinferred log(expt)) ≈ expt - #@test collect(log(expt)) ≈ log(collect(expt)) + #@test TensorKit.to_cpu(log(expt)) ≈ log(TensorKit.to_cpu(expt)) #=@test (@constinferred cos(t))^2 + (@constinferred sin(t))^2 ≈ id(storagetype(t), W) @@ -520,7 +520,7 @@ for V in spacelist @test coth(@constinferred acoth(t8)) ≈ t8=# # TODO in CUDA end - end + end=# end # Sylvester not defined for CUDA # @timedtestset "Sylvester equation" begin From 68b282a720e998d722125bf6df38dbe3e9aab261 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Fri, 19 Dec 2025 18:07:59 +0100 Subject: [PATCH 09/19] Update test/cuda/tensors.jl Co-authored-by: Lukas Devos --- test/cuda/tensors.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/cuda/tensors.jl b/test/cuda/tensors.jl index f453bcda2..1c8125ba2 100644 --- a/test/cuda/tensors.jl +++ b/test/cuda/tensors.jl @@ -8,7 +8,7 @@ const CuTensorMap = getglobal(CUDAExt, :CuTensorMap) const curand = getglobal(CUDAExt, :curand) const curandn = getglobal(CUDAExt, :curandn) const curand! = getglobal(CUDAExt, :curand!) -const curandn! = getglobal(CUDAExt, :curandn!) 
+using CUDA: rand as curand, rand! as curand!, randn as curandn, randn! as curandn! @isdefined(TestSetup) || include("../setup.jl") using .TestSetup From f188aed7a8a0b6a1b0bb63f6dc2ec322c2c82026 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Fri, 19 Dec 2025 18:08:07 +0100 Subject: [PATCH 10/19] Update test/cuda/tensors.jl Co-authored-by: Lukas Devos --- test/cuda/tensors.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/test/cuda/tensors.jl b/test/cuda/tensors.jl index 1c8125ba2..ae979c01e 100644 --- a/test/cuda/tensors.jl +++ b/test/cuda/tensors.jl @@ -295,7 +295,6 @@ for V in spacelist @timedtestset "Permutations: test via CPU" begin W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5 t = CUDA.rand(ComplexF64, W) - a = convert(Array, TensorKit.to_cpu(t)) for k in 0:5 for p in permutations(1:5) p1 = ntuple(n -> p[n], k) From 329afe6342e79bdd4430e0dea10a6f6aa078328c Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Fri, 19 Dec 2025 18:08:17 +0100 Subject: [PATCH 11/19] Update test/cuda/tensors.jl Co-authored-by: Lukas Devos --- test/cuda/tensors.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/test/cuda/tensors.jl b/test/cuda/tensors.jl index ae979c01e..c5863ac43 100644 --- a/test/cuda/tensors.jl +++ b/test/cuda/tensors.jl @@ -441,7 +441,6 @@ for V in spacelist @test tp ≈ tp * tp end end - if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) @timedtestset "Multiplication and inverse: test via CPU" begin W1 = V1 ⊗ V2 ⊗ V3 W2 = V4 ⊗ V5 From e55f3380673cdb07480be5f053eb8e9ead85942b Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Fri, 19 Dec 2025 18:08:28 +0100 Subject: [PATCH 12/19] Update test/cuda/tensors.jl Co-authored-by: Lukas Devos --- test/cuda/tensors.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/cuda/tensors.jl b/test/cuda/tensors.jl index c5863ac43..4ebf99c73 100644 --- a/test/cuda/tensors.jl +++ b/test/cuda/tensors.jl @@ -291,7 +291,7 @@ for V in spacelist end end end - if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) + if BraidingStyle(I) isa SymmetricBraiding @timedtestset "Permutations: test via CPU" begin W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5 t = CUDA.rand(ComplexF64, W) From 61ba421d1bc0c52a17802e10815faa47cfe5ac52 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Mon, 22 Dec 2025 05:39:43 -0500 Subject: [PATCH 13/19] Fix bad end --- test/cuda/tensors.jl | 55 ++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/test/cuda/tensors.jl b/test/cuda/tensors.jl index 4ebf99c73..9f52161a8 100644 --- a/test/cuda/tensors.jl +++ b/test/cuda/tensors.jl @@ -441,38 +441,37 @@ for V in spacelist @test tp ≈ tp * tp end end - @timedtestset "Multiplication and inverse: test via CPU" begin - W1 = V1 ⊗ V2 ⊗ V3 - W2 = V4 ⊗ V5 - for T in (Float32, Float64, ComplexF32, ComplexF64) - t1 = CUDA.rand(T, W1, W1) - t2 = CUDA.rand(T, W2, W2) - t = CUDA.rand(T, W1, W2) - ht1 = TensorKit.to_cpu(t1) - ht2 = TensorKit.to_cpu(t2) - ht = TensorKit.to_cpu(t) - @test TensorKit.to_cpu(t1 * t) ≈ ht1 * ht - @test TensorKit.to_cpu(t1' * t) ≈ ht1' * ht - @test TensorKit.to_cpu(t2 * t') ≈ ht2 * ht' - @test TensorKit.to_cpu(t2' * t') ≈ ht2' * ht' + @timedtestset "Multiplication and inverse: test via CPU" begin + W1 = V1 ⊗ V2 ⊗ V3 + W2 = V4 ⊗ V5 + for T in (Float32, Float64, ComplexF32, ComplexF64) + t1 = CUDA.rand(T, W1, W1) + t2 = CUDA.rand(T, W2, W2) + t = CUDA.rand(T, W1, W2) + ht1 = TensorKit.to_cpu(t1) + ht2 = TensorKit.to_cpu(t2) + ht = TensorKit.to_cpu(t) + @test TensorKit.to_cpu(t1 * t) ≈ ht1 * ht + @test TensorKit.to_cpu(t1' * t) ≈ ht1' 
* ht + @test TensorKit.to_cpu(t2 * t') ≈ ht2 * ht' + @test TensorKit.to_cpu(t2' * t') ≈ ht2' * ht' - @test TensorKit.to_cpu(inv(t1)) ≈ inv(ht1) - @test TensorKit.to_cpu(pinv(t)) ≈ pinv(ht) + @test TensorKit.to_cpu(inv(t1)) ≈ inv(ht1) + @test TensorKit.to_cpu(pinv(t)) ≈ pinv(ht) - if T == Float32 || T == ComplexF32 - continue - end + if T == Float32 || T == ComplexF32 + continue + end - @test TensorKit.to_cpu(t1 \ t) ≈ ht1 \ ht - @test TensorKit.to_cpu(t1' \ t) ≈ ht1' \ ht - @test TensorKit.to_cpu(t2 \ t') ≈ ht2 \ ht' - @test TensorKit.to_cpu(t2' \ t') ≈ ht2' \ ht' + @test TensorKit.to_cpu(t1 \ t) ≈ ht1 \ ht + @test TensorKit.to_cpu(t1' \ t) ≈ ht1' \ ht + @test TensorKit.to_cpu(t2 \ t') ≈ ht2 \ ht' + @test TensorKit.to_cpu(t2' \ t') ≈ ht2' \ ht' - @test TensorKit.to_cpu(t2 / t) ≈ ht2 / ht - @test TensorKit.to_cpu(t2' / t) ≈ ht2' / ht - @test TensorKit.to_cpu(t1 / t') ≈ ht1 / ht' - @test TensorKit.to_cpu(t1' / t') ≈ ht1' / ht' - end + @test TensorKit.to_cpu(t2 / t) ≈ ht2 / ht + @test TensorKit.to_cpu(t2' / t) ≈ ht2' / ht + @test TensorKit.to_cpu(t1 / t') ≈ ht1 / ht' + @test TensorKit.to_cpu(t1' / t') ≈ ht1' / ht' end end if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) From 1c75de55adfa3847bf53af7fb9cfb294eaf4dcc3 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Mon, 22 Dec 2025 05:53:44 -0500 Subject: [PATCH 14/19] Make format happy --- src/tensors/tensor.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/tensors/tensor.jl b/src/tensors/tensor.jl index d8118bcd7..69467963c 100644 --- a/src/tensors/tensor.jl +++ b/src/tensors/tensor.jl @@ -316,15 +316,15 @@ end for randf in (:rand, :randn, :randexp, :randisometry) _docstr = """ - $randf([rng=default_rng()], [TorA=Float64], codomain::ProductSpace{S,N₁}, + $randf([rng=default_rng()], [TorA=Float64], codomain::ProductSpace{S,N₁}, domain::ProductSpace{S,N₂}) where {S,N₁,N₂,T} -> t - $randf([rng=default_rng()], [TorA=Float64], codomain ← domain) -> t + $randf([rng=default_rng()], [TorA=Float64], codomain ← domain) -> t Generate a tensor `t` with entries generated by `$randf`. - The type `TorA` can be used to control the element type and - data type generated. For example, if `TorA` is a `CuVector{ComplexF32}` - or `ROCVector{Float64}`, then the final output `TensorMap` will have - that as its storage type. + The type `TorA` can be used to control the element type and + data type generated. For example, if `TorA` is a `CuVector{ComplexF32}` + or `ROCVector{Float64}`, then the final output `TensorMap` will have + that as its storage type. See also [`Random.$(randf)!`](@ref). 
""" From 3949ec000b79cadbd6aed5496f5df229c582535f Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Mon, 22 Dec 2025 14:27:26 -0500 Subject: [PATCH 15/19] Small updates --- Project.toml | 3 +++ test/cuda/tensors.jl | 16 ++++++++-------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/Project.toml b/Project.toml index c0c05603f..41d8a8002 100644 --- a/Project.toml +++ b/Project.toml @@ -77,3 +77,6 @@ cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1" [targets] test = ["ArgParse", "Adapt", "Aqua", "Combinatorics", "CUDA", "cuTENSOR", "GPUArrays", "LinearAlgebra", "SafeTestsets", "TensorOperations", "Test", "TestExtras", "ChainRulesCore", "ChainRulesTestUtils", "FiniteDifferences", "Zygote"] + +[sources] +MatrixAlgebraKit = {url="https://github.com/QuantumKitHub/MatrixAlgebraKit.jl", rev="main"} diff --git a/test/cuda/tensors.jl b/test/cuda/tensors.jl index 9f52161a8..8a89a8ad8 100644 --- a/test/cuda/tensors.jl +++ b/test/cuda/tensors.jl @@ -475,20 +475,20 @@ for V in spacelist end end if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) - #=@timedtestset "Tensor functions" begin + @timedtestset "Tensor functions" begin W = V1 ⊗ V2 for T in (Float64, ComplexF64) - t = CUDA.randn(T, W, W) + t = project_hermitian!(CUDA.randn(T, W, W)) s = dim(W) - @test_broken (@constinferred sqrt(t))^2 ≈ t - @test_broken TensorKit.to_cpu(sqrt(t)) ≈ sqrt(TensorKit.to_cpu(t)) + #@test (@constinferred sqrt(t))^2 ≈ t + #@test TensorKit.to_cpu(sqrt(t)) ≈ sqrt(TensorKit.to_cpu(t)) expt = @constinferred exp(t) - @test_broken TensorKit.to_cpu(expt) ≈ exp(TensorKit.to_cpu(t)) + @test TensorKit.to_cpu(expt) ≈ exp(TensorKit.to_cpu(t)) # log doesn't work on CUDA yet (scalar indexing) - #@test exp(@constinferred log(expt)) ≈ expt - #@test TensorKit.to_cpu(log(expt)) ≈ log(TensorKit.to_cpu(expt)) + #@test exp(@constinferred log(project_hermitian!(expt))) ≈ expt + #@test TensorKit.to_cpu(log(project_hermitian!(expt))) ≈ log(TensorKit.to_cpu(expt)) #=@test (@constinferred cos(t))^2 + (@constinferred sin(t))^2 ≈ id(storagetype(t), W) @@ -517,7 +517,7 @@ for V in spacelist @test coth(@constinferred acoth(t8)) ≈ t8=# # TODO in CUDA end - end=# + end end # Sylvester not defined for CUDA # @timedtestset "Sylvester equation" begin From 7172691443dd2bb01bb14d564282e6dc025da15e Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Sun, 28 Dec 2025 15:43:03 +0100 Subject: [PATCH 16/19] Update Project.toml --- Project.toml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/Project.toml b/Project.toml index 41d8a8002..f56b35ba7 100644 --- a/Project.toml +++ b/Project.toml @@ -40,7 +40,7 @@ FiniteDifferences = "0.12" GPUArrays = "11.3.1" LRUCache = "1.0.2" LinearAlgebra = "1" -MatrixAlgebraKit = "0.6.0" +MatrixAlgebraKit = "0.6.1" OhMyThreads = "0.8.0" Printf = "1" Random = "1" @@ -77,6 +77,3 @@ cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1" [targets] test = ["ArgParse", "Adapt", "Aqua", "Combinatorics", "CUDA", "cuTENSOR", "GPUArrays", "LinearAlgebra", "SafeTestsets", "TensorOperations", "Test", "TestExtras", "ChainRulesCore", "ChainRulesTestUtils", "FiniteDifferences", "Zygote"] - -[sources] -MatrixAlgebraKit = {url="https://github.com/QuantumKitHub/MatrixAlgebraKit.jl", rev="main"} From 5a9c8cdfaf3f21cc1310b8b15a06ac59efe933d2 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Sun, 28 Dec 2025 18:14:18 +0100 Subject: [PATCH 17/19] Try to fix spacing --- src/tensors/tensor.jl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/tensors/tensor.jl 
b/src/tensors/tensor.jl index 69467963c..e60d4d936 100644 --- a/src/tensors/tensor.jl +++ b/src/tensors/tensor.jl @@ -316,15 +316,15 @@ end for randf in (:rand, :randn, :randexp, :randisometry) _docstr = """ - $randf([rng=default_rng()], [TorA=Float64], codomain::ProductSpace{S,N₁}, - domain::ProductSpace{S,N₂}) where {S,N₁,N₂,T} -> t - $randf([rng=default_rng()], [TorA=Float64], codomain ← domain) -> t + $randf([rng=default_rng()], [TorA=Float64], codomain::ProductSpace{S,N₁}, + domain::ProductSpace{S,N₂}) where {S,N₁,N₂,T} -> t + $randf([rng=default_rng()], [TorA=Float64], codomain ← domain) -> t Generate a tensor `t` with entries generated by `$randf`. - The type `TorA` can be used to control the element type and - data type generated. For example, if `TorA` is a `CuVector{ComplexF32}` - or `ROCVector{Float64}`, then the final output `TensorMap` will have - that as its storage type. + The type `TorA` can be used to control the element type and + data type generated. For example, if `TorA` is a `CuVector{ComplexF32}` + or `ROCVector{Float64}`, then the final output `TensorMap` will have + that as its storage type. See also [`Random.$(randf)!`](@ref). """ From 658235a1dd1843d5fb80b6f3264179b728ec65e3 Mon Sep 17 00:00:00 2001 From: Katharine Hyatt Date: Wed, 31 Dec 2025 12:13:42 +0100 Subject: [PATCH 18/19] one more format attempt --- src/tensors/tensor.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tensors/tensor.jl b/src/tensors/tensor.jl index e60d4d936..d8118bcd7 100644 --- a/src/tensors/tensor.jl +++ b/src/tensors/tensor.jl @@ -317,7 +317,7 @@ end for randf in (:rand, :randn, :randexp, :randisometry) _docstr = """ $randf([rng=default_rng()], [TorA=Float64], codomain::ProductSpace{S,N₁}, - domain::ProductSpace{S,N₂}) where {S,N₁,N₂,T} -> t + domain::ProductSpace{S,N₂}) where {S,N₁,N₂,T} -> t $randf([rng=default_rng()], [TorA=Float64], codomain ← domain) -> t Generate a tensor `t` with entries generated by `$randf`. From 4d79f1445eccde9561c9445e16148c965fc3c1e8 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Thu, 1 Jan 2026 13:48:08 +0100 Subject: [PATCH 19/19] two more format attempts --- src/tensors/tensor.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tensors/tensor.jl b/src/tensors/tensor.jl index d8118bcd7..300654632 100644 --- a/src/tensors/tensor.jl +++ b/src/tensors/tensor.jl @@ -319,7 +319,7 @@ for randf in (:rand, :randn, :randexp, :randisometry) $randf([rng=default_rng()], [TorA=Float64], codomain::ProductSpace{S,N₁}, domain::ProductSpace{S,N₂}) where {S,N₁,N₂,T} -> t $randf([rng=default_rng()], [TorA=Float64], codomain ← domain) -> t - + Generate a tensor `t` with entries generated by `$randf`. The type `TorA` can be used to control the element type and data type generated. For example, if `TorA` is a `CuVector{ComplexF32}`
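
For reviewers following the series, a minimal end-to-end sketch of the surface these patches build up. This is not part of the patches themselves: it assumes the extension activates when CUDA and cuTENSOR are loaded alongside TensorKit, that `CUDA.rand` and the unexported `TensorKit.to_cpu` behave as in the test diffs of patch 08, and that the storage-type form of `randn` matches the docstring revised in patches 14-19; the space `V` is an arbitrary illustrative choice.

    using TensorKit, CUDA, cuTENSOR
    using LinearAlgebra: norm

    V = ℂ^2                                  # small complex vector space for illustration
    t = CUDA.rand(ComplexF64, V ⊗ V, V)      # CuTensorMap: TensorMap with CuVector storage
    ht = TensorKit.to_cpu(t)                 # patch 08 helper; a no-op on CPU-backed tensors
    @assert norm(t) ≈ norm(ht)               # device and host agree, as checked in the tests

    # Per the revised docstring, the storage type can be requested directly
    # (hedged: signature taken from that docstring, not verified here):
    t32 = randn(CuVector{ComplexF32}, V ⊗ V ← V)

Replacing the earlier `Base.collect`-based conversion with an explicit `to_cpu` helper presumably makes the host transfer visible at call sites and leaves `collect` free for its usual meaning.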