diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 6c414a31b..2b7905fc4 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -15,12 +15,12 @@ steps: queue: "juliagpu" cuda: "*" if: build.message !~ /\[skip tests\]/ - timeout_in_minutes: 30 + timeout_in_minutes: 60 matrix: setup: julia: - "1.10" - - "1.11" + - "1.12" - label: "Julia {{matrix.julia}} -- AMDGPU" plugins: @@ -36,9 +36,9 @@ steps: rocm: "*" rocmgpu: "*" if: build.message !~ /\[skip tests\]/ - timeout_in_minutes: 30 + timeout_in_minutes: 60 matrix: setup: julia: - "1.10" - - "1.11" + - "1.12" diff --git a/Project.toml b/Project.toml index 6c22acc60..f56b35ba7 100644 --- a/Project.toml +++ b/Project.toml @@ -18,23 +18,29 @@ TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6" VectorInterface = "409d34a3-91d5-4945-b6ec-7529ddf182d8" [weakdeps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" +cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1" [extensions] +TensorKitCUDAExt = ["CUDA", "cuTENSOR"] TensorKitChainRulesCoreExt = "ChainRulesCore" TensorKitFiniteDifferencesExt = "FiniteDifferences" [compat] +Adapt = "4" Aqua = "0.6, 0.7, 0.8" ArgParse = "1.2.0" +CUDA = "5.9" ChainRulesCore = "1" ChainRulesTestUtils = "1" Combinatorics = "1" FiniteDifferences = "0.12" +GPUArrays = "11.3.1" LRUCache = "1.0.2" LinearAlgebra = "1" -MatrixAlgebraKit = "0.6.0" +MatrixAlgebraKit = "0.6.1" OhMyThreads = "0.8.0" Printf = "1" Random = "1" @@ -48,21 +54,26 @@ TestExtras = "0.2,0.3" TupleTools = "1.1" VectorInterface = "0.4.8, 0.5" Zygote = "0.7" +cuTENSOR = "2" julia = "1.10" [extras] -ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" +ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a" Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" +GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" TensorOperations = "6aa20fa7-93e2-5fca-9bc0-fbd0db3c71a2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TestExtras = "5ed8adda-3752-4e41-b88a-e8b09835ee3a" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" +cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1" [targets] -test = ["ArgParse", "Aqua", "Combinatorics", "LinearAlgebra", "TensorOperations", "Test", "TestExtras", "SafeTestsets", "ChainRulesCore", "ChainRulesTestUtils", "FiniteDifferences", "Zygote"] +test = ["ArgParse", "Adapt", "Aqua", "Combinatorics", "CUDA", "cuTENSOR", "GPUArrays", "LinearAlgebra", "SafeTestsets", "TensorOperations", "Test", "TestExtras", "ChainRulesCore", "ChainRulesTestUtils", "FiniteDifferences", "Zygote"] diff --git a/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl new file mode 100644 index 000000000..417970a02 --- /dev/null +++ b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl @@ -0,0 +1,21 @@ +module TensorKitCUDAExt + +using CUDA, CUDA.CUBLAS, CUDA.CUSOLVER, LinearAlgebra +using CUDA: @allowscalar +using cuTENSOR: cuTENSOR +import CUDA: rand as curand, rand! as curand!, randn as curandn, randn! as curandn! 
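+# NB: `import ... as` aliases CUDA's own `rand`/`randn` (not `Base.rand`), so the
+# methods defined in cutensormap.jl extend CUDA's functions for TensorKit's own
+# space types, without committing type piracy.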
+ +using TensorKit +using TensorKit.Factorizations +using TensorKit.Strided +using TensorKit.Factorizations: AbstractAlgorithm +using TensorKit: SectorDict, tensormaptype, scalar, similarstoragetype, AdjointTensorMap, scalartype, project_symmetric_and_check +import TensorKit: randisometry, rand, randn + +using TensorKit: MatrixAlgebraKit + +using Random + +include("cutensormap.jl") + +end diff --git a/ext/TensorKitCUDAExt/cutensormap.jl b/ext/TensorKitCUDAExt/cutensormap.jl new file mode 100644 index 000000000..b4aa7f7a2 --- /dev/null +++ b/ext/TensorKitCUDAExt/cutensormap.jl @@ -0,0 +1,166 @@ +const CuTensorMap{T, S, N₁, N₂} = TensorMap{T, S, N₁, N₂, CuVector{T, CUDA.DeviceMemory}} +const CuTensor{T, S, N} = CuTensorMap{T, S, N, 0} + +const AdjointCuTensorMap{T, S, N₁, N₂} = AdjointTensorMap{T, S, N₁, N₂, CuTensorMap{T, S, N₁, N₂}} + +function CuTensorMap(t::TensorMap{T, S, N₁, N₂, A}) where {T, S, N₁, N₂, A} + return CuTensorMap{T, S, N₁, N₂}(CuArray{T}(t.data), space(t)) +end + +# project_symmetric! doesn't yet work for GPU types, so do this on the host, then copy +function TensorKit.project_symmetric_and_check(::Type{T}, ::Type{A}, data::AbstractArray, V::TensorMapSpace; tol = sqrt(eps(real(float(eltype(data)))))) where {T, A <: CuVector{T}} + h_t = TensorKit.TensorMapWithStorage{T, Vector{T}}(undef, V) + h_t = TensorKit.project_symmetric!(h_t, Array(data)) + # verify result + isapprox(Array(reshape(data, dims(h_t))), convert(Array, h_t); atol = tol) || + throw(ArgumentError("Data has non-zero elements at incompatible positions")) + return TensorKit.TensorMapWithStorage{T, A}(A(h_t.data), V) +end + +for (fname, felt) in ((:zeros, :zero), (:ones, :one)) + @eval begin + function CUDA.$fname( + codomain::TensorSpace{S}, + domain::TensorSpace{S} = one(codomain) + ) where {S <: IndexSpace} + return CUDA.$fname(codomain ← domain) + end + function CUDA.$fname( + ::Type{T}, codomain::TensorSpace{S}, + domain::TensorSpace{S} = one(codomain) + ) where {T, S <: IndexSpace} + return CUDA.$fname(T, codomain ← domain) + end + CUDA.$fname(V::TensorMapSpace) = CUDA.$fname(Float64, V) + function CUDA.$fname(::Type{T}, V::TensorMapSpace) where {T} + t = CuTensorMap{T}(undef, V) + fill!(t, $felt(T)) + return t + end + end +end + +for randfun in (:curand, :curandn) + randfun! = Symbol(randfun, :!) 
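+    # mirror TensorKit's own constructor boilerplate for CUDA's rand/randn:
+    # accept (codomain, domain) arguments, fill in the default element type
+    # (Float64) and default RNG, then fill the freshly allocated CuTensorMap
+    # block by block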
+    @eval begin
+        # converting `codomain` and `domain` into `HomSpace`
+        function $randfun(
+                codomain::TensorSpace{S},
+                domain::TensorSpace{S} = one(codomain),
+            ) where {S <: IndexSpace}
+            return $randfun(codomain ← domain)
+        end
+        function $randfun(
+                ::Type{T}, codomain::TensorSpace{S},
+                domain::TensorSpace{S} = one(codomain),
+            ) where {T, S <: IndexSpace}
+            return $randfun(T, codomain ← domain)
+        end
+        function $randfun(
+                rng::Random.AbstractRNG, ::Type{T},
+                codomain::TensorSpace{S},
+                domain::TensorSpace{S} = one(codomain),
+            ) where {T, S <: IndexSpace}
+            return $randfun(rng, T, codomain ← domain)
+        end
+
+        # filling in default eltype
+        $randfun(V::TensorMapSpace) = $randfun(Float64, V)
+        function $randfun(rng::Random.AbstractRNG, V::TensorMapSpace)
+            return $randfun(rng, Float64, V)
+        end
+
+        # filling in default rng
+        function $randfun(::Type{T}, V::TensorMapSpace) where {T}
+            return $randfun(Random.default_rng(), T, V)
+        end
+
+        # implementation
+        function $randfun(
+                rng::Random.AbstractRNG, ::Type{T},
+                V::TensorMapSpace
+            ) where {T}
+            t = CuTensorMap{T}(undef, V)
+            $randfun!(rng, t)
+            return t
+        end
+
+        function $randfun!(rng::Random.AbstractRNG, t::CuTensorMap)
+            for (_, b) in blocks(t)
+                $randfun!(rng, b)
+            end
+            return t
+        end
+    end
+end
+
+# Scalar implementation
+#-----------------------
+function TensorKit.scalar(t::CuTensorMap{T, S, 0, 0}) where {T, S}
+    inds = findall(!iszero, t.data)
+    return isempty(inds) ? zero(scalartype(t)) : @allowscalar @inbounds t.data[only(inds)]
+end
+
+function Base.convert(
+        TT::Type{CuTensorMap{T, S, N₁, N₂}},
+        t::AbstractTensorMap{<:Any, S, N₁, N₂}
+    ) where {T, S, N₁, N₂}
+    if typeof(t) === TT
+        return t
+    else
+        tnew = TT(undef, space(t))
+        return copy!(tnew, t)
+    end
+end
+
+function LinearAlgebra.isposdef(t::CuTensorMap)
+    domain(t) == codomain(t) ||
+        throw(SpaceMismatch("`isposdef` requires domain and codomain to be the same"))
+    InnerProductStyle(spacetype(t)) === EuclideanInnerProduct() || return false
+    for (c, b) in blocks(t)
+        # do our own hermitian check
+        isherm = MatrixAlgebraKit.ishermitian(b; atol = eps(real(eltype(b))), rtol = eps(real(eltype(b))))
+        isherm || return false
+        isposdef(Hermitian(b)) || return false
+    end
+    return true
+end
+
+function Base.promote_rule(
+        ::Type{<:TT₁},
+        ::Type{<:TT₂}
+    ) where {
+        S, N₁, N₂, TTT₁, TTT₂,
+        TT₁ <: CuTensorMap{TTT₁, S, N₁, N₂},
+        TT₂ <: CuTensorMap{TTT₂, S, N₁, N₂},
+    }
+    T = TensorKit.VectorInterface.promote_add(TTT₁, TTT₂)
+    return CuTensorMap{T, S, N₁, N₂}
+end
+
+# CuTensorMap exponentiation:
+function TensorKit.exp!(t::CuTensorMap)
+    domain(t) == codomain(t) ||
+        error("Exponential of a tensor only exists when domain == codomain.")
+    !MatrixAlgebraKit.ishermitian(t) && throw(ArgumentError("`exp!` is currently only supported on hermitian CUDA tensors"))
+    for (c, b) in blocks(t)
+        copy!(b, parent(Base.exp(Hermitian(b))))
+    end
+    return t
+end
+
+# functions that don't map ℝ to (a subset of) ℝ
+for f in (:sqrt, :log, :asin, :acos, :acosh, :atanh, :acoth)
+    sf = string(f)
+    @eval function Base.$f(t::CuTensorMap)
+        domain(t) == codomain(t) ||
+            throw(SpaceMismatch("`$($sf)` of a tensor only exists when domain == codomain"))
+        !MatrixAlgebraKit.ishermitian(t) && throw(ArgumentError("`$($sf)` is currently only supported on hermitian CUDA tensors"))
+        T = complex(float(scalartype(t)))
+        tf = similar(t, T)
+        for (c, b) in blocks(t)
+            copy!(block(tf, c), parent($f(Hermitian(b))))
+        end
+        return tf
+    end
+end
diff --git a/src/tensors/linalg.jl b/src/tensors/linalg.jl
index
44e7be002..4b2c7d4ee 100644 --- a/src/tensors/linalg.jl +++ b/src/tensors/linalg.jl @@ -270,20 +270,11 @@ function _norm(blockiter, p::Real, init::Real) return mapreduce(max, blockiter; init = init) do (c, b) return isempty(b) ? init : oftype(init, LinearAlgebra.normInf(b)) end - elseif p == 2 - n² = mapreduce(+, blockiter; init = init) do (c, b) - return isempty(b) ? init : oftype(init, dim(c) * LinearAlgebra.norm2(b)^2) + elseif p > 0 # finite positive p + np = sum(blockiter; init) do (c, b) + return oftype(init, dim(c) * norm(b, p)^p) end - return sqrt(n²) - elseif p == 1 - return mapreduce(+, blockiter; init = init) do (c, b) - return isempty(b) ? init : oftype(init, dim(c) * sum(abs, b)) - end - elseif p > 0 - nᵖ = mapreduce(+, blockiter; init = init) do (c, b) - return isempty(b) ? init : oftype(init, dim(c) * LinearAlgebra.normp(b, p)^p) - end - return (nᵖ)^inv(oftype(nᵖ, p)) + return np^(inv(oftype(np, p))) else msg = "Norm with non-positive p is not defined for `AbstractTensorMap`" throw(ArgumentError(msg)) @@ -298,8 +289,8 @@ function LinearAlgebra.rank( ) r = 0 * dim(first(allunits(sectortype(t)))) dim(t) == 0 && return r - S = LinearAlgebra.svdvals(t) - tol = max(atol, rtol * maximum(first, values(S))) + S = MatrixAlgebraKit.svd_vals(t) + tol = max(atol, rtol * maximum(parent(S))) for (c, b) in pairs(S) if !isempty(b) r += dim(c) * count(>(tol), b) @@ -316,9 +307,9 @@ function LinearAlgebra.cond(t::AbstractTensorMap, p::Real = 2) throw(SpaceMismatch("`cond` requires domain and codomain to be the same")) return zero(real(float(scalartype(t)))) end - S = LinearAlgebra.svdvals(t) - maxS = maximum(first, values(S)) - minS = minimum(last, values(S)) + S = MatrixAlgebraKit.svd_vals(t) + maxS = maximum(parent(S)) + minS = minimum(parent(S)) return iszero(maxS) ? oftype(maxS, Inf) : (maxS / minS) else throw(ArgumentError("cond currently only defined for p=2")) diff --git a/src/tensors/tensor.jl b/src/tensors/tensor.jl index 452072a87..300654632 100644 --- a/src/tensors/tensor.jl +++ b/src/tensors/tensor.jl @@ -198,6 +198,15 @@ TensorMap(data::AbstractArray, codom::TensorSpace, dom::TensorSpace; kwargs...) Tensor(data::AbstractArray, codom::TensorSpace; kwargs...) = TensorMap(data, codom ← one(codom); kwargs...) +function project_symmetric_and_check(::Type{T}, ::Type{A}, data::AbstractArray, V::TensorMapSpace; tol = sqrt(eps(real(float(eltype(data)))))) where {T, A <: DenseVector{T}} + t = TensorMapWithStorage{T, A}(undef, V) + t = project_symmetric!(t, data) + # verify result + isapprox(reshape(data, dims(t)), convert(Array, t); atol = tol) || + throw(ArgumentError("Data has non-zero elements at incompatible positions")) + return t +end + function TensorMapWithStorage{T, A}( data::AbstractArray, V::TensorMapSpace; tol = sqrt(eps(real(float(eltype(data))))) ) where {T, A} @@ -209,15 +218,7 @@ function TensorMapWithStorage{T, A}( sectortype(V) === Trivial && return tensormaptype(spacetype(V), numout(V), numin(V), A)(reshape(data, length(data)), V) - # do projection - t = TensorMapWithStorage{T, A}(undef, V) - t = project_symmetric!(t, data) - - # verify result - isapprox(reshape(data, dims(t)), convert(Array, t); atol = tol) || - throw(ArgumentError("Data has non-zero elements at incompatible positions")) - - return t + return project_symmetric_and_check(T, A, data, V; tol) end TensorMapWithStorage{T, A}(data::AbstractArray, codom::TensorSpace, dom::TensorSpace; kwargs...) where {T, A} = TensorMapWithStorage{T, A}(data, codom ← dom; kwargs...) 
@@ -315,11 +316,15 @@ end
 for randf in (:rand, :randn, :randexp, :randisometry)
     _docstr = """
-        $randf([rng=default_rng()], [T=Float64], codomain::ProductSpace{S,N₁},
+        $randf([rng=default_rng()], [TorA=Float64], codomain::ProductSpace{S,N₁},
                domain::ProductSpace{S,N₂}) where {S,N₁,N₂,T} -> t
-        $randf([rng=default_rng()], [T=Float64], codomain ← domain) -> t
-
+        $randf([rng=default_rng()], [TorA=Float64], codomain ← domain) -> t
+
     Generate a tensor `t` with entries generated by `$randf`.
+    The type `TorA` can be used to control the element type and
+    data type generated. For example, if `TorA` is a `CuVector{ComplexF32}`
+    or `ROCVector{Float64}`, then the final output `TensorMap` will have
+    that as its storage type.
 
     See also [`Random.$(randf)!`](@ref).
     """
@@ -348,25 +353,25 @@ for randf in (:rand, :randn, :randexp, :randisometry)
         return $randfun(codomain ← domain)
     end
     function $randfun(
-            ::Type{T}, codomain::TensorSpace{S}, domain::TensorSpace{S}
-        ) where {T, S <: IndexSpace}
-        return $randfun(T, codomain ← domain)
+            ::Type{TorA}, codomain::TensorSpace{S}, domain::TensorSpace{S}
+        ) where {TorA, S <: IndexSpace}
+        return $randfun(TorA, codomain ← domain)
     end
     function $randfun(
-            rng::Random.AbstractRNG, ::Type{T}, codomain::TensorSpace{S}, domain::TensorSpace{S}
-        ) where {T, S <: IndexSpace}
-        return $randfun(rng, T, codomain ← domain)
+            rng::Random.AbstractRNG, ::Type{TorA}, codomain::TensorSpace{S}, domain::TensorSpace{S}
+        ) where {TorA, S <: IndexSpace}
+        return $randfun(rng, TorA, codomain ← domain)
    end
 
     # accepting single `TensorSpace`
     $randfun(codomain::TensorSpace) = $randfun(codomain ← one(codomain))
-    function $randfun(::Type{T}, codomain::TensorSpace) where {T}
-        return $randfun(T, codomain ← one(codomain))
+    function $randfun(::Type{TorA}, codomain::TensorSpace) where {TorA}
+        return $randfun(TorA, codomain ← one(codomain))
    end
     function $randfun(
-            rng::Random.AbstractRNG, ::Type{T}, codomain::TensorSpace
-        ) where {T}
-        return $randfun(rng, T, codomain ← one(domain))
+            rng::Random.AbstractRNG, ::Type{TorA}, codomain::TensorSpace
+        ) where {TorA}
+        return $randfun(rng, TorA, codomain ← one(codomain))
    end
 
     # filling in default eltype
@@ -376,16 +381,16 @@ for randf in (:rand, :randn, :randexp, :randisometry)
     end
 
     # filling in default rng
-    function $randfun(::Type{T}, V::TensorMapSpace) where {T}
-        return $randfun(Random.default_rng(), T, V)
+    function $randfun(::Type{TorA}, V::TensorMapSpace) where {TorA}
+        return $randfun(Random.default_rng(), TorA, V)
    end
     $randfun!(t::AbstractTensorMap) = $randfun!(Random.default_rng(), t)
 
     # implementation
     function $randfun(
-            rng::Random.AbstractRNG, ::Type{T}, V::TensorMapSpace
-        ) where {T}
-        t = TensorMap{T}(undef, V)
+            rng::Random.AbstractRNG, ::Type{TorA}, V::TensorMapSpace
+        ) where {TorA}
+        t = tensormaptype(spacetype(V), numout(V), numin(V), TorA)(undef, V)
         $randfun!(rng, t)
         return t
    end
@@ -399,12 +404,21 @@ for randf in (:rand, :randn, :randexp, :randisometry)
     end
 end
 
+# Moving arbitrary TensorMaps to CPU
+#-----------------------------
+to_cpu(t::TensorMapWithStorage{T, Vector{T}}) where {T} = t # no op
+to_cpu(t::TensorMap) = convert(TensorMapWithStorage{scalartype(t), similarstoragetype(scalartype(t))}, t)
+
 # Efficient copy constructors
 #-----------------------------
 Base.copy(t::TensorMap) = typeof(t)(copy(t.data), t.space)
 
 # Conversion between TensorMap and Dict, for read and write purpose
 #------------------------------------------------------------------
+# We want to store the block data using simple data types,
+# rather than reshaped views or some other wrapped array type.
+# Since this method is meant for storing data on disk, we can
+# freely collect data to the CPU
 function Base.convert(::Type{Dict}, t::AbstractTensorMap)
     d = Dict{Symbol, Any}()
     d[:codomain] = repr(codomain(t))
@@ -521,6 +535,11 @@ function Base.convert(::Type{TensorMap}, t::AbstractTensorMap)
     return copy!(TensorMap{scalartype(t)}(undef, space(t)), t)
 end
 
+function Base.convert(::Type{TensorMapWithStorage{T, A}}, t::TensorMap) where {T, A}
+    d_data = convert(A, t.data)
+    return TensorMapWithStorage{T, A}(d_data, space(t))
+end
+
 function Base.convert(
         TT::Type{TensorMap{T, S, N₁, N₂, A}}, t::AbstractTensorMap{<:Any, S, N₁, N₂}
     ) where {T, S, N₁, N₂, A}
@@ -535,7 +554,7 @@ end
 function Base.promote_rule(
         ::Type{<:TT₁}, ::Type{<:TT₂}
     ) where {S, N₁, N₂, TT₁ <: TensorMap{<:Any, S, N₁, N₂}, TT₂ <: TensorMap{<:Any, S, N₁, N₂}}
-    A = VectorInterface.promote_add(storagetype(TT₁), storagetype(TT₂))
-    T = scalartype(A)
-    return TensorMap{T, S, N₁, N₂, A}
+    T = VectorInterface.promote_add(scalartype(TT₁), scalartype(TT₂))
+    A = promote_storagetype(similarstoragetype(TT₁, T), similarstoragetype(TT₂, T))
+    return tensormaptype(S, N₁, N₂, A)
 end
diff --git a/test/cuda/tensors.jl b/test/cuda/tensors.jl
new file mode 100644
index 000000000..8a89a8ad8
--- /dev/null
+++ b/test/cuda/tensors.jl
@@ -0,0 +1,606 @@
+using Adapt, CUDA, cuTENSOR
+using Test, TestExtras
+using TensorKit, Combinatorics
+ad = adapt(Array)
+const CUDAExt = Base.get_extension(TensorKit, :TensorKitCUDAExt)
+@assert !isnothing(CUDAExt)
+const CuTensorMap = getglobal(CUDAExt, :CuTensorMap)
+using CUDA: rand as curand, rand! as curand!, randn as curandn, randn! as curandn!
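+# NB: `ad` adapts device-backed tensors to plain `Array` storage for the
+# host-side comparisons in the conversion tests below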
+ +@isdefined(TestSetup) || include("../setup.jl") +using .TestSetup + +for V in (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VfSU₂) #, VSU₃) + V1, V2, V3, V4, V5 = V + @assert V3 * V4 * V2 ≿ V1' * V5' # necessary for leftorth tests + @assert V3 * V4 ≾ V1' * V2' * V5' # necessary for rightorth tests +end + +spacelist = try + if ENV["CI"] == "true" + println("Detected running on CI") + if Sys.iswindows() + (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂) + elseif Sys.isapple() + (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VfU₁, VfSU₂) #, VSU₃) + else + (Vtr, Vℤ₂, Vfℤ₂, VU₁, VCU₁, VSU₂, VfSU₂) #, VSU₃) + end + else + (Vtr, VU₁, VSU₂, Vfℤ₂) + end +catch + (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VfSU₂) #, VSU₃) +end + +for V in spacelist + I = sectortype(first(V)) + Istr = TensorKit.type_repr(I) + println("---------------------------------------") + println("CUDA Tensors with symmetry: $Istr") + println("---------------------------------------") + @timedtestset "Tensors with symmetry: $Istr" verbose = true begin + V1, V2, V3, V4, V5 = V + @timedtestset "Basic tensor properties" begin + W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5 + # test default pass-throughs + for f in (CUDA.zeros, CUDA.ones, curand, curandn) + t = @constinferred f(W) + @test scalartype(t) == Float64 + @test codomain(t) == W + @test space(t) == (W ← one(W)) + @test domain(t) == one(W) + @test typeof(t) == TensorMap{Float64, spacetype(t), 5, 0, CuVector{Float64, CUDA.DeviceMemory}} + end + for f in (rand, randn) + t = @constinferred f(CuVector{Float64, CUDA.DeviceMemory}, W) + @test scalartype(t) == Float64 + @test codomain(t) == W + @test space(t) == (W ← one(W)) + @test domain(t) == one(W) + @test typeof(t) == TensorMap{Float64, spacetype(t), 5, 0, CuVector{Float64, CUDA.DeviceMemory}} + end + for f! in (curand!, curandn!) 
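+                # in-place variants: fill an existing tensor with random entries,
+                # without changing its space or storage type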
+ t = @constinferred CUDA.zeros(W) + f!(t) + @test scalartype(t) == Float64 + @test codomain(t) == W + @test space(t) == (W ← one(W)) + @test domain(t) == one(W) + @test typeof(t) == TensorMap{Float64, spacetype(t), 5, 0, CuVector{Float64, CUDA.DeviceMemory}} + end + for T in (Int, Float32, Float64, ComplexF32, ComplexF64) + t = @constinferred CUDA.zeros(T, W) + CUDA.@allowscalar begin + @test @constinferred(hash(t)) == hash(deepcopy(t)) + end + @test scalartype(t) == T + @test norm(t) == 0 + @test codomain(t) == W + @test space(t) == (W ← one(W)) + @test domain(t) == one(W) + @test typeof(t) == TensorMap{T, spacetype(t), 5, 0, CuVector{T, CUDA.DeviceMemory}} + # blocks + bs = @constinferred blocks(t) + (c, b1), state = @constinferred Nothing iterate(bs) + @test c == first(blocksectors(W)) + next = @constinferred Nothing iterate(bs, state) + b2 = @constinferred block(t, first(blocksectors(t))) + @test b1 == b2 + @test_broken eltype(bs) === Pair{typeof(c), typeof(b1)} + @test_broken typeof(b1) === TensorKit.blocktype(t) + @test typeof(c) === sectortype(t) + end + end + @timedtestset "Conversion to/from host" begin + W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5 + for T in (Int, Float32, ComplexF64) + h_t = @constinferred rand(T, W) + t1 = convert(CuTensorMap{T}, h_t) + @test collect(t1.data) == h_t.data + @test space(t1) == space(h_t) + @test scalartype(t1) == T + @test codomain(t1) == W + @test space(t1) == (W ← one(W)) + @test domain(t1) == one(W) + t2 = CuTensorMap(h_t) + @test collect(t2.data) == h_t.data + @test space(t2) == space(h_t) + @test scalartype(t2) == T + @test codomain(t2) == W + @test space(t2) == (W ← one(W)) + @test domain(t2) == one(W) + end + end + @timedtestset "Tensor Dict conversion" begin + W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5 + for T in (Int, Float32, ComplexF64) + t = @constinferred CUDA.rand(T, W) + d = convert(Dict, t) + @test TensorKit.to_cpu(t) == convert(TensorMap, d) + end + end + @timedtestset "Basic linear algebra" begin + W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5 + for T in (Float32, ComplexF64) + t = @constinferred CUDA.rand(T, W) + @test scalartype(t) == T + @test space(t) == W + @test space(t') == W' + @test dim(t) == dim(space(t)) + @test codomain(t) == codomain(W) + @test domain(t) == domain(W) + # blocks for adjoint + bs = @constinferred blocks(t') + (c, b1), state = @constinferred Nothing iterate(bs) + @test c == first(blocksectors(W')) + next = @constinferred Nothing iterate(bs, state) + b2 = @constinferred block(t', first(blocksectors(t'))) + @test b1 == b2 + @test_broken eltype(bs) === Pair{typeof(c), typeof(b1)} + @test_broken typeof(b1) === TensorKit.blocktype(t') + @test typeof(c) === sectortype(t) + # linear algebra + @test isa(@constinferred(norm(t)), real(T)) + @test norm(t)^2 ≈ dot(t, t) + α = rand(T) + @test norm(α * t) ≈ abs(α) * norm(t) + @test norm(t + t, 2) ≈ 2 * norm(t, 2) + @test norm(t + t, 1) ≈ 2 * norm(t, 1) + @test norm(t + t, Inf) ≈ 2 * norm(t, Inf) + p = 3 * rand(Float64) + @test norm(t + t, p) ≈ 2 * norm(t, p) + @test norm(t) ≈ norm(t') + + t2 = @constinferred rand!(similar(t)) + β = rand(T) + #@test @constinferred(dot(β * t2, α * t)) ≈ conj(β) * α * conj(dot(t, t2)) # broken for Irrep[CU₁] + @test dot(β * t2, α * t) ≈ conj(β) * α * conj(dot(t, t2)) + @test dot(t2, t) ≈ conj(dot(t, t2)) + @test dot(t2, t) ≈ conj(dot(t2', t')) + @test dot(t2, t) ≈ dot(t', t2') + + i1 = @constinferred(isomorphism(CuVector{T, CUDA.DeviceMemory}, V1 ⊗ V2, V2 ⊗ V1)) + i2 = @constinferred(isomorphism(CuVector{T, CUDA.DeviceMemory}, V2 ⊗ V1, V1 ⊗ V2)) + @test i1 * i2 == 
@constinferred(id(CuVector{T, CUDA.DeviceMemory}, V1 ⊗ V2)) + @test i2 * i1 == @constinferred(id(CuVector{T, CUDA.DeviceMemory}, V2 ⊗ V1)) + w = @constinferred(isometry(CuVector{T, CUDA.DeviceMemory}, V1 ⊗ (oneunit(V1) ⊕ oneunit(V1)), V1)) + @test dim(w) == 2 * dim(V1 ← V1) + @test w' * w == id(CuVector{T, CUDA.DeviceMemory}, V1) + @test w * w' == (w * w')^2 + end + end + @timedtestset "Trivial space insertion and removal" begin + W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5 + for T in (Float32, ComplexF64) + t = @constinferred CUDA.rand(T, W) + t2 = @constinferred insertleftunit(t) + @test t2 == @constinferred insertrightunit(t) + @test numind(t2) == numind(t) + 1 + @test space(t2) == insertleftunit(space(t)) + @test scalartype(t2) === T + @test t.data === t2.data + @test @constinferred(removeunit(t2, $(numind(t2)))) == t + t3 = @constinferred insertleftunit(t; copy = true) + @test t3 == @constinferred insertrightunit(t; copy = true) + @test t.data !== t3.data + for (c, b) in blocks(t) + @test b == block(t3, c) + end + @test @constinferred(removeunit(t3, $(numind(t3)))) == t + t4 = @constinferred insertrightunit(t, 3; dual = true) + @test numin(t4) == numin(t) && numout(t4) == numout(t) + 1 + for (c, b) in blocks(t) + @test b == block(t4, c) + end + @test @constinferred(removeunit(t4, 4)) == t + t5 = @constinferred insertleftunit(t, 4; dual = true) + @test numin(t5) == numin(t) + 1 && numout(t5) == numout(t) + for (c, b) in blocks(t) + @test b == block(t5, c) + end + @test @constinferred(removeunit(t5, 4)) == t + end + end + if hasfusiontensor(I) + @timedtestset "Basic linear algebra: test via CPU" begin + W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5 + for T in (Float32, ComplexF64) + t = CUDA.rand(T, W) + t2 = @constinferred CUDA.rand!(similar(t)) + α = rand(T) + @test norm(t, 2) ≈ norm(TensorKit.to_cpu(t), 2) + @test dot(t2, t) ≈ dot(TensorKit.to_cpu(t2), TensorKit.to_cpu(t)) + @test TensorKit.to_cpu(α * t) ≈ α * TensorKit.to_cpu(t) + @test TensorKit.to_cpu(t + t) ≈ 2 * TensorKit.to_cpu(t) + end + end + @timedtestset "Real and imaginary parts" begin + W = V1 ⊗ V2 + for T in (Float64, ComplexF64, ComplexF32) + t = @constinferred CUDA.randn(T, W, W) + + tr = @constinferred real(t) + @test scalartype(tr) <: Real + @test real(TensorKit.to_cpu(t)) == TensorKit.to_cpu(tr) + @test storagetype(tr) == CuVector{real(T), CUDA.DeviceMemory} + + ti = @constinferred imag(t) + @test scalartype(ti) <: Real + @test imag(TensorKit.to_cpu(t)) == TensorKit.to_cpu(ti) + @test storagetype(ti) == CuVector{real(T), CUDA.DeviceMemory} + + tc = @inferred complex(t) + @test scalartype(tc) <: Complex + @test complex(TensorKit.to_cpu(t)) == TensorKit.to_cpu(tc) + @test storagetype(tc) == CuVector{complex(T), CUDA.DeviceMemory} + + tc2 = @inferred complex(tr, ti) + @test tc2 ≈ tc + @test storagetype(tc2) == CuVector{complex(T), CUDA.DeviceMemory} + end + end + end + @timedtestset "Tensor conversion" begin # TODO adjoint conversion methods don't work yet + W = V1 ⊗ V2 + t = @constinferred CUDA.randn(W ← W) + #@test typeof(convert(TensorMap, t')) == typeof(t) # TODO Adjoint not supported yet + tc = complex(t) + @test convert(typeof(tc), t) == tc + @test typeof(convert(typeof(tc), t)) == typeof(tc) + # @test typeof(convert(typeof(tc), t')) == typeof(tc) # TODO Adjoint not supported yet + @test Base.promote_typeof(t, tc) == typeof(tc) + @test Base.promote_typeof(tc, t) == typeof(tc + t) + end + #=@timedtestset "diag/diagm" begin + W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5 + t = CUDA.randn(ComplexF64, W) + d = LinearAlgebra.diag(t) + # TODO find a way to use CUDA here + 
D = LinearAlgebra.diagm(codomain(t), domain(t), d) + @test LinearAlgebra.isdiag(D) + @test LinearAlgebra.diag(D) == d + end=# + @timedtestset "Permutations: test via inner product invariance" begin + W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5 + t = CUDA.rand(ComplexF64, W) + t′ = CUDA.randn!(similar(t)) + for k in 0:5 + for p in permutations(1:5) + p1 = ntuple(n -> p[n], k) + p2 = ntuple(n -> p[k + n], 5 - k) + CUDA.@allowscalar begin + t2 = @constinferred permute(t, (p1, p2)) + t2 = permute(t, (p1, p2)) + @test norm(t2) ≈ norm(t) + t2′ = permute(t′, (p1, p2)) + @test dot(t2′, t2) ≈ dot(t′, t) ≈ dot(transpose(t2′), transpose(t2)) + end + end + + CUDA.@allowscalar begin + t3 = @constinferred repartition(t, $k) + t3 = repartition(t, k) + @test norm(t3) ≈ norm(t) + t3′ = @constinferred repartition!(similar(t3), t′) + @test norm(t3′) ≈ norm(t′) + @test dot(t′, t) ≈ dot(t3′, t3) + end + end + end + if BraidingStyle(I) isa SymmetricBraiding + @timedtestset "Permutations: test via CPU" begin + W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5 + t = CUDA.rand(ComplexF64, W) + for k in 0:5 + for p in permutations(1:5) + p1 = ntuple(n -> p[n], k) + p2 = ntuple(n -> p[k + n], 5 - k) + dt2 = CUDA.@allowscalar permute(t, (p1, p2)) + ht2 = permute(TensorKit.to_cpu(t), (p1, p2)) + @test ht2 == TensorKit.to_cpu(dt2) + end + + dt3 = CUDA.@allowscalar repartition(t, k) + ht3 = repartition(TensorKit.to_cpu(t), k) + @test ht3 == TensorKit.to_cpu(dt3) + end + end + end + @timedtestset "Full trace: test self-consistency" begin + t = CUDA.rand(ComplexF64, V1 ⊗ V2' ⊗ V2 ⊗ V1') + CUDA.@allowscalar begin + t2 = permute(t, ((1, 2), (4, 3))) + s = @constinferred tr(t2) + @test conj(s) ≈ tr(t2') + if !isdual(V1) + t2 = twist!(t2, 1) + end + if isdual(V2) + t2 = twist!(t2, 2) + end + ss = tr(t2) + @tensor s2 = t[a, b, b, a] + @tensor t3[a, b] := t[a, c, c, b] + @tensor s3 = t3[a, a] + end + @test ss ≈ s2 + @test ss ≈ s3 + end + @timedtestset "Partial trace: test self-consistency" begin + t = CUDA.rand(ComplexF64, V1 ⊗ V2' ⊗ V3 ⊗ V2 ⊗ V1' ⊗ V3') + @tensor t2[a, b] := t[c, d, b, d, c, a] + @tensor t4[a, b, c, d] := t[d, e, b, e, c, a] + @tensor t5[a, b] := t4[a, b, c, c] + @test t2 ≈ t5 + end + if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) + @timedtestset "Trace: test via conversion" begin + t = CUDA.rand(ComplexF64, V1 ⊗ V2' ⊗ V3 ⊗ V2 ⊗ V1' ⊗ V3') + CUDA.@allowscalar begin + @tensor t2[a, b] := t[c, d, b, d, c, a] + @tensor t3[a, b] := ad(t)[c, d, b, d, c, a] + end + @test t3 ≈ ad(t2) + end + end + @timedtestset "Trace and contraction" begin + t1 = CUDA.rand(ComplexF64, V1 ⊗ V2 ⊗ V3) + t2 = CUDA.rand(ComplexF64, V2' ⊗ V4 ⊗ V1') + CUDA.@allowscalar begin + t3 = t1 ⊗ t2 + @tensor ta[a, b] := t1[x, y, a] * t2[y, b, x] + @tensor tb[a, b] := t3[x, y, a, y, b, x] + end + @test ta ≈ tb + end + #=if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) + @timedtestset "Tensor contraction: test via CPU" begin + dA1 = CUDA.randn(ComplexF64, V1' * V2', V3') + dA2 = CUDA.randn(ComplexF64, V3 * V4, V5) + drhoL = CUDA.randn(ComplexF64, V1, V1) + drhoR = CUDA.randn(ComplexF64, V5, V5)' # test adjoint tensor + dH = CUDA.randn(ComplexF64, V2 * V4, V2 * V4) + @tensor dHrA12[a, s1, s2, c] := drhoL[a, a'] * conj(dA1[a', t1, b]) * + dA2[b, t2, c'] * drhoR[c', c] * + dH[s1, s2, t1, t2] + @tensor hHrA12[a, s1, s2, c] := TensorKit.to_cpu(drhoL)[a, a'] * conj(TensorKit.to_cpu(dA1)[a', t1, b]) * + TensorKit.to_cpu(dA2)[b, t2, c'] * TensorKit.to_cpu(drhoR)[c', c] * + TensorKit.to_cpu(dH)[s1, s2, t1, t2] + @test TensorKit.to_cpu(dHrA12) ≈ hHrA12 + end + end=# # doesn't yet work 
because of AdjointTensor + @timedtestset "Index flipping: test flipping inverse" begin + t = CUDA.rand(ComplexF64, V1 ⊗ V1' ← V1' ⊗ V1) + for i in 1:4 + CUDA.@allowscalar begin + @test t ≈ flip(flip(t, i), i; inv = true) + @test t ≈ flip(flip(t, i; inv = true), i) + end + end + end + #=@timedtestset "Index flipping: test via explicit flip" begin + t = CUDA.rand(ComplexF64, V1 ⊗ V1' ← V1' ⊗ V1) + F1 = unitary(flip(V1), V1) + + CUDA.@allowscalar begin + @tensor tf[a, b; c, d] := F1[a, a'] * t[a', b; c, d] + @test flip(t, 1) ≈ tf + @tensor tf[a, b; c, d] := conj(F1[b, b']) * t[a, b'; c, d] + @test twist!(flip(t, 2), 2) ≈ tf + @tensor tf[a, b; c, d] := F1[c, c'] * t[a, b; c', d] + @test flip(t, 3) ≈ tf + @tensor tf[a, b; c, d] := conj(F1[d, d']) * t[a, b; c, d'] + @test twist!(flip(t, 4), 4) ≈ tf + end + end + @timedtestset "Index flipping: test via contraction" begin + t1 = CUDA.rand(ComplexF64, V1 ⊗ V2 ⊗ V3 ← V4) + t2 = CUDA.rand(ComplexF64, V2' ⊗ V5 ← V4' ⊗ V1) + CUDA.@allowscalar begin + @tensor ta[a, b] := t1[x, y, a, z] * t2[y, b, z, x] + @tensor tb[a, b] := flip(t1, 1)[x, y, a, z] * flip(t2, 4)[y, b, z, x] + @test ta ≈ tb + @tensor tb[a, b] := flip(t1, (2, 4))[x, y, a, z] * flip(t2, (1, 3))[y, b, z, x] + @test ta ≈ tb + @tensor tb[a, b] := flip(t1, (1, 2, 4))[x, y, a, z] * flip(t2, (1, 3, 4))[y, b, z, x] + @tensor tb[a, b] := flip(t1, (1, 3))[x, y, a, z] * flip(t2, (2, 4))[y, b, z, x] + @test flip(ta, (1, 2)) ≈ tb + end + end=# # TODO + @timedtestset "Multiplication of isometries: test properties" begin + W2 = V4 ⊗ V5 + W1 = W2 ⊗ (oneunit(V1) ⊕ oneunit(V1)) + for T in (Float64, ComplexF64) + t1 = randisometry(CuMatrix{T}, W1, W2) + t2 = randisometry(CuMatrix{T}, W2 ← W2) + @test isisometric(t1) + @test isunitary(t2) + P = t1 * t1' + @test P * P ≈ P + end + end + @timedtestset "Multiplication and inverse: test compatibility" begin + W1 = V1 ⊗ V2 ⊗ V3 + W2 = V4 ⊗ V5 + for T in (Float64, ComplexF64) + t1 = CUDA.rand(T, W1, W1) + t2 = CUDA.rand(T, W2, W2) + t = CUDA.rand(T, W1, W2) + @test t1 * (t1 \ t) ≈ t + @test (t / t2) * t2 ≈ t + @test t1 \ one(t1) ≈ inv(t1) + @test one(t1) / t1 ≈ pinv(t1) + @test_throws SpaceMismatch inv(t) + @test_throws SpaceMismatch t2 \ t + @test_throws SpaceMismatch t / t1 + tp = pinv(t) * t + @test tp ≈ tp * tp + end + end + @timedtestset "Multiplication and inverse: test via CPU" begin + W1 = V1 ⊗ V2 ⊗ V3 + W2 = V4 ⊗ V5 + for T in (Float32, Float64, ComplexF32, ComplexF64) + t1 = CUDA.rand(T, W1, W1) + t2 = CUDA.rand(T, W2, W2) + t = CUDA.rand(T, W1, W2) + ht1 = TensorKit.to_cpu(t1) + ht2 = TensorKit.to_cpu(t2) + ht = TensorKit.to_cpu(t) + @test TensorKit.to_cpu(t1 * t) ≈ ht1 * ht + @test TensorKit.to_cpu(t1' * t) ≈ ht1' * ht + @test TensorKit.to_cpu(t2 * t') ≈ ht2 * ht' + @test TensorKit.to_cpu(t2' * t') ≈ ht2' * ht' + + @test TensorKit.to_cpu(inv(t1)) ≈ inv(ht1) + @test TensorKit.to_cpu(pinv(t)) ≈ pinv(ht) + + if T == Float32 || T == ComplexF32 + continue + end + + @test TensorKit.to_cpu(t1 \ t) ≈ ht1 \ ht + @test TensorKit.to_cpu(t1' \ t) ≈ ht1' \ ht + @test TensorKit.to_cpu(t2 \ t') ≈ ht2 \ ht' + @test TensorKit.to_cpu(t2' \ t') ≈ ht2' \ ht' + + @test TensorKit.to_cpu(t2 / t) ≈ ht2 / ht + @test TensorKit.to_cpu(t2' / t) ≈ ht2' / ht + @test TensorKit.to_cpu(t1 / t') ≈ ht1 / ht' + @test TensorKit.to_cpu(t1' / t') ≈ ht1' / ht' + end + end + if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) + @timedtestset "Tensor functions" begin + W = V1 ⊗ V2 + for T in (Float64, ComplexF64) + t = project_hermitian!(CUDA.randn(T, W, W)) + s = dim(W) + #@test 
(@constinferred sqrt(t))^2 ≈ t + #@test TensorKit.to_cpu(sqrt(t)) ≈ sqrt(TensorKit.to_cpu(t)) + + expt = @constinferred exp(t) + @test TensorKit.to_cpu(expt) ≈ exp(TensorKit.to_cpu(t)) + + # log doesn't work on CUDA yet (scalar indexing) + #@test exp(@constinferred log(project_hermitian!(expt))) ≈ expt + #@test TensorKit.to_cpu(log(project_hermitian!(expt))) ≈ log(TensorKit.to_cpu(expt)) + + #=@test (@constinferred cos(t))^2 + (@constinferred sin(t))^2 ≈ + id(storagetype(t), W) + @test (@constinferred tan(t)) ≈ sin(t) / cos(t) + @test (@constinferred cot(t)) ≈ cos(t) / sin(t) + @test (@constinferred cosh(t))^2 - (@constinferred sinh(t))^2 ≈ + id(storagetype(t), W) + @test (@constinferred tanh(t)) ≈ sinh(t) / cosh(t) + @test (@constinferred coth(t)) ≈ cosh(t) / sinh(t)=# # TODO in CUDA + + #=t1 = sin(t) + @test sin(@constinferred asin(t1)) ≈ t1 + t2 = cos(t) + @test cos(@constinferred acos(t2)) ≈ t2 + t3 = sinh(t) + @test sinh(@constinferred asinh(t3)) ≈ t3 + t4 = cosh(t) + @test cosh(@constinferred acosh(t4)) ≈ t4 + t5 = tan(t) + @test tan(@constinferred atan(t5)) ≈ t5 + t6 = cot(t) + @test cot(@constinferred acot(t6)) ≈ t6 + t7 = tanh(t) + @test tanh(@constinferred atanh(t7)) ≈ t7 + t8 = coth(t) + @test coth(@constinferred acoth(t8)) ≈ t8=# + # TODO in CUDA + end + end + end + # Sylvester not defined for CUDA + # @timedtestset "Sylvester equation" begin + # for T in (Float32, ComplexF64) + # tA = CUDA.rand(T, V1 ⊗ V3, V1 ⊗ V3) + # tB = CUDA.rand(T, V2 ⊗ V4, V2 ⊗ V4) + # tA = 3 // 2 * leftorth(tA; alg=Polar())[1] + # tB = 1 // 5 * leftorth(tB; alg=Polar())[1] + # tC = CUDA.rand(T, V1 ⊗ V3, V2 ⊗ V4) + # t = @constinferred sylvester(tA, tB, tC) + # @test codomain(t) == V1 ⊗ V3 + # @test domain(t) == V2 ⊗ V4 + # @test norm(tA * t + t * tB + tC) < + # (norm(tA) + norm(tB) + norm(tC)) * eps(real(T))^(2 / 3) + # if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) + # matrix(x) = reshape(convert(Array, x), dim(codomain(x)), dim(domain(x))) + # @test matrix(t) ≈ sylvester(matrix(tA), matrix(tB), matrix(tC)) + # end + # end + # end + # + # TODO + @timedtestset "Tensor product: test via norm preservation" begin + for T in (Float32, ComplexF64) + t1 = CUDA.rand(T, V2 ⊗ V3 ⊗ V1, V1 ⊗ V2) + t2 = CUDA.rand(T, V2 ⊗ V1 ⊗ V3, V1 ⊗ V1) + CUDA.@allowscalar begin + t = @constinferred (t1 ⊗ t2) + end + @test norm(t) ≈ norm(t1) * norm(t2) + end + end + if BraidingStyle(I) isa Bosonic && hasfusiontensor(I) + @timedtestset "Tensor product: test via conversion" begin + for T in (Float32, ComplexF64) + t1 = CUDA.rand(T, V2 ⊗ V3 ⊗ V1, V1) + t2 = CUDA.rand(T, V2 ⊗ V1 ⊗ V3, V2) + d1 = dim(codomain(t1)) + d2 = dim(codomain(t2)) + d3 = dim(domain(t1)) + d4 = dim(domain(t2)) + CUDA.@allowscalar begin + t = @constinferred (t1 ⊗ t2) + At = ad(t) + @test ad(t) ≈ ad(t1) ⊗ ad(t2) + end + end + end + end + @timedtestset "Tensor product: test via tensor contraction" begin + for T in (Float32, ComplexF64) + t1 = CUDA.rand(T, V2 ⊗ V3 ⊗ V1) + t2 = CUDA.rand(T, V2 ⊗ V1 ⊗ V3) + CUDA.@allowscalar begin + t = @constinferred (t1 ⊗ t2) + @tensor t′[1, 2, 3, 4, 5, 6] := t1[1, 2, 3] * t2[4, 5, 6] + # @test t ≈ t′ # TODO broken for symmetry: Irrep[ℤ₃] + end + end + end + end + TensorKit.empty_globalcaches!() +end + +@timedtestset "Deligne tensor product: test via conversion" begin + Vlists1 = (Vtr,) # VSU₂) + Vlists2 = (Vtr,) # Vℤ₂) + @testset for Vlist1 in Vlists1, Vlist2 in Vlists2 + V1, V2, V3, V4, V5 = Vlist1 + W1, W2, W3, W4, W5 = Vlist2 + for T in (Float32, ComplexF64) + t1 = CUDA.rand(T, V1 ⊗ V2, V3' ⊗ V4) + t2 = CUDA.rand(T, W2, 
W1 ⊗ W1') + CUDA.@allowscalar begin + t = @constinferred (t1 ⊠ t2) + end + d1 = dim(codomain(t1)) + d2 = dim(codomain(t2)) + d3 = dim(domain(t1)) + d4 = dim(domain(t2)) + CUDA.@allowscalar begin + @test ad(t1) ⊠ ad(t2) ≈ ad(t1 ⊠ t2) + end + end + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 854991a7f..9513378db 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,8 +2,6 @@ # ------------ using ArgParse: ArgParse using SafeTestsets: @safetestset -# using CUDA: CUDA -# using AMDGPU: AMDGPU function parse_commandline(args = ARGS) s = ArgParse.ArgParseSettings() @@ -49,9 +47,9 @@ istestfile(fn) = endswith(fn, ".jl") && !contains(fn, "setup") # handle GPU cases separately if group == "cuda" - # CUDA.functional() || continue - elseif group == "amd" - # AMDGPU.functional() || continue + using CUDA + CUDA.functional() || continue + @time include("cuda/tensors.jl") elseif is_buildkite continue end
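Taken together, the extension routes GPU-backed tensors through the ordinary TensorKit API. A minimal usage sketch (hypothetical session, not part of the diff; it assumes a functional CUDA device, and `to_cpu` is the unexported helper added in src/tensors/tensor.jl above):

    using TensorKit, CUDA, cuTENSOR

    V = ℂ^2 ⊗ ℂ^3
    t = CUDA.rand(Float64, V ← V)                            # GPU-backed TensorMap via the extension
    t2 = rand(CuVector{Float64, CUDA.DeviceMemory}, V ← V)   # storage type passed as `TorA`
    norm(t + t2)                                             # block-wise linear algebra stays on the device
    h = TensorKit.to_cpu(t)                                  # copy back to Array-backed storage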