diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 6c414a31b..250cc4ce3 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -15,7 +15,7 @@ steps:
       queue: "juliagpu"
       cuda: "*"
     if: build.message !~ /\[skip tests\]/
-    timeout_in_minutes: 30
+    timeout_in_minutes: 60
     matrix:
       setup:
         julia:
@@ -36,7 +36,7 @@ steps:
       rocm: "*"
       rocmgpu: "*"
     if: build.message !~ /\[skip tests\]/
-    timeout_in_minutes: 30
+    timeout_in_minutes: 60
     matrix:
       setup:
         julia:
diff --git a/Project.toml b/Project.toml
index 6c22acc60..a45e112fc 100644
--- a/Project.toml
+++ b/Project.toml
@@ -18,20 +18,33 @@ TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6"
 VectorInterface = "409d34a3-91d5-4945-b6ec-7529ddf182d8"
 
 [weakdeps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
+cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1"
+
+[sources]
+GPUArrays = {rev = "master", url = "https://github.com/JuliaGPU/GPUArrays.jl"}
+MatrixAlgebraKit = {rev = "main", url = "https://github.com/QuantumKitHub/MatrixAlgebraKit.jl"}
 
 [extensions]
+TensorKitAMDGPUExt = "AMDGPU"
+TensorKitCUDAExt = ["CUDA", "cuTENSOR"]
 TensorKitChainRulesCoreExt = "ChainRulesCore"
 TensorKitFiniteDifferencesExt = "FiniteDifferences"
 
 [compat]
+AMDGPU = "2"
+Adapt = "4"
 Aqua = "0.6, 0.7, 0.8"
 ArgParse = "1.2.0"
+CUDA = "5.9"
 ChainRulesCore = "1"
 ChainRulesTestUtils = "1"
 Combinatorics = "1"
 FiniteDifferences = "0.12"
+GPUArrays = "11.3.1"
 LRUCache = "1.0.2"
 LinearAlgebra = "1"
 MatrixAlgebraKit = "0.6.0"
@@ -48,21 +61,27 @@ TestExtras = "0.2,0.3"
 TupleTools = "1.1"
 VectorInterface = "0.4.8, 0.5"
 Zygote = "0.7"
+cuTENSOR = "2"
 julia = "1.10"
 
 [extras]
-ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
+Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
+ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a"
 Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
 FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
+GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
 TensorOperations = "6aa20fa7-93e2-5fca-9bc0-fbd0db3c71a2"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 TestExtras = "5ed8adda-3752-4e41-b88a-e8b09835ee3a"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
+cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1"
 
 [targets]
-test = ["ArgParse", "Aqua", "Combinatorics", "LinearAlgebra", "TensorOperations", "Test", "TestExtras", "SafeTestsets", "ChainRulesCore", "ChainRulesTestUtils", "FiniteDifferences", "Zygote"]
+test = ["ArgParse", "Adapt", "AMDGPU", "Aqua", "Combinatorics", "CUDA", "cuTENSOR", "GPUArrays", "LinearAlgebra", "SafeTestsets", "TensorOperations", "Test", "TestExtras", "ChainRulesCore", "ChainRulesTestUtils", "FiniteDifferences", "Zygote"]
diff --git a/ext/TensorKitAMDGPUExt/TensorKitAMDGPUExt.jl b/ext/TensorKitAMDGPUExt/TensorKitAMDGPUExt.jl
new file mode 100644
index 000000000..a9e5bd021
--- /dev/null
+++ b/ext/TensorKitAMDGPUExt/TensorKitAMDGPUExt.jl
@@ -0,0 +1,108 @@
+module TensorKitAMDGPUExt
+
+using AMDGPU, AMDGPU.rocBLAS, LinearAlgebra
+using AMDGPU: @allowscalar
+import AMDGPU: rand as rocrand, rand! as rocrand!, randn as rocrandn, randn! as rocrandn!
+
+using TensorKit
+using TensorKit.Factorizations
+using TensorKit.Strided
+using TensorKit.Factorizations: AbstractAlgorithm
+using TensorKit: SectorDict, tensormaptype, scalar, similarstoragetype, AdjointTensorMap, scalartype
+
+using TensorKit.MatrixAlgebraKit
+
+using Random
+
+include("roctensormap.jl")
+
+const ROCDiagonalTensorMap{T, S} = DiagonalTensorMap{T, S, ROCVector{T, AMDGPU.Mem.HIPBuffer}}
+
+"""
+    ROCDiagonalTensorMap{T}(undef, domain::S) where {T,S<:IndexSpace}
+    # expert mode: select storage type `A`
+    DiagonalTensorMap{T,S,A}(undef, domain::S) where {T,S<:IndexSpace,A<:DenseVector{T}}
+
+Construct a `DiagonalTensorMap` with uninitialized data.
+"""
+function ROCDiagonalTensorMap{T}(::UndefInitializer, V::TensorMapSpace) where {T}
+    (numin(V) == numout(V) == 1 && domain(V) == codomain(V)) ||
+        throw(ArgumentError("DiagonalTensorMap requires a space with equal domain and codomain and 2 indices"))
+    return ROCDiagonalTensorMap{T}(undef, domain(V))
+end
+function ROCDiagonalTensorMap{T}(::UndefInitializer, V::ProductSpace) where {T}
+    length(V) == 1 ||
+        throw(ArgumentError("DiagonalTensorMap requires `numin(d) == numout(d) == 1`"))
+    return ROCDiagonalTensorMap{T}(undef, only(V))
+end
+function ROCDiagonalTensorMap{T}(::UndefInitializer, V::S) where {T, S <: IndexSpace}
+    return ROCDiagonalTensorMap{T, S}(undef, V)
+end
+ROCDiagonalTensorMap(::UndefInitializer, V::IndexSpace) = ROCDiagonalTensorMap{Float64}(undef, V)
+
+function ROCDiagonalTensorMap(data::ROCVector{T}, V::S) where {T, S}
+    return ROCDiagonalTensorMap{T, S}(data, V)
+end
+
+function ROCDiagonalTensorMap(data::Vector{T}, V::S) where {T, S}
+    return ROCDiagonalTensorMap{T, S}(ROCVector{T}(data), V)
+end
+
+function TensorKit.Factorizations.MAK.initialize_output(::typeof(svd_full!), t::ROCDiagonalTensorMap, alg::DiagonalAlgorithm)
+    V_cod = fuse(codomain(t))
+    V_dom = fuse(domain(t))
+    U = similar(t, codomain(t) ← V_cod)
+    S = ROCDiagonalTensorMap{real(scalartype(t))}(undef, V_cod ← V_dom)
+    Vᴴ = similar(t, V_dom ← domain(t))
+    return U, S, Vᴴ
+end
+
+function TensorKit.Factorizations.MAK.initialize_output(::typeof(svd_vals!), t::ROCTensorMap, alg::AbstractAlgorithm)
+    V_cod = infimum(fuse(codomain(t)), fuse(domain(t)))
+    return ROCDiagonalTensorMap{real(scalartype(t))}(undef, V_cod)
+end
+
+function TensorKit.Factorizations.MAK.initialize_output(::typeof(svd_compact!), t::ROCTensorMap, ::AbstractAlgorithm)
+    V_cod = V_dom = infimum(fuse(codomain(t)), fuse(domain(t)))
+    U = similar(t, codomain(t) ← V_cod)
+    S = ROCDiagonalTensorMap{real(scalartype(t))}(undef, V_cod)
+    Vᴴ = similar(t, V_dom ← domain(t))
+    return U, S, Vᴴ
+end
+
+function TensorKit.Factorizations.MAK.initialize_output(::typeof(eigh_full!), t::ROCTensorMap, ::AbstractAlgorithm)
+    V_D = fuse(domain(t))
+    T = real(scalartype(t))
+    D = ROCDiagonalTensorMap{T}(undef, V_D)
+    V = similar(t, codomain(t) ← V_D)
+    return D, V
+end
+
+function TensorKit.Factorizations.MAK.initialize_output(::typeof(eig_full!), t::ROCTensorMap, ::AbstractAlgorithm)
+    V_D = fuse(domain(t))
+    Tc = complex(scalartype(t))
+    D = ROCDiagonalTensorMap{Tc}(undef, V_D)
+    V = similar(t, Tc, codomain(t) ← V_D)
+    return D, V
+end
+
+function TensorKit.Factorizations.MAK.initialize_output(::typeof(eigh_vals!), t::ROCTensorMap, alg::AbstractAlgorithm)
+    V_D = fuse(domain(t))
+    T = real(scalartype(t))
+    return D = ROCDiagonalTensorMap{Tc}(undef, V_D)
+end
+
+function TensorKit.Factorizations.MAK.initialize_output(::typeof(eig_vals!), t::ROCTensorMap, alg::AbstractAlgorithm)
+    V_D = fuse(domain(t))
+    Tc = complex(scalartype(t))
+    return D = ROCDiagonalTensorMap{Tc}(undef, V_D)
+end
+
+
+# TODO
+# add VectorInterface extensions for proper AMDGPU promotion
+function TensorKit.VectorInterface.promote_add(TA::Type{<:AMDGPU.StridedROCMatrix{Tx}}, TB::Type{<:AMDGPU.StridedROCMatrix{Ty}}, α::Tα = TensorKit.VectorInterface.One(), β::Tβ = TensorKit.VectorInterface.One()) where {Tx, Ty, Tα, Tβ}
+    return Base.promote_op(add, Tx, Ty, Tα, Tβ)
+end
+
+end
diff --git a/ext/TensorKitAMDGPUExt/roctensormap.jl b/ext/TensorKitAMDGPUExt/roctensormap.jl
new file mode 100644
index 000000000..f4564c1e2
--- /dev/null
+++ b/ext/TensorKitAMDGPUExt/roctensormap.jl
@@ -0,0 +1,257 @@
+const ROCTensorMap{T, S, N₁, N₂} = TensorMap{T, S, N₁, N₂, ROCVector{T, AMDGPU.Mem.HIPBuffer}}
+const ROCTensor{T, S, N} = ROCTensorMap{T, S, N, 0}
+
+const AdjointROCTensorMap{T, S, N₁, N₂} = AdjointTensorMap{T, S, N₁, N₂, ROCTensorMap{T, S, N₁, N₂}}
+
+function TensorKit.tensormaptype(S::Type{<:IndexSpace}, N₁, N₂, TorA::Type{<:StridedROCArray})
+    if TorA <: ROCArray
+        return TensorMap{eltype(TorA), S, N₁, N₂, ROCVector{eltype(TorA), AMDGPU.Mem.HIPBuffer}}
+    else
+        throw(ArgumentError("argument $TorA should specify a scalar type (`<:Number`) or a storage type `<:ROCVector{<:Number}`"))
+    end
+end
+
+function ROCTensorMap{T}(::UndefInitializer, V::TensorMapSpace{S, N₁, N₂}) where {T, S, N₁, N₂}
+    return ROCTensorMap{T, S, N₁, N₂}(undef, V)
+end
+
+function ROCTensorMap{T}(
+        ::UndefInitializer, codomain::TensorSpace{S},
+        domain::TensorSpace{S}
+    ) where {T, S}
+    return ROCTensorMap{T}(undef, codomain ← domain)
+end
+function ROCTensor{T}(::UndefInitializer, V::TensorSpace{S}) where {T, S}
+    return ROCTensorMap{T}(undef, V ← one(V))
+end
+# constructor starting from block data
+"""
+    ROCTensorMap(data::AbstractDict{<:Sector,<:ROCMatrix}, codomain::ProductSpace{S,N₁},
+                domain::ProductSpace{S,N₂}) where {S<:ElementarySpace,N₁,N₂}
+    ROCTensorMap(data, codomain ← domain)
+    ROCTensorMap(data, domain → codomain)
+
+Construct a `ROCTensorMap` by explicitly specifying its block data.
+
+## Arguments
+- `data::AbstractDict{<:Sector,<:ROCMatrix}`: dictionary containing the block data for
+  each coupled sector `c` as a matrix of size `(blockdim(codomain, c), blockdim(domain, c))`.
+- `codomain::ProductSpace{S,N₁}`: the codomain as a `ProductSpace` of `N₁` spaces of type
+  `S<:ElementarySpace`.
+- `domain::ProductSpace{S,N₂}`: the domain as a `ProductSpace` of `N₂` spaces of type
+  `S<:ElementarySpace`.
+
+Alternatively, the domain and codomain can be specified by passing a [`HomSpace`](@ref)
+using the syntax `codomain ← domain` or `domain → codomain`.
+"""
+function ROCTensorMap(
+        data::AbstractDict{<:Sector, <:ROCArray},
+        V::TensorMapSpace{S, N₁, N₂}
+    ) where {S, N₁, N₂}
+    T = eltype(valtype(data))
+    t = ROCTensorMap{T}(undef, V)
+    for (c, b) in blocks(t)
+        haskey(data, c) || throw(SectorMismatch("no data for block sector $c"))
+        datac = data[c]
+        size(datac) == size(b) ||
+            throw(DimensionMismatch("wrong size of block for sector $c"))
+        copy!(b, datac)
+    end
+    for (c, b) in data
+        c ∈ blocksectors(t) || isempty(b) ||
+            throw(SectorMismatch("data for block sector $c not expected"))
+    end
+    return t
+end
+function ROCTensorMap{T}(
+        data::DenseVector{T}, codomain::TensorSpace{S},
+        domain::TensorSpace{S}
+    ) where {T, S}
+    return ROCTensorMap(data, codomain ← domain)
+end
+function ROCTensorMap(
+        data::AbstractDict{<:Sector, <:ROCMatrix}, codom::TensorSpace{S},
+        dom::TensorSpace{S}
+    ) where {S}
+    return ROCTensorMap(data, codom ← dom)
+end
+
+function ROCTensorMap(ts::TensorMap{T, S, N₁, N₂, A}) where {T, S, N₁, N₂, A}
+    return ROCTensorMap{T, S, N₁, N₂}(ROCArray(ts.data), ts.space)
+end
+
+for (fname, felt) in ((:zeros, :zero), (:ones, :one))
+    @eval begin
+        function AMDGPU.$fname(
+                codomain::TensorSpace{S},
+                domain::TensorSpace{S} = one(codomain)
+            ) where {S <: IndexSpace}
+            return AMDGPU.$fname(codomain ← domain)
+        end
+        function AMDGPU.$fname(
+                ::Type{T}, codomain::TensorSpace{S},
+                domain::TensorSpace{S} = one(codomain)
+            ) where {T, S <: IndexSpace}
+            return AMDGPU.$fname(T, codomain ← domain)
+        end
+        AMDGPU.$fname(V::TensorMapSpace) = AMDGPU.$fname(Float64, V)
+        function AMDGPU.$fname(::Type{T}, V::TensorMapSpace) where {T}
+            t = ROCTensorMap{T}(undef, V)
+            fill!(t, $felt(T))
+            return t
+        end
+    end
+end
+
+for randfun in (:rocrand, :rocrandn)
+    randfun! = Symbol(randfun, :!)
+    @eval begin
+        $randfun(codomain::TensorSpace{S}, domain::TensorSpace{S} = one(codomain)) where {S} = $randfun(codomain ← domain)
+        function $randfun(::Type{T}, codomain::TensorSpace{S}, domain::TensorSpace{S} = one(codomain)) where {T, S <: IndexSpace}
+            return $randfun(T, codomain ← domain)
+        end
+        function $randfun(
+                rng::Random.AbstractRNG, ::Type{T},
+                codomain::TensorSpace{S},
+                domain::TensorSpace{S} = one(codomain),
+            ) where {T, S <: IndexSpace}
+            return $randfun(rng, T, codomain ← domain)
+        end
+
+        # filling in default eltype
+        $randfun(V::TensorMapSpace) = $randfun(Float64, V)
+        function $randfun(rng::Random.AbstractRNG, V::TensorMapSpace)
+            return $randfun(rng, Float64, V)
+        end
+
+        # filling in default rng
+        function $randfun(::Type{T}, V::TensorMapSpace) where {T}
+            return $randfun(Random.default_rng(), T, V)
+        end
+
+        # implementation
+        function $randfun(
+                rng::Random.AbstractRNG, ::Type{T},
+                V::TensorMapSpace
+            ) where {T}
+            t = ROCTensorMap{T}(undef, V)
+            $randfun!(rng, t)
+            return t
+        end
+    end
+end
+
+# converters
+# ----------
+function Base.convert(::Type{ROCTensorMap}, d::Dict{Symbol, Any})
+    try
+        codomain = eval(TensorKit, Meta.parse(d[:codomain]))
+        domain = eval(TensorKit, Meta.parse(d[:domain]))
+        data = SectorDict(eval(TensorKit, Meta.parse(c)) => ROCArray(b) for (c, b) in d[:data])
+        return ROCTensorMap(data, codomain, domain)
+    catch e # sector unknown in TensorKit.jl; user-defined, hopefully accessible in Main
+        codomain = Base.eval(Main, Meta.parse(d[:codomain]))
+        domain = Base.eval(Main, Meta.parse(d[:domain]))
+        data = SectorDict(
+            Base.eval(Main, Meta.parse(c)) => ROCArray(b)
+                for (c, b) in d[:data]
+        )
+        return ROCTensorMap(data, codomain, domain)
+    end
+end
+
+function Base.convert(::Type{ROCTensorMap}, t::AbstractTensorMap)
+    return copy!(ROCTensorMap{scalartype(t)}(undef, space(t)), t)
+end
+
+# Scalar implementation
+#-----------------------
+function TensorKit.scalar(t::ROCTensorMap)
+
+    # TODO: should scalar only work if N₁ == N₂ == 0?
+    return @allowscalar dim(codomain(t)) == dim(domain(t)) == 1 ?
+        first(blocks(t))[2][1, 1] : throw(DimensionMismatch())
+end
+
+TensorKit.scalartype(A::StridedROCArray{T}) where {T} = T
+TensorKit.scalartype(::Type{<:ROCTensorMap{T}}) where {T} = T
+TensorKit.scalartype(::Type{<:ROCArray{T}}) where {T} = T
+
+function TensorKit.similarstoragetype(TT::Type{<:ROCTensorMap{TTT, S, N₁, N₂}}, ::Type{T}) where {TTT, T, S, N₁, N₂}
+    return ROCVector{T, AMDGPU.Mem.HIPBuffer}
+end
+
+function Base.convert(
+        TT::Type{ROCTensorMap{T, S, N₁, N₂}},
+        t::AbstractTensorMap{<:Any, S, N₁, N₂}
+    ) where {T, S, N₁, N₂}
+    if typeof(t) === TT
+        return t
+    else
+        tnew = TT(undef, space(t))
+        return copy!(tnew, t)
+    end
+end
+
+function Base.copy!(tdst::ROCTensorMap{T, S, N₁, N₂}, tsrc::ROCTensorMap{T, S, N₁, N₂}) where {T, S, N₁, N₂}
+    space(tdst) == space(tsrc) || throw(SpaceMismatch("$(space(tdst)) ≠ $(space(tsrc))"))
+    for ((c, bdst), (_, bsrc)) in zip(blocks(tdst), blocks(tsrc))
+        copy!(bdst, bsrc)
+    end
+    return tdst
+end
+
+function Base.copy!(tdst::ROCTensorMap, tsrc::TensorKit.AdjointTensorMap)
+    space(tdst) == space(tsrc) || throw(SpaceMismatch("$(space(tdst)) ≠ $(space(tsrc))"))
+    for ((c, bdst), (_, bsrc)) in zip(blocks(tdst), blocks(tsrc))
+        copy!(bdst, bsrc)
+    end
+    return tdst
+end
+
+function Base.promote_rule(
+        ::Type{<:TT₁},
+        ::Type{<:TT₂}
+    ) where {
+        S, N₁, N₂, TTT₁, TTT₂,
+        TT₁ <: ROCTensorMap{TTT₁, S, N₁, N₂},
+        TT₂ <: ROCTensorMap{TTT₂, S, N₁, N₂},
+    }
+    T = TensorKit.VectorInterface.promote_add(TTT₁, TTT₂)
+    return ROCTensorMap{T, S, N₁, N₂}
+end
+
+function LinearAlgebra.isposdef(t::ROCTensorMap)
+    domain(t) == codomain(t) ||
+        throw(SpaceMismatch("`isposdef` requires domain and codomain to be the same"))
+    InnerProductStyle(spacetype(t)) === EuclideanInnerProduct() || return false
+    for (c, b) in blocks(t)
+        # do our own hermitian check
+        isherm = TensorKit.MatrixAlgebraKit.ishermitian(b; atol = eps(real(eltype(b))), rtol = eps(real(eltype(b))))
+        isherm || return false
+        isposdef(Hermitian(b)) || return false
+    end
+    return true
+end
+
+# Conversion to ROCArray:
+#----------------------
+# probably not optimized for speed, only for checking purposes
+function Base.convert(::Type{ROCArray}, t::AbstractTensorMap)
+    I = sectortype(t)
+    if I === Trivial
+        convert(ROCArray, t[])
+    else
+        cod = codomain(t)
+        dom = domain(t)
+        T = sectorscalartype(I) <: Complex ? complex(scalartype(t)) :
+            sectorscalartype(I) <: Integer ? scalartype(t) : float(scalartype(t))
+        A = AMDGPU.zeros(T, dims(cod)..., dims(dom)...)
+        for (f₁, f₂) in fusiontrees(t)
+            F = convert(ROCArray, (f₁, f₂))
+            Aslice = StridedView(A)[axes(cod, f₁.uncoupled)..., axes(dom, f₂.uncoupled)...]
+            add!(Aslice, StridedView(TensorKit._kron(convert(ROCArray, t[f₁, f₂]), F)))
+        end
+        return A
+    end
+end
diff --git a/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
new file mode 100644
index 000000000..443e0fff0
--- /dev/null
+++ b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
@@ -0,0 +1,106 @@
+module TensorKitCUDAExt
+
+using CUDA, CUDA.CUBLAS, LinearAlgebra
+using CUDA: @allowscalar
+using cuTENSOR: cuTENSOR
+import CUDA: rand as curand, rand! as curand!, randn as curandn, randn! as curandn!
+
+using TensorKit
+using TensorKit.Factorizations
+using TensorKit.Strided
+using TensorKit.Factorizations: AbstractAlgorithm
+using TensorKit: SectorDict, tensormaptype, scalar, similarstoragetype, AdjointTensorMap, scalartype
+import TensorKit: randisometry
+
+using TensorKit.MatrixAlgebraKit
+
+using Random
+
+include("cutensormap.jl")
+
+const CuDiagonalTensorMap{T, S} = DiagonalTensorMap{T, S, CuVector{T, CUDA.DeviceMemory}}
+
+"""
+    CuDiagonalTensorMap{T}(undef, domain::S) where {T,S<:IndexSpace}
+    # expert mode: select storage type `A`
+    DiagonalTensorMap{T,S,A}(undef, domain::S) where {T,S<:IndexSpace,A<:DenseVector{T}}
+
+Construct a `DiagonalTensorMap` with uninitialized data.
+"""
+function CuDiagonalTensorMap{T}(::UndefInitializer, V::TensorMapSpace) where {T}
+    (numin(V) == numout(V) == 1 && domain(V) == codomain(V)) ||
+        throw(ArgumentError("DiagonalTensorMap requires a space with equal domain and codomain and 2 indices"))
+    return CuDiagonalTensorMap{T}(undef, domain(V))
+end
+function CuDiagonalTensorMap{T}(::UndefInitializer, V::ProductSpace) where {T}
+    length(V) == 1 ||
+        throw(ArgumentError("DiagonalTensorMap requires `numin(d) == numout(d) == 1`"))
+    return CuDiagonalTensorMap{T}(undef, only(V))
+end
+function CuDiagonalTensorMap{T}(::UndefInitializer, V::S) where {T, S <: IndexSpace}
+    return CuDiagonalTensorMap{T, S}(undef, V)
+end
+CuDiagonalTensorMap(::UndefInitializer, V::IndexSpace) = CuDiagonalTensorMap{Float64}(undef, V)
+
+function CuDiagonalTensorMap(data::CuVector{T}, V::S) where {T, S}
+    return CuDiagonalTensorMap{T, S}(data, V)
+end
+
+function TensorKit.Factorizations.MAK.initialize_output(::typeof(svd_full!), t::CuDiagonalTensorMap, alg::DiagonalAlgorithm)
+    V_cod = fuse(codomain(t))
+    V_dom = fuse(domain(t))
+    U = similar(t, codomain(t) ← V_cod)
+    S = CuDiagonalTensorMap{real(scalartype(t))}(undef, V_cod ← V_dom)
+    Vᴴ = similar(t, V_dom ← domain(t))
+    return U, S, Vᴴ
+end
+
+function TensorKit.Factorizations.MAK.initialize_output(::typeof(svd_vals!), t::CuTensorMap, alg::AbstractAlgorithm)
+    V_cod = infimum(fuse(codomain(t)), fuse(domain(t)))
+    return CuDiagonalTensorMap{real(scalartype(t))}(undef, V_cod)
+end
+
+function TensorKit.Factorizations.MAK.initialize_output(::typeof(svd_compact!), t::CuTensorMap, ::AbstractAlgorithm)
+    V_cod = V_dom = infimum(fuse(codomain(t)), fuse(domain(t)))
+    U = similar(t, codomain(t) ← V_cod)
+    S = CuDiagonalTensorMap{real(scalartype(t))}(undef, V_cod)
+    Vᴴ = similar(t, V_dom ← domain(t))
+    return U, S, Vᴴ
+end
+
+function TensorKit.Factorizations.MAK.initialize_output(::typeof(eigh_full!), t::CuTensorMap, ::AbstractAlgorithm)
+    V_D = fuse(domain(t))
+    T = real(scalartype(t))
+    D = CuDiagonalTensorMap{T}(undef, V_D)
+    V = similar(t, codomain(t) ← V_D)
+    return D, V
+end
+
+function TensorKit.Factorizations.MAK.initialize_output(::typeof(eig_full!), t::CuTensorMap, ::AbstractAlgorithm)
+    V_D = fuse(domain(t))
+    Tc = complex(scalartype(t))
+    D = CuDiagonalTensorMap{Tc}(undef, V_D)
+    V = similar(t, Tc, codomain(t) ← V_D)
+    return D, V
+end
+
+function TensorKit.Factorizations.MAK.initialize_output(::typeof(eigh_vals!), t::CuTensorMap, alg::AbstractAlgorithm)
+    V_D = fuse(domain(t))
+    T = real(scalartype(t))
+    return D = CuDiagonalTensorMap{Tc}(undef, V_D)
+end
+
+function TensorKit.Factorizations.MAK.initialize_output(::typeof(eig_vals!), t::CuTensorMap, alg::AbstractAlgorithm)
+    V_D = fuse(domain(t))
+    Tc = complex(scalartype(t))
+    return D = CuDiagonalTensorMap{Tc}(undef, V_D)
+end
+
+
+# TODO
+# add VectorInterface extensions for proper CUDA promotion
+function TensorKit.VectorInterface.promote_add(TA::Type{<:CUDA.StridedCuMatrix{Tx}}, TB::Type{<:CUDA.StridedCuMatrix{Ty}}, α::Tα = TensorKit.VectorInterface.One(), β::Tβ = TensorKit.VectorInterface.One()) where {Tx, Ty, Tα, Tβ}
+    return Base.promote_op(add, Tx, Ty, Tα, Tβ)
+end
+
+end
diff --git a/ext/TensorKitCUDAExt/cutensormap.jl b/ext/TensorKitCUDAExt/cutensormap.jl
new file mode 100644
index 000000000..127eb6cf4
--- /dev/null
+++ b/ext/TensorKitCUDAExt/cutensormap.jl
@@ -0,0 +1,328 @@
+const CuTensorMap{T, S, N₁, N₂} = TensorMap{T, S, N₁, N₂, CuVector{T, CUDA.DeviceMemory}}
+const CuTensor{T, S, N} = CuTensorMap{T, S, N, 0}
+
+const AdjointCuTensorMap{T, S, N₁, N₂} = AdjointTensorMap{T, S, N₁, N₂, CuTensorMap{T, S, N₁, N₂}}
+
+function TensorKit.tensormaptype(S::Type{<:IndexSpace}, N₁, N₂, TorA::Type{<:StridedCuArray})
+    if TorA <: CuArray
+        return TensorMap{eltype(TorA), S, N₁, N₂, CuVector{eltype(TorA), CUDA.DeviceMemory}}
+    else
+        throw(ArgumentError("argument $TorA should specify a scalar type (`<:Number`) or a storage type `<:CuVector{<:Number}`"))
+    end
+end
+
+function CuTensorMap{T}(::UndefInitializer, V::TensorMapSpace{S, N₁, N₂}) where {T, S, N₁, N₂}
+    return CuTensorMap{T, S, N₁, N₂}(undef, V)
+end
+
+function CuTensorMap{T}(
+        ::UndefInitializer, codomain::TensorSpace{S},
+        domain::TensorSpace{S}
+    ) where {T, S}
+    return CuTensorMap{T}(undef, codomain ← domain)
+end
+function CuTensor{T}(::UndefInitializer, V::TensorSpace{S}) where {T, S}
+    return CuTensorMap{T}(undef, V ← one(V))
+end
+# constructor starting from block data
+"""
+    CuTensorMap(data::AbstractDict{<:Sector,<:CuMatrix}, codomain::ProductSpace{S,N₁},
+                domain::ProductSpace{S,N₂}) where {S<:ElementarySpace,N₁,N₂}
+    CuTensorMap(data, codomain ← domain)
+    CuTensorMap(data, domain → codomain)
+
+Construct a `CuTensorMap` by explicitly specifying its block data.
+
+## Arguments
+- `data::AbstractDict{<:Sector,<:CuMatrix}`: dictionary containing the block data for
+  each coupled sector `c` as a matrix of size `(blockdim(codomain, c), blockdim(domain, c))`.
+- `codomain::ProductSpace{S,N₁}`: the codomain as a `ProductSpace` of `N₁` spaces of type
+  `S<:ElementarySpace`.
+- `domain::ProductSpace{S,N₂}`: the domain as a `ProductSpace` of `N₂` spaces of type
+  `S<:ElementarySpace`.
+
+Alternatively, the domain and codomain can be specified by passing a [`HomSpace`](@ref)
+using the syntax `codomain ← domain` or `domain → codomain`.
+"""
+function CuTensorMap(
+        data::AbstractDict{<:Sector, <:CuArray},
+        V::TensorMapSpace{S, N₁, N₂}
+    ) where {S, N₁, N₂}
+    T = eltype(valtype(data))
+    t = CuTensorMap{T}(undef, V)
+    for (c, b) in blocks(t)
+        haskey(data, c) || throw(SectorMismatch("no data for block sector $c"))
+        datac = data[c]
+        size(datac) == size(b) ||
+            throw(DimensionMismatch("wrong size of block for sector $c"))
+        copy!(b, datac)
+    end
+    for (c, b) in data
+        c ∈ blocksectors(t) || isempty(b) ||
+            throw(SectorMismatch("data for block sector $c not expected"))
+    end
+    return t
+end
+function CuTensorMap{T}(
+        data::DenseVector{T}, codomain::TensorSpace{S},
+        domain::TensorSpace{S}
+    ) where {T, S}
+    return CuTensorMap(data, codomain ← domain)
+end
+function CuTensorMap(
+        data::AbstractDict{<:Sector, <:CuMatrix}, codom::TensorSpace{S},
+        dom::TensorSpace{S}
+    ) where {S}
+    return CuTensorMap(data, codom ← dom)
+end
+function CuTensorMap(data::DenseVector{T}, V::TensorMapSpace{S, N₁, N₂}) where {T, S, N₁, N₂}
+    return CuTensorMap{T, S, N₁, N₂}(data, V)
+end
+function CuTensorMap(data::CuArray{T}, V::TensorMapSpace{S, N₁, N₂}) where {T, S, N₁, N₂}
+    return CuTensorMap{T, S, N₁, N₂}(vec(data), V)
+end
+
+for (fname, felt) in ((:zeros, :zero), (:ones, :one))
+    @eval begin
+        function CUDA.$fname(
+                codomain::TensorSpace{S},
+                domain::TensorSpace{S} = one(codomain)
+            ) where {S <: IndexSpace}
+            return CUDA.$fname(codomain ← domain)
+        end
+        function CUDA.$fname(
+                ::Type{T}, codomain::TensorSpace{S},
+                domain::TensorSpace{S} = one(codomain)
+            ) where {T, S <: IndexSpace}
+            return CUDA.$fname(T, codomain ← domain)
+        end
+        CUDA.$fname(V::TensorMapSpace) = CUDA.$fname(Float64, V)
+        function CUDA.$fname(::Type{T}, V::TensorMapSpace) where {T}
+            t = CuTensorMap{T}(undef, V)
+            fill!(t, $felt(T))
+            return t
+        end
+    end
+end
+
+for randfun in (:curand, :curandn)
+    randfun! = Symbol(randfun, :!)
+    @eval begin
+        # converting `codomain` and `domain` into `HomSpace`
+        function $randfun(
+                codomain::TensorSpace{S},
+                domain::TensorSpace{S} = one(codomain),
+            ) where {S <: IndexSpace}
+            return $randfun(codomain ← domain)
+        end
+        function $randfun(
+                ::Type{T}, codomain::TensorSpace{S},
+                domain::TensorSpace{S} = one(codomain),
+            ) where {T, S <: IndexSpace}
+            return $randfun(T, codomain ← domain)
+        end
+        function $randfun(
+                rng::Random.AbstractRNG, ::Type{T},
+                codomain::TensorSpace{S},
+                domain::TensorSpace{S} = one(codomain),
+            ) where {T, S <: IndexSpace}
+            return $randfun(rng, T, codomain ← domain)
+        end
+
+        # filling in default eltype
+        $randfun(V::TensorMapSpace) = $randfun(Float64, V)
+        function $randfun(rng::Random.AbstractRNG, V::TensorMapSpace)
+            return $randfun(rng, Float64, V)
+        end
+
+        # filling in default rng
+        function $randfun(::Type{T}, V::TensorMapSpace) where {T}
+            return $randfun(Random.default_rng(), T, V)
+        end
+
+        # implementation
+        function $randfun(
+                rng::Random.AbstractRNG, ::Type{T},
+                V::TensorMapSpace
+            ) where {T}
+            t = CuTensorMap{T}(undef, V)
+            $randfun!(rng, t)
+            return t
+        end
+    end
+end
+
+for randfun in (:rand, :randn, :randisometry)
+    randfun! = Symbol(randfun, :!)
+    @eval begin
+        # converting `codomain` and `domain` into `HomSpace`
+        function $randfun(
+                ::Type{A}, codomain::TensorSpace{S},
+                domain::TensorSpace{S}
+            ) where {A <: CuArray, S <: IndexSpace}
+            return $randfun(A, codomain ← domain)
+        end
+        function $randfun(
+                ::Type{T}, ::Type{A}, codomain::TensorSpace{S},
+                domain::TensorSpace{S}
+            ) where {T, S <: IndexSpace, A <: CuArray{T}}
+            return $randfun(T, A, codomain ← domain)
+        end
+        function $randfun(
+                rng::Random.AbstractRNG, ::Type{T}, ::Type{A},
+                codomain::TensorSpace{S},
+                domain::TensorSpace{S}
+            ) where {T, S <: IndexSpace, A <: CuArray{T}}
+            return $randfun(rng, T, A, codomain ← domain)
+        end
+
+        # accepting single `TensorSpace`
+        $randfun(::Type{A}, codomain::TensorSpace) where {A <: CuArray} = $randfun(A, codomain ← one(codomain))
+        function $randfun(::Type{T}, ::Type{A}, codomain::TensorSpace) where {T, A <: CuArray{T}}
+            return $randfun(T, A, codomain ← one(codomain))
+        end
+        function $randfun(
+                rng::Random.AbstractRNG, ::Type{T},
+                ::Type{A}, codomain::TensorSpace
+            ) where {T, A <: CuArray{T}}
+            return $randfun(rng, T, A, codomain ← one(domain))
+        end
+
+        # filling in default eltype
+        $randfun(::Type{A}, V::TensorMapSpace) where {A <: CuArray} = $randfun(eltype(A), A, V)
+        function $randfun(rng::Random.AbstractRNG, ::Type{A}, V::TensorMapSpace) where {A <: CuArray}
+            return $randfun(rng, eltype(A), A, V)
+        end
+
+        # filling in default rng
+        function $randfun(::Type{T}, ::Type{A}, V::TensorMapSpace) where {T, A <: CuArray{T}}
+            return $randfun(Random.default_rng(), T, A, V)
+        end
+
+        # implementation
+        function $randfun(
+                rng::Random.AbstractRNG, ::Type{T},
+                ::Type{A}, V::TensorMapSpace
+            ) where {T, A <: CuArray{T}}
+            t = CuTensorMap{T}(undef, V)
+            $randfun!(rng, t)
+            return t
+        end
+    end
+end
+
+# converters
+# ----------
+function Base.convert(::Type{CuTensorMap}, d::Dict{Symbol, Any})
+    try
+        codomain = eval(TensorKit, Meta.parse(d[:codomain]))
+        domain = eval(TensorKit, Meta.parse(d[:domain]))
+        data = SectorDict(eval(TensorKit, Meta.parse(c)) => CuArray(b) for (c, b) in d[:data])
+        return CuTensorMap(data, codomain, domain)
+    catch e # sector unknown in TensorKit.jl; user-defined, hopefully accessible in Main
+        codomain = Base.eval(Main, Meta.parse(d[:codomain]))
+        domain = Base.eval(Main, Meta.parse(d[:domain]))
+        data = SectorDict(
+            Base.eval(Main, Meta.parse(c)) => CuArray(b)
+                for (c, b) in d[:data]
+        )
+        return CuTensorMap(data, codomain, domain)
+    end
+end
+
+function Base.convert(::Type{CuTensorMap}, t::AbstractTensorMap)
+    return copy!(CuTensorMap{scalartype(t)}(undef, space(t)), t)
+end
+
+# Scalar implementation
+#-----------------------
+function TensorKit.scalar(t::CuTensorMap)
+
+    # TODO: should scalar only work if N₁ == N₂ == 0?
+    return @allowscalar dim(codomain(t)) == dim(domain(t)) == 1 ?
+        first(blocks(t))[2][1, 1] : throw(DimensionMismatch())
+end
+
+TensorKit.scalartype(A::StridedCuArray{T}) where {T} = T
+TensorKit.scalartype(::Type{<:CuTensorMap{T}}) where {T} = T
+TensorKit.scalartype(::Type{<:CuArray{T}}) where {T} = T
+
+function TensorKit.similarstoragetype(TT::Type{<:CuTensorMap{TTT, S, N₁, N₂}}, ::Type{T}) where {TTT, T, S, N₁, N₂}
+    return CuVector{T, CUDA.DeviceMemory}
+end
+
+function Base.convert(
+        TT::Type{CuTensorMap{T, S, N₁, N₂}},
+        t::AbstractTensorMap{<:Any, S, N₁, N₂}
+    ) where {T, S, N₁, N₂}
+    if typeof(t) === TT
+        return t
+    else
+        tnew = TT(undef, space(t))
+        return copy!(tnew, t)
+    end
+end
+
+function Base.copy!(tdst::CuTensorMap{T, S, N₁, N₂}, tsrc::CuTensorMap{T, S, N₁, N₂}) where {T, S, N₁, N₂}
+    space(tdst) == space(tsrc) || throw(SpaceMismatch("$(space(tdst)) ≠ $(space(tsrc))"))
+    for ((c, bdst), (_, bsrc)) in zip(blocks(tdst), blocks(tsrc))
+        copy!(bdst, bsrc)
+    end
+    return tdst
+end
+
+function Base.copy!(tdst::CuTensorMap, tsrc::TensorKit.AdjointTensorMap)
+    space(tdst) == space(tsrc) || throw(SpaceMismatch("$(space(tdst)) ≠ $(space(tsrc))"))
+    for ((c, bdst), (_, bsrc)) in zip(blocks(tdst), blocks(tsrc))
+        copy!(bdst, bsrc)
+    end
+    return tdst
+end
+
+function Base.promote_rule(
+        ::Type{<:TT₁},
+        ::Type{<:TT₂}
+    ) where {
+        S, N₁, N₂, TTT₁, TTT₂,
+        TT₁ <: CuTensorMap{TTT₁, S, N₁, N₂},
+        TT₂ <: CuTensorMap{TTT₂, S, N₁, N₂},
+    }
+    T = TensorKit.VectorInterface.promote_add(TTT₁, TTT₂)
+    return CuTensorMap{T, S, N₁, N₂}
+end
+
+function LinearAlgebra.isposdef(t::CuTensorMap)
+    domain(t) == codomain(t) ||
+        throw(SpaceMismatch("`isposdef` requires domain and codomain to be the same"))
+    InnerProductStyle(spacetype(t)) === EuclideanInnerProduct() || return false
+    for (c, b) in blocks(t)
+        # do our own hermitian check
+        isherm = TensorKit.MatrixAlgebraKit.ishermitian(b; atol = eps(real(eltype(b))), rtol = eps(real(eltype(b))))
+        isherm || return false
+        isposdef(Hermitian(b)) || return false
+    end
+    return true
+end
+
+
+# Conversion to CuArray:
+#----------------------
+# probably not optimized for speed, only for checking purposes
+function Base.convert(::Type{CuArray}, t::AbstractTensorMap)
+    I = sectortype(t)
+    if I === Trivial
+        convert(CuArray, t[])
+    else
+        cod = codomain(t)
+        dom = domain(t)
+        T = sectorscalartype(I) <: Complex ? complex(scalartype(t)) :
+            sectorscalartype(I) <: Integer ? scalartype(t) : float(scalartype(t))
+        A = CUDA.zeros(T, dims(cod)..., dims(dom)...)
+        for (f₁, f₂) in fusiontrees(t)
+            F = convert(CuArray, (f₁, f₂))
+            Aslice = StridedView(A)[axes(cod, f₁.uncoupled)..., axes(dom, f₂.uncoupled)...]
+            add!(Aslice, StridedView(TensorKit._kron(convert(CuArray, t[f₁, f₂]), F)))
+        end
+        return A
+    end
+end
diff --git a/src/auxiliary/random.jl b/src/auxiliary/random.jl
index 022809518..9aed16509 100644
--- a/src/auxiliary/random.jl
+++ b/src/auxiliary/random.jl
@@ -1,17 +1,20 @@
 """
-    randisometry([::Type{T}=Float64], dims::Dims{2}) -> Array{T,2}
-    randhaar([::Type{T}=Float64], dims::Dims{2}) -> Array{T,2}
+    randisometry([::Type{T}=Float64], [::Type{A}=Matrix{T}], dims::Dims{2}) -> A
+    randhaar([::Type{T}=Float64], [::Type{A}=Matrix{T}], dims::Dims{2}) -> A
 
 Create a random isometry of size `dims`, uniformly distributed according to the Haar measure.
 
 See also [`randuniform`](@ref) and [`randnormal`](@ref).
 """
-randisometry(dims::Base.Dims{2}) = randisometry(Float64, dims)
+randisometry(dims::Base.Dims{2}) = randisometry(Float64, Matrix{Float64}, dims)
 function randisometry(::Type{T}, dims::Base.Dims{2}) where {T <: Number}
     return randisometry(Random.default_rng(), T, dims)
 end
-function randisometry(rng::Random.AbstractRNG, ::Type{T}, dims::Base.Dims{2}) where {T <: Number}
-    return randisometry!(rng, Matrix{T}(undef, dims))
+function randisometry(::Type{T}, ::Type{A}, dims::Base.Dims{2}) where {T <: Number, A <: AbstractArray{T}}
+    return randisometry(Random.default_rng(), T, A, dims)
+end
+function randisometry(rng::Random.AbstractRNG, ::Type{T}, ::Type{A}, dims::Base.Dims{2}) where {T <: Number, A <: AbstractArray{T}}
+    return randisometry!(rng, A(undef, dims))
 end
 
 randisometry!(A::AbstractMatrix) = randisometry!(Random.default_rng(), A)
diff --git a/src/factorizations/adjoint.jl b/src/factorizations/adjoint.jl
index 29f27203c..f9c2742e5 100644
--- a/src/factorizations/adjoint.jl
+++ b/src/factorizations/adjoint.jl
@@ -6,6 +6,8 @@ _adjoint(alg::MAK.LAPACK_HouseholderQR) = MAK.LAPACK_HouseholderLQ(; alg.kwargs.
 _adjoint(alg::MAK.LAPACK_HouseholderLQ) = MAK.LAPACK_HouseholderQR(; alg.kwargs...)
 _adjoint(alg::MAK.LAPACK_HouseholderQL) = MAK.LAPACK_HouseholderRQ(; alg.kwargs...)
 _adjoint(alg::MAK.LAPACK_HouseholderRQ) = MAK.LAPACK_HouseholderQL(; alg.kwargs...)
+_adjoint(alg::MAK.LQViaTransposedQR) = alg.qr_alg
+_adjoint(alg::MAK.CUSOLVER_HouseholderQR) = MAK.LQViaTransposedQR(alg)
 _adjoint(alg::MAK.PolarViaSVD) = MAK.PolarViaSVD(_adjoint(alg.svd_alg))
 _adjoint(alg::AbstractAlgorithm) = alg
 
diff --git a/src/factorizations/factorizations.jl b/src/factorizations/factorizations.jl
index dce2067ee..fe69bf8d9 100644
--- a/src/factorizations/factorizations.jl
+++ b/src/factorizations/factorizations.jl
@@ -6,7 +6,7 @@ module Factorizations
 export copy_oftype, factorisation_scalartype, one!, truncspace
 
 using ..TensorKit
-using ..TensorKit: AdjointTensorMap, SectorDict, blocktype, foreachblock, one!
+using ..TensorKit: AdjointTensorMap, SectorDict, blocktype, foreachblock, one!, similarstoragetype
 
 using LinearAlgebra: LinearAlgebra, BlasFloat, Diagonal, svdvals, svdvals!, eigen, eigen!,
     isposdef, isposdef!, ishermitian
@@ -28,7 +28,6 @@ include("diagonal.jl")
 include("pullbacks.jl")
 
 TensorKit.one!(A::AbstractMatrix) = MatrixAlgebraKit.one!(A)
-
 #------------------------------#
 # LinearAlgebra overloads
 #------------------------------#
diff --git a/src/factorizations/matrixalgebrakit.jl b/src/factorizations/matrixalgebrakit.jl
index 412fd8761..dc955bdb7 100644
--- a/src/factorizations/matrixalgebrakit.jl
+++ b/src/factorizations/matrixalgebrakit.jl
@@ -83,14 +83,16 @@ end
 function MAK.initialize_output(::typeof(svd_compact!), t::AbstractTensorMap, ::AbstractAlgorithm)
     V_cod = V_dom = infimum(fuse(codomain(t)), fuse(domain(t)))
     U = similar(t, codomain(t) ← V_cod)
-    S = DiagonalTensorMap{real(scalartype(t))}(undef, V_cod)
+    output_type = similarstoragetype(t, real(scalartype(t)))
+    S = DiagonalTensorMap{real(scalartype(t)), typeof(V_cod), output_type}(undef, V_cod)
     Vᴴ = similar(t, V_dom ← domain(t))
     return U, S, Vᴴ
 end
 
 function MAK.initialize_output(::typeof(svd_vals!), t::AbstractTensorMap, alg::AbstractAlgorithm)
     V_cod = infimum(fuse(codomain(t)), fuse(domain(t)))
-    return DiagonalTensorMap{real(scalartype(t))}(undef, V_cod)
+    output_storage_type = similarstoragetype(t, real(scalartype(t)))
+    return DiagonalTensorMap{real(scalartype(t)), typeof(V_cod), output_storage_type}(undef, V_cod)
 end
 
 # Eigenvalue decomposition
@@ -98,7 +100,7 @@ end
 function MAK.initialize_output(::typeof(eigh_full!), t::AbstractTensorMap, ::AbstractAlgorithm)
     V_D = fuse(domain(t))
     T = real(scalartype(t))
-    D = DiagonalTensorMap{T}(undef, V_D)
+    D = DiagonalTensorMap{T, typeof(V_D), similarstoragetype(t, T)}(undef, V_D)
     V = similar(t, codomain(t) ← V_D)
     return D, V
 end
@@ -106,7 +108,7 @@ end
 function MAK.initialize_output(::typeof(eig_full!), t::AbstractTensorMap, ::AbstractAlgorithm)
     V_D = fuse(domain(t))
     Tc = complex(scalartype(t))
-    D = DiagonalTensorMap{Tc}(undef, V_D)
+    D = DiagonalTensorMap{Tc, typeof(V_D), similarstoragetype(t, Tc)}(undef, V_D)
     V = similar(t, Tc, codomain(t) ← V_D)
     return D, V
 end
@@ -114,13 +116,13 @@ end
 function MAK.initialize_output(::typeof(eigh_vals!), t::AbstractTensorMap, alg::AbstractAlgorithm)
     V_D = fuse(domain(t))
     T = real(scalartype(t))
-    return D = DiagonalTensorMap{Tc}(undef, V_D)
+    return D = DiagonalTensorMap{Tc, typeof(V_D), storagetype(t)}(undef, V_D)
 end
 
 function MAK.initialize_output(::typeof(eig_vals!), t::AbstractTensorMap, alg::AbstractAlgorithm)
     V_D = fuse(domain(t))
     Tc = complex(scalartype(t))
-    return D = DiagonalTensorMap{Tc}(undef, V_D)
+    return D = DiagonalTensorMap{Tc, typeof(V_D), similarstoragetype(t, Tc)}(undef, V_D)
 end
 
 # QR decomposition
diff --git a/src/factorizations/truncation.jl b/src/factorizations/truncation.jl
index 84f2569e0..514801642 100644
--- a/src/factorizations/truncation.jl
+++ b/src/factorizations/truncation.jl
@@ -67,7 +67,7 @@ function MAK.truncate(
 
     Ũ = similar(U, codomain(U) ← V_truncated)
     truncate_domain!(Ũ, U, ind)
-    S̃ = DiagonalTensorMap{scalartype(S)}(undef, V_truncated)
+    S̃ = DiagonalTensorMap{scalartype(S), typeof(V_truncated), storagetype(S)}(undef, V_truncated)
     truncate_diagonal!(S̃, S, ind)
     Ṽᴴ = similar(Vᴴ, V_truncated ← domain(Vᴴ))
     truncate_codomain!(Ṽᴴ, Vᴴ, ind)
@@ -132,7 +132,7 @@ for f! in (:eig_trunc!, :eigh_trunc!)
         ind = MAK.findtruncated(diagview(D), strategy)
         V_truncated = truncate_space(space(D, 1), ind)
 
-        D̃ = DiagonalTensorMap{scalartype(D)}(undef, V_truncated)
+        D̃ = DiagonalTensorMap{scalartype(D), typeof(V_truncated), storagetype(D)}(undef, V_truncated)
         truncate_diagonal!(D̃, D, ind)
 
         Ṽ = similar(V, codomain(V) ← V_truncated)
@@ -162,17 +162,15 @@ function _findnexttruncvalue(
     ) where {I <: Sector}
     # early return
     (isempty(S) || all(iszero, values(truncdim))) && return nothing
+    mapped_keys = map(keys(truncdim)) do c
+        d = truncdim[c]
+        return by(S[c][d])
+    end
     if rev
-        σmin, imin = findmin(keys(truncdim)) do c
-            d = truncdim[c]
-            return by(S[c][d])
-        end
+        σmin, imin = findmin(mapped_keys)
         return σmin, keys(truncdim)[imin]
     else
-        σmax, imax = findmax(keys(truncdim)) do c
-            d = truncdim[c]
-            return by(S[c][d])
-        end
+        σmax, imax = findmax(mapped_keys)
         return σmax, keys(truncdim)[imax]
     end
 end
diff --git a/src/tensors/diagonal.jl b/src/tensors/diagonal.jl
index 356f5dbf4..f66dfa1c3 100644
--- a/src/tensors/diagonal.jl
+++ b/src/tensors/diagonal.jl
@@ -271,7 +271,7 @@ function LinearAlgebra.mul!(
         dC::DiagonalTensorMap, dA::DiagonalTensorMap, dB::DiagonalTensorMap, α::Number, β::Number
     )
     dC.domain == dA.domain == dB.domain || throw(SpaceMismatch())
-    mul!(Diagonal(dC.data), Diagonal(dA.data), Diagonal(dB.data), α, β)
+    @. dC.data = (α * dA.data * dB.data) + β * dC.data
     return dC
 end
 
diff --git a/src/tensors/linalg.jl b/src/tensors/linalg.jl
index 747a44320..f8d09bbf1 100644
--- a/src/tensors/linalg.jl
+++ b/src/tensors/linalg.jl
@@ -272,7 +272,7 @@ function _norm(blockiter, p::Real, init::Real)
         end
     elseif p == 2
         n² = mapreduce(+, blockiter; init = init) do (c, b)
-            return isempty(b) ? init : oftype(init, dim(c) * LinearAlgebra.norm2(b)^2)
+            return isempty(b) ? init : oftype(init, dim(c) * LinearAlgebra.norm(b, 2)^2)
         end
         return sqrt(n²)
     elseif p == 1
@@ -281,7 +281,7 @@ function _norm(blockiter, p::Real, init::Real)
         end
     elseif p > 0
         nᵖ = mapreduce(+, blockiter; init = init) do (c, b)
-            return isempty(b) ? init : oftype(init, dim(c) * LinearAlgebra.normp(b, p)^p)
+            return isempty(b) ? init : oftype(init, dim(c) * LinearAlgebra.norm(b, p)^p)
         end
         return (nᵖ)^inv(oftype(nᵖ, p))
     else
@@ -299,7 +299,7 @@ function LinearAlgebra.rank(
     r = 0 * dim(first(allunits(sectortype(t))))
     dim(t) == 0 && return r
     S = LinearAlgebra.svdvals(t)
-    tol = max(atol, rtol * maximum(first, values(S)))
+    tol = max(atol, rtol * mapreduce(maximum, max, values(S)))
     for (c, b) in S
         if !isempty(b)
             r += dim(c) * count(>(tol), b)
@@ -317,8 +317,8 @@ function LinearAlgebra.cond(t::AbstractTensorMap, p::Real = 2)
             return zero(real(float(scalartype(t))))
         end
         S = LinearAlgebra.svdvals(t)
-        maxS = maximum(first, values(S))
-        minS = minimum(last, values(S))
+        maxS = mapreduce(maximum, max, values(S))
+        minS = mapreduce(minimum, min, values(S))
         return iszero(maxS) ? oftype(maxS, Inf) : (maxS / minS)
     else
         throw(ArgumentError("cond currently only defined for p=2"))
diff --git a/src/tensors/tensor.jl b/src/tensors/tensor.jl
index fe15d819d..8b82cea2e 100644
--- a/src/tensors/tensor.jl
+++ b/src/tensors/tensor.jl
@@ -357,7 +357,7 @@ function TensorMap(
     ) where {S}
     return TensorMap(data, codom ← dom; kwargs...)
 end
-function Tensor(data::AbstractArray, codom::TensorSpace, ; kwargs...)
+function Tensor(data::AbstractArray, codom::TensorSpace; kwargs...)
     return TensorMap(data, codom ← one(codom); kwargs...)
 end
 
diff --git a/test/amd/factorizations.jl b/test/amd/factorizations.jl
new file mode 100644
index 000000000..8769f4558
--- /dev/null
+++ b/test/amd/factorizations.jl
@@ -0,0 +1,414 @@
+using Test, TestExtras
+using TensorKit
+using LinearAlgebra: LinearAlgebra
+using AMDGPU
+
+const AMDGPUExt = Base.get_extension(TensorKit, :TensorKitAMDGPUExt)
+@assert !isnothing(AMDGPUExt)
+const ROCTensorMap = getglobal(AMDGPUExt, :ROCTensorMap)
+const ROCDiagonalTensorMap = getglobal(AMDGPUExt, :ROCDiagonalTensorMap)
+
+@isdefined(TestSetup) || include("../setup.jl")
+using .TestSetup
+
+spacelist = try
+    if ENV["CI"] == "true"
+        println("Detected running on CI")
+        if Sys.iswindows()
+            (Vtr, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂)
+        elseif Sys.isapple()
+            (Vtr, Vℤ₃, VfU₁, VfSU₂)
+        else
+            (Vtr, VU₁, VCU₁, VSU₂, VfSU₂)
+        end
+    else
+        (Vtr, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VfSU₂)
+    end
+catch
+    (Vtr, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VfSU₂)
+end
+
+import AMDGPU: rand as rocrand, randn as rocrandn
+
+
+eltypes = (Float32, ComplexF64)
+
+for V in spacelist
+    I = sectortype(first(V))
+    Istr = TensorKit.type_repr(I)
+    println("---------------------------------------")
+    println("Factorizations with symmetry: $Istr")
+    println("---------------------------------------")
+    @timedtestset "Factorizations with symmetry: $Istr" verbose = true begin
+        V1, V2, V3, V4, V5 = V
+        W = V1 ⊗ V2
+        @testset "QR decomposition" begin
+            @testset "$(typeof(t))($T)" for T in eltypes,
+                    t in (
+                        rocrand(T, W, W), rocrand(T, W, W)', rocrand(T, W, V1), rocrand(T, V1, W)',
+                        ROCDiagonalTensorMap(rocrand(T, reduceddim(V1)), V1),
+                    )
+
+                Q, R = @constinferred qr_full(t)
+                AMDGPU.@allowscalar begin
+                    @test Q * R ≈ t
+                end
+                @test isunitary(Q)
+
+                Q, R = @constinferred qr_compact(t)
+                AMDGPU.@allowscalar begin
+                    @test Q * R ≈ t
+                end
+                @test isisometric(Q)
+
+                Q, R = @constinferred left_orth(t; kind = :qr)
+                AMDGPU.@allowscalar begin
+                    @test Q * R ≈ t
+                end
+                @test isisometric(Q)
+
+                N = @constinferred qr_null(t)
+                @test isisometric(N)
+                @test norm(N' * t) ≈ 0 atol = 100 * eps(norm(t))
+
+                N = @constinferred left_null(t; kind = :qr)
+                @test isisometric(N)
+                @test norm(N' * t) ≈ 0 atol = 100 * eps(norm(t))
+            end
+
+            # empty tensor
+            for T in eltypes
+                t = rocrand(T, V1 ⊗ V2, zero(V1))
+
+                Q, R = @constinferred qr_full(t)
+                AMDGPU.@allowscalar begin
+                    @test Q * R ≈ t
+                end
+                @test isunitary(Q)
+                @test dim(R) == dim(t) == 0
+
+                Q, R = @constinferred qr_compact(t)
+                AMDGPU.@allowscalar begin
+                    @test Q * R ≈ t
+                end
+                @test isisometric(Q)
+                @test dim(Q) == dim(R) == dim(t)
+
+                Q, R = @constinferred left_orth(t; kind = :qr)
+                AMDGPU.@allowscalar begin
+                    @test Q * R ≈ t
+                end
+                @test isisometric(Q)
+                @test dim(Q) == dim(R) == dim(t)
+
+                N = @constinferred qr_null(t)
+                @test isunitary(N)
+                @test norm(N' * t) ≈ 0 atol = 100 * eps(norm(t))
+            end
+        end
+        @testset "LQ decomposition" begin
+            for T in eltypes,
+                    t in (
+                        rocrand(T, W, W), rocrand(T, W, W)', rocrand(T, W, V1), rocrand(T, V1, W)',
+                        ROCDiagonalTensorMap(rocrand(T, reduceddim(V1)), V1),
+                    )
+
+                L, Q = @constinferred lq_full(t)
+                AMDGPU.@allowscalar begin
+                    @test L * Q ≈ t
+                end
+                @test isunitary(Q)
+
+                L, Q = @constinferred lq_compact(t)
+                AMDGPU.@allowscalar begin
+                    @test L * Q ≈ t
+                end
+                @test isisometric(Q; side = :right)
+
+                L, Q = @constinferred right_orth(t; kind = :lq)
+                AMDGPU.@allowscalar begin
+                    @test L * Q ≈ t
+                end
+                @test isisometric(Q; side = :right)
+
+                Nᴴ = @constinferred lq_null(t)
+                @test isisometric(Nᴴ; side = :right)
+                @test norm(t * Nᴴ') ≈ 0 atol = 100 * eps(norm(t))
+            end
+
+            for T in eltypes
+                # empty tensor
+                t = rocrand(T, zero(V1), V1 ⊗ V2)
+
+                L, Q = @constinferred lq_full(t)
+                AMDGPU.@allowscalar begin
+                    @test L * Q ≈ t
+                end
+                @test isunitary(Q)
+                @test dim(L) == dim(t) == 0
+
+                L, Q = @constinferred lq_compact(t)
+                AMDGPU.@allowscalar begin
+                    @test L * Q ≈ t
+                end
+                @test isisometric(Q; side = :right)
+                @test dim(Q) == dim(L) == dim(t)
+
+                L, Q = @constinferred right_orth(t; kind = :lq)
+                AMDGPU.@allowscalar begin
+                    @test L * Q ≈ t
+                end
+                @test isisometric(Q; side = :right)
+                @test dim(Q) == dim(L) == dim(t)
+
+                Nᴴ = @constinferred lq_null(t)
+                @test isunitary(Nᴴ)
+                @test norm(t * Nᴴ') ≈ 0 atol = 100 * eps(norm(t))
+            end
+        end
+        @testset "Polar decomposition" begin
+            @testset "$(typeof(t))($T)" for T in eltypes,
+                    t in (
+                        rocrand(T, W, W),
+                        rocrand(T, W, W)',
+                        rocrand(T, W, V1),
+                        rocrand(T, V1, W)',
+                        ROCDiagonalTensorMap(rocrand(T, reduceddim(V1)), V1),
+                    )
+
+                @assert domain(t) ≾ codomain(t)
+                w, p = @constinferred left_polar(t)
+                @test w * p ≈ t
+                @test isisometric(w)
+                @test isposdef(p)
+
+                w, p = @constinferred left_orth(t; kind = :polar)
+                @test w * p ≈ t
+                @test isisometric(w)
+            end
+
+            @testset "$(typeof(t))($T)" for T in eltypes,
+                    t in (
+                        rocrand(T, W, W), rocrand(T, W, W)', rocrand(T, V1, W), rocrand(T, W, V1)',
+                        ROCDiagonalTensorMap(rocrand(T, reduceddim(V1)), V1),
+                    )
+
+                @assert codomain(t) ≾ domain(t)
+                p, wᴴ = @constinferred right_polar(t)
+                @test p * wᴴ ≈ t
+                @test isisometric(wᴴ; side = :right)
+                @test isposdef(p)
+
+                p, wᴴ = @constinferred right_orth(t; kind = :polar)
+                @test p * wᴴ ≈ t
+                @test isisometric(wᴴ; side = :right)
+            end
+        end
+        @testset "SVD" begin
+            for T in eltypes,
+                    t in (
+                        rocrand(T, W, W), rocrand(T, W, W)',
+                        rocrand(T, W, V1), rocrand(T, V1, W),
+                        rocrand(T, W, V1)', rocrand(T, V1, W)',
+                        ROCDiagonalTensorMap(rocrand(T, reduceddim(V1)), V1),
+                    )
+
+                u, s, vᴴ = @constinferred svd_full(t)
+                @test u * s * vᴴ ≈ t
+                @test isunitary(u)
+                @test isunitary(vᴴ)
+
+                u, s, vᴴ = @constinferred svd_compact(t)
+                @test u * s * vᴴ ≈ t
+                @test isisometric(u)
+                @test isposdef(s)
+                @test isisometric(vᴴ; side = :right)
+
+                s′ = LinearAlgebra.diag(s)
+                for (c, b) in LinearAlgebra.svdvals(t)
+                    @test b ≈ s′[c]
+                end
+
+                v, c = @constinferred left_orth(t; kind = :svd)
+                @test v * c ≈ t
+                @test isisometric(v)
+
+                N = @constinferred left_null(t; kind = :svd)
+                @test isisometric(N)
+                @test norm(N' * t) ≈ 0 atol = 100 * eps(norm(t))
+
+                Nᴴ = @constinferred right_null(t; kind = :svd)
+                @test isisometric(Nᴴ; side = :right)
+                @test norm(t * Nᴴ') ≈ 0 atol = 100 * eps(norm(t))
+            end
+
+            # empty tensor
+            for T in eltypes, t in (rocrand(T, W, zero(V1)), rocrand(T, zero(V1), W))
+                U, S, Vᴴ = @constinferred svd_full(t)
+                @test U * S * Vᴴ ≈ t
+                @test isunitary(U)
+                @test isunitary(Vᴴ)
+
+                U, S, Vᴴ = @constinferred svd_compact(t)
+                @test U * S * Vᴴ ≈ t
+                @test dim(U) == dim(S) == dim(Vᴴ) == dim(t) == 0
+            end
+        end
+
+        @testset "truncated SVD" begin
+            for T in eltypes,
+                    # AMDGPU doesn't support randn(ComplexF64)
+                    t in (
+                        ROCTensorMap(randn(T, W, W)),
+                        ROCTensorMap(randn(T, W, W))',
+                        ROCTensorMap(randn(T, W, V1)),
+                        ROCTensorMap(randn(T, V1, W)),
+                        ROCTensorMap(randn(T, W, V1))',
+                        ROCTensorMap(randn(T, V1, W))',
+                        ROCDiagonalTensorMap(randn(T, reduceddim(V1)), V1),
+                    )
+
+                @constinferred normalize!(t)
+
+                U, S, Vᴴ = @constinferred svd_trunc(t; trunc = notrunc())
+                @test U * S * Vᴴ ≈ t
+                @test isisometric(U)
+                @test isisometric(Vᴴ; side = :right)
+
+                trunc = truncrank(dim(domain(S)) ÷ 2)
+                AMDGPU.@allowscalar begin
+                    U1, S1, Vᴴ1 = @constinferred svd_trunc(t; trunc)
+                end
+                @test t * Vᴴ1' ≈ U1 * S1
+                @test isisometric(U1)
+                @test isisometric(Vᴴ1; side = :right)
+                @test dim(domain(S1)) <= trunc.howmany
+
+                λ = minimum(minimum, values(LinearAlgebra.diag(S1)))
+                trunc = trunctol(; atol = λ - 10eps(λ))
+                AMDGPU.@allowscalar begin
+                    U2, S2, Vᴴ2 = @constinferred svd_trunc(t; trunc)
+                end
+                @test t * Vᴴ2' ≈ U2 * S2
+                @test isisometric(U2)
+                @test isisometric(Vᴴ2; side = :right)
+                @test minimum(minimum, values(LinearAlgebra.diag(S1))) >= λ
+                @test U2 ≈ U1
+                @test S2 ≈ S1
+                @test Vᴴ2 ≈ Vᴴ1
+
+                trunc = truncspace(space(S2, 1))
+                AMDGPU.@allowscalar begin
+                    U3, S3, Vᴴ3 = @constinferred svd_trunc(t; trunc)
+                end
+                @test t * Vᴴ3' ≈ U3 * S3
+                @test isisometric(U3)
+                @test isisometric(Vᴴ3; side = :right)
+                @test space(S3, 1) ≾ space(S2, 1)
+
+                trunc = truncerror(; atol = 0.5)
+                AMDGPU.@allowscalar begin
+                    U4, S4, Vᴴ4 = @constinferred svd_trunc(t; trunc)
+                end
+                @test t * Vᴴ4' ≈ U4 * S4
+                @test isisometric(U4)
+                @test isisometric(Vᴴ4; side = :right)
+                @test norm(t - U4 * S4 * Vᴴ4) <= 0.5
+            end
+        end
+
+        #=@testset "Eigenvalue decomposition" begin
+            for T in eltypes,
+                t in (rocrand(T, V1, V1),
+                      rocrand(T, W, W),
+                      rocrand(T, W, W)',
+                      ROCDiagonalTensorMap(rocrand(T, reduceddim(V1)), V1)
+                     )
+
+                d, v = @constinferred eig_full(t)
+                @test t * v ≈ v * d
+
+                d′ = LinearAlgebra.diag(d)
+                for (c, b) in LinearAlgebra.eigvals(t)
+                    @test sort(b; by=abs) ≈ sort(d′[c]; by=abs)
+                end
+
+                vdv = v' * v
+                vdv = (vdv + vdv') / 2
+                @test @constinferred isposdef(vdv)
+                t isa ROCDiagonalTensorMap || @test !isposdef(t) # unlikely for non-hermitian map
+
+                AMDGPU.@allowscalar begin # TODO
+                    d, v = @constinferred eig_trunc(t; trunc=truncrank(dim(domain(t)) ÷ 2))
+                end
+                @test t * v ≈ v * d
+                @test dim(domain(d)) ≤ dim(domain(t)) ÷ 2
+
+                t2 = (t + t')
+                D, V = eigen(t2)
+                @test isisometric(V)
+                D̃, Ṽ = @constinferred eigh_full(t2)
+                @test D ≈ D̃
+                @test V ≈ Ṽ
+                λ = minimum(minimum(real(LinearAlgebra.diag(b)))
+                            for (c, b) in blocks(D))
+                @test cond(Ṽ) ≈ one(real(T))
+                @test isposdef(t2) == isposdef(λ)
+                @test isposdef(t2 - λ * one(t2) + 0.1 * one(t2))
+                @test !isposdef(t2 - λ * one(t2) - 0.1 * one(t2))
+
+                add!(t, t')
+
+                d, v = @constinferred eigh_full(t)
+                @test t * v ≈ v * d
+                @test isunitary(v)
+
+                λ = minimum(minimum(real(LinearAlgebra.diag(b))) for (c, b) in blocks(d))
+                @test cond(v) ≈ one(real(T))
+                @test isposdef(t) == isposdef(λ)
+                @test isposdef(t - λ * one(t) + 0.1 * one(t))
+                @test !isposdef(t - λ * one(t) - 0.1 * one(t))
+
+                AMDGPU.@allowscalar begin
+                    d, v = @constinferred eigh_trunc(t; trunc=truncrank(dim(domain(t)) ÷ 2))
+                end
+                @test t * v ≈ v * d
+                @test dim(domain(d)) ≤ dim(domain(t)) ÷ 2
+            end
+        end=#
+
+        #=@testset "Condition number and rank" begin
+            for T in eltypes,
+                t in (rocrand(T, W, W), rocrand(T, W, W)',
+                      rocrand(T, W, V1), rocrand(T, V1, W),
+                      rocrand(T, W, V1)', rocrand(T, V1, W)',
+                      ROCDiagonalTensorMap(rocrand(T, reduceddim(V1)), V1))
+
+                d1, d2 = dim(codomain(t)), dim(domain(t))
+                @test rank(t) == min(d1, d2)
+                M = left_null(t)
+                @test @constinferred(rank(M)) + rank(t) == d1
+                Mᴴ = right_null(t)
+                @test rank(Mᴴ) + rank(t) == d2
+            end
+            for T in eltypes
+                u = unitary(T, V1 ⊗ V2, V1 ⊗ V2)
+                @test @constinferred(cond(u)) ≈ one(real(T))
+                @test @constinferred(rank(u)) == dim(V1 ⊗ V2)
+
+                t = rocrand(T, zero(V1), W)
+                @test rank(t) == 0
+                t2 = rocrand(T, zero(V1) * zero(V2), zero(V1) * zero(V2))
+                @test rank(t2) == 0
+                @test cond(t2) == 0.0
+            end
+            for T in eltypes, t in (rocrand(T, W, W), rocrand(T, W, W)')
+                add!(t, t')
+                vals = @constinferred LinearAlgebra.eigvals(t)
+                λmax = maximum(s -> maximum(abs, s), values(vals))
+                λmin = minimum(s -> minimum(abs, s), values(vals))
+                @test cond(t) ≈ λmax / λmin
+            end
+        end=#
+    end
+end
diff --git a/test/amd/tensors.jl b/test/amd/tensors.jl
new file mode 100644
index 000000000..7c93732eb
--- /dev/null
+++ b/test/amd/tensors.jl
@@ -0,0 +1,614 @@
+using Adapt, AMDGPU
+using Test, TestExtras
+using TensorKit, Combinatorics
+ad = adapt(Array)
+
+const AMDGPUExt = Base.get_extension(TensorKit, :TensorKitAMDGPUExt)
+@assert !isnothing(AMDGPUExt)
+const ROCTensorMap = getglobal(AMDGPUExt, :ROCTensorMap)
+const ROCDiagonalTensorMap = getglobal(AMDGPUExt, :ROCDiagonalTensorMap)
+
+@isdefined(TestSetup) || include("../setup.jl")
+using .TestSetup
+
+for V in (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VfSU₂) #, VSU₃)
+    V1, V2, V3, V4, V5 = V
+    @assert V3 * V4 * V2 ≿ V1' * V5' # necessary for leftorth tests
+    @assert V3 * V4 ≾ V1' * V2' * V5' # necessary for rightorth tests
+end
+
+spacelist = try
+    if ENV["CI"] == "true"
+        println("Detected running on CI")
+        if Sys.iswindows()
+            (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂)
+        elseif Sys.isapple()
+            (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VfU₁, VfSU₂) #, VSU₃)
+        else
+            (Vtr, Vℤ₂, Vfℤ₂, VU₁, VCU₁, VSU₂, VfSU₂) #, VSU₃)
+        end
+    else
+        (Vtr, VU₁, VSU₂, Vfℤ₂)
+    end
+catch
+    (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VfSU₂) #, VSU₃)
+end
+
+for V in spacelist
+    I = sectortype(first(V))
+    Istr = TensorKit.type_repr(I)
+    println("---------------------------------------")
+    println("AMDGPU Tensors with symmetry: $Istr")
+    println("---------------------------------------")
+    @timedtestset "Tensors with symmetry: $Istr" verbose = true begin
+        V1, V2, V3, V4, V5 = V
+        @timedtestset "Basic tensor properties" begin
+            W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5
+            for T in (Int, Float32, Float64, ComplexF32, ComplexF64)
+                t = @constinferred AMDGPU.zeros(T, W)
+                AMDGPU.@allowscalar begin
+                    @test @constinferred(hash(t)) == hash(deepcopy(t))
+                end
+                @test scalartype(t) == T
+                @test norm(t) == 0
+                @test codomain(t) == W
+                @test space(t) == (W ← one(W))
+                @test domain(t) == one(W)
+                @test typeof(t) == TensorMap{T, spacetype(t), 5, 0, ROCVector{T, AMDGPU.Mem.HIPBuffer}}
+                # blocks
+                bs = @constinferred blocks(t)
+                (c, b1), state = @constinferred Nothing iterate(bs)
+                @test c == first(blocksectors(W))
+                next = @constinferred Nothing iterate(bs, state)
+                b2 = @constinferred block(t, first(blocksectors(t)))
+                @test b1 == b2
+                @test_broken eltype(bs) === Pair{typeof(c), typeof(b1)}
+                @test_broken typeof(b1) === TensorKit.blocktype(t)
+                @test typeof(c) === sectortype(t)
+            end
+        end
+        @timedtestset "Tensor Dict conversion" begin
+            W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
+            for T in (Int, Float32, ComplexF64)
+                t = @constinferred AMDGPU.rand(T, W)
+                d = convert(Dict, t)
+                @test t == convert(ROCTensorMap, d)
+            end
+        end
+        if hasfusiontensor(I)
+            @timedtestset "Tensor Array conversion" begin
+                W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
+                W1 = V1 ← one(V1)
+                W2 = one(V2) ← V2
+                W3 = V1 ⊗ V2 ← one(V1)
+                W4 = V1 ← V2
+                W5 = one(V1) ← V1 ⊗ V2
+                W6 = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
+                for W in (W1, W2, W3, W4, W5, W6)
+                    for T in (Int, Float32, ComplexF64)
+                        if T == Int
+                            t = ROCTensorMap{T}(undef, W)
+                            for (_, b) in blocks(t)
+                                AMDGPU.@allowscalar AMDGPU.rand!(b, -20:20)
+                            end
+                        else
+                            t = @constinferred AMDGPU.randn(T, W)
+                        end
+                        AMDGPU.@allowscalar begin
+                            a = @constinferred convert(ROCArray, t)
+                        end
+                        b = reshape(a, dim(codomain(W)), dim(domain(W)))
+                        @test t ≈ @constinferred TensorMap(a, W)
+                        @test t ≈ @constinferred TensorMap(b, W)
+                        @test t === @constinferred TensorMap(t.data, W)
+                    end
+                end
+                for T in (Int, Float32, ComplexF64)
+                    t = AMDGPU.randn(T, V1 ⊗ V2 ← zero(V1))
+                    AMDGPU.@allowscalar begin
+                        a = convert(ROCArray, t)
+                    end
+                    @test norm(a) == 0
+                end
+            end
+        end
+        @timedtestset "Basic linear algebra" begin
+            W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
+            for T in (Float32, ComplexF64)
+                t = @constinferred AMDGPU.rand(T, W)
+                @test scalartype(t) == T
+                @test space(t) == W
+                @test space(t') == W'
+                @test dim(t) == dim(space(t))
+                @test codomain(t) == codomain(W)
+                @test domain(t) == domain(W)
+                # blocks for adjoint
+                bs = @constinferred blocks(t')
+                (c, b1), state = @constinferred Nothing iterate(bs)
+                @test c == first(blocksectors(W'))
+                next = @constinferred Nothing iterate(bs, state)
+                b2 = @constinferred block(t', first(blocksectors(t')))
+                @test b1 == b2
+                @test_broken eltype(bs) === Pair{typeof(c), typeof(b1)}
+                @test_broken typeof(b1) === TensorKit.blocktype(t')
+                @test typeof(c) === sectortype(t)
+                # linear algebra
+                @test isa(@constinferred(norm(t)), real(T))
+                @test norm(t)^2 ≈ dot(t, t)
+                α = rand(T)
+                @test norm(α * t) ≈ abs(α) * norm(t)
+                @test norm(t + t, 2) ≈ 2 * norm(t, 2)
+                @test norm(t + t, 1) ≈ 2 * norm(t, 1)
+                @test norm(t + t, Inf) ≈ 2 * norm(t, Inf)
+                p = 3 * rand(Float64)
+                AMDGPU.@allowscalar begin
+                    @test norm(t + t, p) ≈ 2 * norm(t, p)
+                end
+                @test norm(t) ≈ norm(t')
+
+                t2 = @constinferred AMDGPU.rand!(similar(t))
+                β = rand(T)
+                @test @constinferred(dot(β * t2, α * t)) ≈ conj(β) * α * conj(dot(t, t2))
+                @test dot(t2, t) ≈ conj(dot(t, t2))
+                @test dot(t2, t) ≈ conj(dot(t2', t'))
+                @test dot(t2, t) ≈ dot(t', t2')
+
+                i1 = @constinferred(isomorphism(T, V1 ⊗ V2, V2 ⊗ V1))
+                i2 = @constinferred(isomorphism(Vector{T}, V2 ⊗ V1, V1 ⊗ V2))
+                @test i1 * i2 == @constinferred(id(T, V1 ⊗ V2))
+                @test i2 * i1 == @constinferred(id(Vector{T}, V2 ⊗ V1))
+
+                w = @constinferred(isometry(T, V1 ⊗ (oneunit(V1) ⊕ oneunit(V1)), V1))
+                @test dim(w) == 2 * dim(V1 ← V1)
+                @test w' * w == id(Vector{T}, V1)
+                @test w * w' == (w * w')^2
+            end
+        end
+        @timedtestset "Trivial space insertion and removal" begin
+            W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
+            for T in (Float32, ComplexF64)
+                t = @constinferred AMDGPU.rand(T, W)
+                t2 = @constinferred insertleftunit(t)
+                @test t2 == @constinferred insertrightunit(t)
+                @test numind(t2) == numind(t) + 1
+                @test space(t2) == insertleftunit(space(t))
+                @test scalartype(t2) === T
+                @test t.data === t2.data
+                @test @constinferred(removeunit(t2, $(numind(t2)))) == t
+                t3 = @constinferred insertleftunit(t; copy = true)
+                @test t3 == @constinferred insertrightunit(t; copy = true)
+                @test t.data !== t3.data
+                for (c, b) in blocks(t)
+                    @test b == block(t3, c)
+                end
+                @test @constinferred(removeunit(t3, $(numind(t3)))) == t
+                t4 = @constinferred insertrightunit(t, 3; dual = true)
+                @test numin(t4) == numin(t) && numout(t4) == numout(t) + 1
+                for (c, b) in blocks(t)
+                    @test b == block(t4, c)
+                end
+                @test @constinferred(removeunit(t4, 4)) == t
+                t5 = @constinferred insertleftunit(t, 4; dual = true)
+                @test numin(t5) == numin(t) + 1 && numout(t5) == numout(t)
+                for (c, b) in blocks(t)
+                    @test b == block(t5, c)
+                end
+                @test @constinferred(removeunit(t5, 4)) == t
+            end
+        end
+        if hasfusiontensor(I)
+            @timedtestset "Basic linear algebra: test via conversion" begin
+                W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
+                for T in (Float32, ComplexF64)
+                    t = AMDGPU.rand(T, W)
+                    t2 = @constinferred AMDGPU.rand!(similar(t))
+                    @test norm(t, 2) ≈ norm(ad(t), 2)
+                    @test dot(t2, t) ≈ dot(ad(t2), ad(t))
+                    α = rand(T)
+                    @test ad(α * t) ≈ α * ad(t)
+                    @test ad(t + t) ≈ 2 * ad(t)
+                end
+            end
+            @timedtestset "Real and imaginary parts" begin
+                W = V1 ⊗ V2
+                for T in (Float64, ComplexF64, ComplexF32)
+                    t = @constinferred AMDGPU.randn(T, W, W)
+                    @test real(ad(t)) == ad(@constinferred real(t))
+                    @test imag(ad(t)) == ad(@constinferred imag(t))
+                end
+            end
+        end
+        @timedtestset "Tensor conversion" begin
+            W = V1 ⊗ V2
+            t = @constinferred AMDGPU.randn(W ← W)
+            @test typeof(convert(ROCTensorMap, t')) == typeof(t)
+            tc = complex(t)
+            @test convert(typeof(tc), t) == tc
+            @test typeof(convert(typeof(tc), t)) == typeof(tc)
+            @test typeof(convert(typeof(tc), t')) == typeof(tc)
+            @test Base.promote_typeof(t, tc) == typeof(tc)
+            @test Base.promote_typeof(tc, t) == typeof(tc + t)
+        end
+        @timedtestset "Permutations: test via inner product invariance" begin
+            W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5
+            t = AMDGPU.rand(ComplexF64, W)
+            t′ = AMDGPU.randn!(similar(t))
+            AMDGPU.@allowscalar begin
+                for k in 0:5
+                    for p in permutations(1:5)
+                        p1 = ntuple(n -> p[n], k)
+                        p2 = ntuple(n -> p[k + n], 5 - k)
+                        t2 = @constinferred permute(t, (p1, p2))
+                        t2 = permute(t, (p1, p2))
+                        @test norm(t2) ≈ norm(t)
+                        t2′ = permute(t′, (p1, p2))
+                        @test dot(t2′, t2) ≈ dot(t′, t) ≈ dot(transpose(t2′), transpose(t2))
+                    end
+                    t3 = @constinferred repartition(t, $k)
+                    t3 = repartition(t, k)
+                    @test norm(t3) ≈ norm(t)
+                    t3′ = @constinferred repartition!(similar(t3), t′)
+                    @test norm(t3′) ≈ norm(t′)
+                    @test dot(t′, t) ≈ dot(t3′, t3)
+                end
+            end
+        end
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Permutations: test via conversion" begin
+                W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5
+                t = AMDGPU.rand(ComplexF64, W)
+                a = ad(t)
+                AMDGPU.@allowscalar begin
+                    for k in 0:5
+                        for p in permutations(1:5)
+                            p1 = ntuple(n -> p[n], k)
+                            p2 = ntuple(n -> p[k + n], 5 - k)
+                            t2 = permute(t, (p1, p2))
+                            a2 = ad(t2)
+                            @test a2 ≈ permute(a, (p1, p2))
+                            @test ad(transpose(t2)) ≈ transpose(a2)
+                        end
+
+                        t3 = repartition(t, k)
+                        a3 = ad(t3)
+                        @test a3 ≈ repartition(a, k)
+                    end
+                end
+            end
+        end
+        @timedtestset "Full trace: test self-consistency" begin
+            t = AMDGPU.rand(ComplexF64, V1 ⊗ V2' ⊗ V2 ⊗ V1')
+            AMDGPU.@allowscalar begin
+                t2 = permute(t, ((1, 2), (4, 3)))
+                s = @constinferred tr(t2)
+            end
+            @test conj(s) ≈ tr(t2')
+            if !isdual(V1)
+                t2 = twist!(t2, 1)
+            end
+            if isdual(V2)
+                t2 = twist!(t2, 2)
+            end
+            AMDGPU.@allowscalar begin
+                ss = tr(t2)
+                @tensor s2 = t[a, b, b, a]
+                @tensor t3[a, b] := t[a, c, c, b]
+                @tensor s3 = t3[a, a]
+            end
+            @test ss ≈ s2
+            @test ss ≈ s3
+        end
+        @timedtestset "Partial trace: test self-consistency" begin
+            t = AMDGPU.rand(ComplexF64, V1 ⊗ V2' ⊗ V3 ⊗ V2 ⊗ V1' ⊗ V3')
+            AMDGPU.@allowscalar begin
+                @tensor t2[a, b] := t[c, d, b, d, c, a]
+                @tensor t4[a, b, c, d] := t[d, e, b, e, c, a]
+                @tensor t5[a, b] := t4[a, b, c, c]
+            end
+            @test t2 ≈ t5
+        end
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Trace: test via conversion" begin
+                t = AMDGPU.rand(ComplexF64, V1 ⊗ V2' ⊗ V3 ⊗ V2 ⊗ V1' ⊗ V3')
+                AMDGPU.@allowscalar begin
+                    @tensor t2[a, b] := t[c, d, b, d, c, a]
+                    @tensor t3[a, b] := ad(t)[c, d, b, d, c, a]
+                end
+                @test t3 ≈ ad(t2)
+            end
+        end
+        @timedtestset "Trace and contraction" begin
+            t1 = AMDGPU.rand(ComplexF64, V1 ⊗ V2 ⊗ V3)
+            t2 = AMDGPU.rand(ComplexF64, V2' ⊗ V4 ⊗ V1')
+            AMDGPU.@allowscalar begin
+                t3 = t1 ⊗ t2
+                @tensor ta[a, b] := t1[x, y, a] * t2[y, b, x]
+                @tensor tb[a, b] := t3[x, y, a, y, b, x]
+                @test ta ≈ tb
+            end
+        end
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Tensor contraction: test via conversion" begin
+                A1 = AMDGPU.randn(ComplexF64, V1' * V2', V3')
+                A2 = AMDGPU.randn(ComplexF64, V3 * V4, V5)
+                rhoL = AMDGPU.randn(ComplexF64, V1, V1)
+                rhoR = AMDGPU.randn(ComplexF64, V5, V5)' # test adjoint tensor
+                H = AMDGPU.randn(ComplexF64, V2 * V4, V2 * V4)
+                AMDGPU.@allowscalar begin
+                    @tensor HrA12[a, s1, s2, c] := rhoL[a, a'] * conj(A1[a', t1, b]) *
+                        A2[b, t2, c'] * rhoR[c', c] *
+                        H[s1, s2, t1, t2]
+
+                    @tensor HrA12array[a, s1, s2, c] := ad(rhoL)[a, a'] *
+                        conj(ad(A1)[a', t1, b]) *
+                        ad(A2)[b, t2, c'] *
+                        ad(rhoR)[c', c] *
+                        ad(H)[s1, s2, t1, t2]
+                end
+                @test HrA12array ≈ ad(HrA12)
+            end
+        end
+        @timedtestset "Index flipping: test flipping inverse" begin
+            t = AMDGPU.rand(ComplexF64, V1 ⊗ V1' ← V1' ⊗ V1)
+            AMDGPU.@allowscalar begin
+                for i in 1:4
+                    @test t ≈ flip(flip(t, i), i; inv = true)
+                    @test t ≈ flip(flip(t, i; inv = true), i)
+                end
+            end
+        end
+        #=
+        @timedtestset "Index flipping: test via explicit flip" begin
+            t  = AMDGPU.rand(ComplexF64, V1 ⊗ V1' ← V1' ⊗ V1)
+            F1 = unitary(flip(V1), V1) # TODO wrong storage type
+            AMDGPU.@allowscalar begin
+                @tensor tf[a, b; c, d] := F1[a, a'] * t[a', b; c, d]
+                @test flip(t, 1) ≈ tf
+                @tensor tf[a, b; c, d] := conj(F1[b, b']) * t[a, b'; c, d]
+                @test twist!(flip(t, 2), 2) ≈ tf
+                @tensor tf[a, b; c, d] := F1[c, c'] * t[a, b; c', d]
+                @test flip(t, 3) ≈ tf
+                @tensor tf[a, b; c, d] := conj(F1[d, d']) * t[a, b; c, d']
+                @test twist!(flip(t, 4), 4) ≈ tf
+            end
+        end =#
+        @timedtestset "Index flipping: test via contraction" begin
+            t1 = AMDGPU.rand(ComplexF64, V1 ⊗ V2 ⊗ V3 ← V4)
+            t2 = AMDGPU.rand(ComplexF64, V2' ⊗ V5 ← V4' ⊗ V1)
+            AMDGPU.@allowscalar begin
+                @tensor ta[a, b] := t1[x, y, a, z] * t2[y, b, z, x]
+                @tensor tb[a, b] := flip(t1, 1)[x, y, a, z] * flip(t2, 4)[y, b, z, x]
+                @test ta ≈ tb
+                @tensor tb[a, b] := flip(t1, (2, 4))[x, y, a, z] * flip(t2, (1, 3))[y, b, z, x]
+                @test ta ≈ tb
+                @tensor tb[a, b] := flip(t1, (1, 2, 4))[x, y, a, z] * flip(t2, (1, 3, 4))[y, b, z, x]
+                @tensor tb[a, b] := flip(t1, (1, 3))[x, y, a, z] * flip(t2, (2, 4))[y, b, z, x]
+                @test flip(ta, (1, 2)) ≈ tb
+            end
+        end
+        @timedtestset "Multiplication and inverse: test compatibility" begin
+            W1 = V1 ⊗ V2 ⊗ V3
+            W2 = V4 ⊗ V5
+            for T in (Float64, ComplexF64)
+                t1 = AMDGPU.rand(T, W1, W1)
+                t2 = AMDGPU.rand(T, W2, W2)
+                t = AMDGPU.rand(T, W1, W2)
+                @test t1 * (t1 \ t) ≈ t
+                @test (t / t2) * t2 ≈ t
+                AMDGPU.@allowscalar begin
+                    @test t1 \ one(t1) ≈ inv(t1)
+                end
+                # @test one(t1) / t1 ≈ pinv(t1) # pinv not available in AMDGPU
+                @test_throws SpaceMismatch inv(t)
+                @test_throws SpaceMismatch t2 \ t
+                @test_throws SpaceMismatch t / t1
+                # pinv not available in AMDGPU
+                # tp = pinv(t) * t
+                # @test tp ≈ tp * tp
+            end
+        end
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Multiplication and inverse: test via conversion" begin
+                W1 = V1 ⊗ V2 ⊗ V3
+                W2 = V4 ⊗ V5
+                for T in (Float32, Float64, ComplexF32, ComplexF64)
+                    t1 = AMDGPU.rand(T, W1, W1)
+                    t2 = AMDGPU.rand(T, W2, W2)
+                    t = AMDGPU.rand(T, W1, W2)
+                    d1 = dim(W1)
+                    d2 = dim(W2)
+                    AMDGPU.@allowscalar begin
+                        At1 = reshape(convert(ROCArray, t1), d1, d1)
+                        At2 = reshape(convert(ROCArray, t2), d2, d2)
+                        At = reshape(convert(ROCArray, t), d1, d2)
+                    end
+                    @test ad(t1 * t) ≈ ad(t1) * ad(t)
+                    @test ad(t1' * t) ≈ ad(t1)' * ad(t)
+                    @test ad(t2 * t') ≈ ad(t2) * ad(t)'
+                    @test ad(t2' * t') ≈ ad(t2)' * ad(t)'
+                    @test ad(inv(t1)) ≈ inv(ad(t1))
+                    # pinv not available in AMDGPU
+                    #@test ad(pinv(t)) ≈ pinv(ad(t))
+
+                    if T == Float32 || T == ComplexF32
+                        continue
+                    end
+
+                    @test ad(t1 \ t) ≈ ad(t1) \ ad(t)
+                    @test ad(t1' \ t) ≈ ad(t1)' \ ad(t)
+                    @test ad(t2 \ t') ≈ ad(t2) \ ad(t)'
+                    @test ad(t2' \ t') ≈ ad(t2)' \ ad(t)'
+
+                    @test ad(t2 / t) ≈ ad(t2) / ad(t)
+                    @test ad(t2' / t) ≈ ad(t2)' / ad(t)
+                    @test ad(t1 / t') ≈ ad(t1) / ad(t)'
+                    @test ad(t1' / t') ≈ ad(t1)' / ad(t)'
+                end
+            end
+        end
+        #=
+        @timedtestset "diag/diagm" begin
+            W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
+            t = AMDGPU.randn(ComplexF64, W)
+            d = LinearAlgebra.diag(t)
+            D = LinearAlgebra.diagm(codomain(t), domain(t), d)
+            @test LinearAlgebra.isdiag(D)
+            @test LinearAlgebra.diag(D) == d
+        end=# # TODO
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Tensor functions" begin
+                W = V1 ⊗ V2
+                for T in (Float64, ComplexF64)
+                    t = AMDGPU.randn(T, W, W)
+                    s = dim(W)
+                    AMDGPU.@allowscalar begin
+                        expt = @constinferred exp(t)
+                        @test ad(expt) ≈ exp(ad(t))
+
+                        @test (@constinferred sqrt(t))^2 ≈ t
+                        @test ad(sqrt(t)) ≈ sqrt(ad(t))
+
+                        @test exp(@constinferred log(expt)) ≈ expt
+                        @test ad(log(expt)) ≈ log(ad(expt))
+
+                        @test (@constinferred cos(t))^2 + (@constinferred sin(t))^2 ≈
+                            id(storagetype(t), W)
+                        @test (@constinferred tan(t)) ≈ sin(t) / cos(t)
+                        @test (@constinferred cot(t)) ≈ cos(t) / sin(t)
+                        @test (@constinferred cosh(t))^2 - (@constinferred sinh(t))^2 ≈
+                            id(storagetype(t), W)
+                        @test (@constinferred tanh(t)) ≈ sinh(t) / cosh(t)
+                        @test (@constinferred coth(t)) ≈ cosh(t) / sinh(t)
+
+                        t1 = sin(t)
+                        @test sin(@constinferred asin(t1)) ≈ t1
+                        t2 = cos(t)
+                        @test cos(@constinferred acos(t2)) ≈ t2
+                        t3 = sinh(t)
+                        @test sinh(@constinferred asinh(t3)) ≈ t3
+                        t4 = cosh(t)
+                        @test cosh(@constinferred acosh(t4)) ≈ t4
+                        t5 = tan(t)
+                        @test tan(@constinferred atan(t5)) ≈ t5
+                        t6 = cot(t)
+                        @test cot(@constinferred acot(t6)) ≈ t6
+                        t7 = tanh(t)
+                        @test tanh(@constinferred atanh(t7)) ≈ t7
+                        t8 = coth(t)
+                        @test coth(@constinferred acoth(t8)) ≈ t8
+                    end
+                end
+            end
+        end
+        # Sylvester not defined for AMDGPU
+        # @timedtestset "Sylvester equation" begin
+        #     for T in (Float32, ComplexF64)
+        #         tA = AMDGPU.rand(T, V1 ⊗ V3, V1 ⊗ V3)
+        #         tB = AMDGPU.rand(T, V2 ⊗ V4, V2 ⊗ V4)
+        #         tA = 3 // 2 * leftorth(tA; alg=Polar())[1]
+        #         tB = 1 // 5 * leftorth(tB; alg=Polar())[1]
+        #         tC = AMDGPU.rand(T, V1 ⊗ V3, V2 ⊗ V4)
+        #         t = @constinferred sylvester(tA, tB, tC)
+        #         @test codomain(t) == V1 ⊗ V3
+        #         @test domain(t) == V2 ⊗ V4
+        #         @test norm(tA * t + t * tB + tC) <
+        #               (norm(tA) + norm(tB) + norm(tC)) * eps(real(T))^(2 / 3)
+        #         if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+        #             matrix(x) = reshape(convert(Array, x), dim(codomain(x)), dim(domain(x)))
+        #             @test matrix(t) ≈ sylvester(matrix(tA), matrix(tB), matrix(tC))
+        #         end
+        #     end
+        # end
+        #
+        # TODO
+        @timedtestset "Tensor product: test via norm preservation" begin
+            for T in (Float32, ComplexF64)
+                t1 = AMDGPU.rand(T, V2 ⊗ V3 ⊗ V1, V1 ⊗ V2)
+                t2 = AMDGPU.rand(T, V2 ⊗ V1 ⊗ V3, V1 ⊗ V1)
+                AMDGPU.@allowscalar begin
+                    t = @constinferred (t1 ⊗ t2)
+                end
+                @test norm(t) ≈ norm(t1) * norm(t2)
+            end
+        end
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Tensor product: test via conversion" begin
+                for T in (Float32, ComplexF64)
+                    t1 = AMDGPU.rand(T, V2 ⊗ V3 ⊗ V1, V1)
+                    t2 = AMDGPU.rand(T, V2 ⊗ V1 ⊗ V3, V2)
+                    AMDGPU.@allowscalar begin
+                        t = @constinferred (t1 ⊗ t2)
+                    end
+                    d1 = dim(codomain(t1))
+                    d2 = dim(codomain(t2))
+                    d3 = dim(domain(t1))
+                    d4 = dim(domain(t2))
+                    At = ad(t)
+                    AMDGPU.@allowscalar begin
+                        @test ad(t) ≈ ad(t1) ⊗ ad(t2)
+                    end
+                end
+            end
+        end
+        @timedtestset "Tensor product: test via tensor contraction" begin
+            for T in (Float32, ComplexF64)
+                t1 = AMDGPU.rand(T, V2 ⊗ V3 ⊗ V1)
+                t2 = AMDGPU.rand(T, V2 ⊗ V1 ⊗ V3)
+                AMDGPU.@allowscalar begin
+                    t = @constinferred (t1 ⊗ t2)
+                    @tensor t′[1, 2, 3, 4, 5, 6] := t1[1, 2, 3] * t2[4, 5, 6]
+                    @test t ≈ t′
+                end
+            end
+        end
+        @timedtestset "Tensor absorpsion" begin
+            # absorbing small into large
+            t1 = AMDGPU.zeros(V1 ⊕ V1, V2 ⊗ V3)
+            t2 = AMDGPU.rand(V1, V2 ⊗ V3)
+            AMDGPU.@allowscalar begin
+                t3 = @constinferred absorb(t1, t2)
+            end
+            @test norm(t3) ≈ norm(t2)
+            @test norm(t1) == 0
+            AMDGPU.@allowscalar begin
+                t4 = @constinferred absorb!(t1, t2)
+            end
+            @test t1 === t4
+            @test t3 ≈ t4
+
+            # absorbing large into small
+            t1 = AMDGPU.rand(V1 ⊕ V1, V2 ⊗ V3)
+            t2 = AMDGPU.zeros(V1, V2 ⊗ V3)
+            AMDGPU.@allowscalar begin
+                t3 = @constinferred absorb(t2, t1)
+            end
+            @test norm(t3) < norm(t1)
+            @test norm(t2) == 0
+            AMDGPU.@allowscalar begin
+                t4 = @constinferred absorb!(t2, t1)
+            end
+            @test t2 === t4
+            @test t3 ≈ t4
+        end
+    end
+    TensorKit.empty_globalcaches!()
+end
+
+@timedtestset "Deligne tensor product: test via conversion" begin
+    Vlists1 = (Vtr,) # VSU₂)
+    Vlists2 = (Vtr,) # Vℤ₂)
+    @testset for Vlist1 in Vlists1, Vlist2 in Vlists2
+        V1, V2, V3, V4, V5 = Vlist1
+        W1, W2, W3, W4, W5 = Vlist2
+        for T in (Float32, ComplexF64)
+            t1 = AMDGPU.rand(T, V1 ⊗ V2, V3' ⊗ V4)
+            t2 = AMDGPU.rand(T, W2, W1 ⊗ W1')
+            t = @constinferred (t1 ⊠ t2)
+            d1 = dim(codomain(t1))
+            d2 = dim(codomain(t2))
+            d3 = dim(domain(t1))
+            d4 = dim(domain(t2))
+            @test ad(t1) ⊠ ad(t2) ≈ ad(t1 ⊠ t2)
+        end
+    end
+end
diff --git a/test/cuda/factorizations.jl b/test/cuda/factorizations.jl
new file mode 100644
index 000000000..d095d270d
--- /dev/null
+++ b/test/cuda/factorizations.jl
@@ -0,0 +1,450 @@
+using Test, TestExtras
+using TensorKit
+using LinearAlgebra: LinearAlgebra
+using CUDA, cuTENSOR
+
+const CUDAExt = Base.get_extension(TensorKit, :TensorKitCUDAExt)
+@assert !isnothing(CUDAExt)
+const CuTensorMap = getglobal(CUDAExt, :CuTensorMap)
+const CuDiagonalTensorMap = getglobal(CUDAExt, :CuDiagonalTensorMap)
+
+@isdefined(TestSetup) || include("../setup.jl")
+using .TestSetup
+
+spacelist = try
+    if ENV["CI"] == "true"
+        println("Detected running on CI")
+        if Sys.iswindows()
+            (Vtr, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂)
+        elseif Sys.isapple()
+            (Vtr, Vℤ₃, VfU₁, VfSU₂)
+        else
+            (Vtr, VU₁, VCU₁, VSU₂, VfSU₂)
+        end
+    else
+        (Vtr, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VfSU₂)
+    end
+catch
+    (Vtr, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VfSU₂)
+end
+
+import CUDA: rand as curand, randn as curandn
+
+
+eltypes = (Float32, ComplexF64)
+
+for V in spacelist
+    I = sectortype(first(V))
+    Istr = TensorKit.type_repr(I)
+    println("---------------------------------------")
+    println("CUDA Factorizations with symmetry: $Istr")
+    println("---------------------------------------")
+    @timedtestset "CUDA Factorizations with symmetry: $Istr" verbose = true begin
+        V1, V2, V3, V4, V5 = V
+        W = V1 ⊗ V2
+        @testset "QR decomposition" begin
+            for T in eltypes,
+                    t in (
+                        curand(T, W, W), curand(T, W, W)', curand(T, W, V1), curand(T, V1, W)',
+                        CuDiagonalTensorMap(curand(T, reduceddim(V1)), V1),
+                    )
+
+                Q, R = @constinferred qr_full(t)
+                @test Q * R ≈ t
+                @test isunitary(Q)
+
+                Q, R = @constinferred qr_compact(t)
+                @test Q * R ≈ t
+                @test isisometric(Q)
+
+                Q, R = @constinferred left_orth(t)
+                @test Q * R ≈ t
+                @test isisometric(Q)
+
+                N = @constinferred qr_null(t)
+                @test isisometric(N)
+                @test norm(N' * t) ≈ 0 atol = 100 * eps(norm(t))
+
+                N = @constinferred left_null(t)
+                @test isisometric(N)
+                @test norm(N' * t) ≈ 0 atol = 100 * eps(norm(t))
+            end
+
+            # empty tensor
+            for T in eltypes
+                t = curand(T, V1 ⊗ V2, zero(V1))
+
+                Q, R = @constinferred qr_full(t)
+                @test Q * R ≈ t
+                @test isunitary(Q)
+                @test dim(R) == dim(t) == 0
+
+                Q, R = @constinferred qr_compact(t)
+                @test Q * R ≈ t
+                @test isisometric(Q)
+                @test dim(Q) == dim(R) == dim(t)
+
+                Q, R = @constinferred left_orth(t)
+                @test Q * R ≈ t
+                @test isisometric(Q)
+                @test dim(Q) == dim(R) == dim(t)
+
+                N = @constinferred qr_null(t)
+                @test isunitary(N)
+                @test norm(N' * t) ≈ 0 atol = 100 * eps(norm(t))
+            end
+        end
+        @testset "LQ decomposition" begin
+            for T in eltypes,
+                    t in (
+                        curand(T, W, W), curand(T, W, W)', curand(T, W, V1), curand(T, V1, W)',
+                        CuDiagonalTensorMap(curand(T, reduceddim(V1)), V1),
+                    )
+
+                L, Q = @constinferred lq_full(t)
+                @test L * Q ≈ t
+                @test isunitary(Q)
+
+                L, Q = @constinferred lq_compact(t)
+                @test L * Q ≈ t
+                @test isisometric(Q; side = :right)
+
+                L, Q = @constinferred right_orth(t)
+                @test L * Q ≈ t
+                @test isisometric(Q; side = :right)
+
+                Nᴴ = @constinferred lq_null(t)
+                @test isisometric(Nᴴ; side = :right)
+                @test norm(t * Nᴴ') ≈ 0 atol = 100 * eps(norm(t))
+            end
+
+            for T in eltypes
+                # empty tensor
+                t = curand(T, zero(V1), V1 ⊗ V2)
+
+                L, Q = @constinferred lq_full(t)
+                @test L * Q ≈ t
+                @test isunitary(Q)
+                @test dim(L) == dim(t) == 0
+
+                L, Q = @constinferred lq_compact(t)
+                @test L * Q ≈ t
+                @test isisometric(Q; side = :right)
+                @test dim(Q) == dim(L) == dim(t)
+
+                L, Q = @constinferred right_orth(t)
+                @test L * Q ≈ t
+                @test isisometric(Q; side = :right)
+                @test dim(Q) == dim(L) == dim(t)
+
+                Nᴴ = @constinferred lq_null(t)
+                @test isunitary(Nᴴ)
+                @test norm(t * Nᴴ') ≈ 0 atol = 100 * eps(norm(t))
+            end
+        end
+        @testset "Polar decomposition" begin
+            @testset for T in eltypes,
+                    t in (
+                        curand(T, W, W),
+                        curand(T, W, W)',
+                        curand(T, W, V1),
+                        curand(T, V1, W)',
+                        CuDiagonalTensorMap(curand(T, reduceddim(V1)), V1),
+                    )
+
+                @assert domain(t) ≾ codomain(t)
+                w, p = @constinferred left_polar(t)
+                @test w * p ≈ t
+                @test isisometric(w)
+                @test isposdef(project_hermitian!(p))
+
+                w, p = @constinferred left_orth(t; alg = :polar)
+                @test w * p ≈ t
+                @test isisometric(w)
+            end
+
+            @testset for T in eltypes,
+                    t in (
+                        curand(T, W, W), curand(T, W, W)', curand(T, V1, W), curand(T, W, V1)',
+                        CuDiagonalTensorMap(curand(T, reduceddim(V1)), V1),
+                    )
+
+                @assert codomain(t) ≾ domain(t)
+                p, wᴴ = @constinferred right_polar(t)
+                @test p * wᴴ ≈ t
+                @test isisometric(wᴴ; side = :right)
+                @test isposdef(project_hermitian!(p))
+
+                p, wᴴ = @constinferred right_orth(t; alg = :polar)
+                @test p * wᴴ ≈ t
+                @test isisometric(wᴴ; side = :right)
+            end
+        end
+        @testset "SVD" begin
+            for T in eltypes,
+                    t in (
+                        curand(T, W, W), curand(T, W, W)',
+                        curand(T, W, V1), curand(T, V1, W),
+                        curand(T, W, V1)', curand(T, V1, W)',
+                        CuDiagonalTensorMap(curand(T, reduceddim(V1)), V1),
+                    )
+
+                u, s, vᴴ = @constinferred svd_full(t)
+                @test u * s * vᴴ ≈ t
+                @test isunitary(u)
+                @test isunitary(vᴴ)
+
+                u, s, vᴴ = @constinferred svd_compact(t)
+                @test u * s * vᴴ ≈ t
+                @test isisometric(u)
+                @test isposdef(s)
+                @test isisometric(vᴴ; side = :right)
+
+                s′ = LinearAlgebra.diag(s)
+                for (c, b) in LinearAlgebra.svdvals(t)
+                    @test b ≈ s′[c]
+                end
+
+                v, c = @constinferred left_orth(t; alg = :svd)
+                @test v * c ≈ t
+                @test isisometric(v)
+
+                N = @constinferred left_null(t; alg = :svd)
+                @test isisometric(N)
+                @test norm(N' * t) ≈ 0 atol = 100 * eps(norm(t))
+
+                Nᴴ = @constinferred right_null(t; alg = :svd)
+                @test isisometric(Nᴴ; side = :right)
+                @test norm(t * Nᴴ') ≈ 0 atol = 100 * eps(norm(t))
+            end
+
+            # empty tensor
+            for T in eltypes, t in (curand(T, W, zero(V1)), curand(T, zero(V1), W))
+                U, S, Vᴴ = @constinferred svd_full(t)
+                @test U * S * Vᴴ ≈ t
+                @test isunitary(U)
+                @test isunitary(Vᴴ)
+
+                U, S, Vᴴ = @constinferred svd_compact(t)
+                @test U * S * Vᴴ ≈ t
+                @test dim(U) == dim(S) == dim(Vᴴ) == dim(t) == 0
+            end
+        end
+
+        @testset "truncated SVD" begin
+            for T in eltypes,
+                    t in (
+                        curandn(T, W, W), curandn(T, W, W)',
+                        curandn(T, W, V1), curandn(T, V1, W),
+                        curandn(T, W, V1)', curandn(T, V1, W)',
+                        CuDiagonalTensorMap(curandn(T, reduceddim(V1)), V1),
+                    )
+
+                @constinferred normalize!(t)
+
+                CUDA.@allowscalar begin
+                    U, S, Vᴴ = @constinferred svd_trunc(t; trunc = notrunc())
+                end
+                @test U * S * Vᴴ ≈ t
+                @test isisometric(U)
+                @test isisometric(Vᴴ; side = :right)
+
+                trunc = truncrank(dim(domain(S)) ÷ 2)
+                CUDA.@allowscalar begin
+                    U1, S1, Vᴴ1 = @constinferred svd_trunc(t; trunc)
+                end
+                @test t * Vᴴ1' ≈ U1 * S1
+                @test isisometric(U1)
+                @test isisometric(Vᴴ1; side = :right)
+                @test dim(domain(S1)) <= trunc.howmany
+
+                λ = minimum(minimum, values(LinearAlgebra.diag(S1)))
+                trunc = trunctol(; atol = λ - 10eps(λ))
+                CUDA.@allowscalar begin
+                    U2, S2, Vᴴ2 = @constinferred svd_trunc(t; trunc)
+                end
+                @test t * Vᴴ2' ≈ U2 * S2
+                @test isisometric(U2)
+                @test isisometric(Vᴴ2; side = :right)
+                @test minimum(minimum, values(LinearAlgebra.diag(S1))) >= λ
+                @test U2 ≈ U1
+                @test S2 ≈ S1
+                @test Vᴴ2 ≈ Vᴴ1
+
+                trunc = truncspace(space(S2, 1))
+                CUDA.@allowscalar begin
+                    U3, S3, Vᴴ3 = @constinferred svd_trunc(t; trunc)
+                end
+                @test t * Vᴴ3' ≈ U3 * S3
+                @test isisometric(U3)
+                @test isisometric(Vᴴ3; side = :right)
+                @test space(S3, 1) ≾ space(S2, 1)
+
+                trunc = truncerror(; atol = 0.5)
+                CUDA.@allowscalar begin
+                    U4, S4, Vᴴ4 = @constinferred svd_trunc(t; trunc)
+                end
+                @test t * Vᴴ4' ≈ U4 * S4
+                @test isisometric(U4)
+                @test isisometric(Vᴴ4; side = :right)
+                @test norm(t - U4 * S4 * Vᴴ4) <= 0.5
+            end
+        end
+
+        @testset "Eigenvalue decomposition" begin
+            for T in eltypes,
+                    t in (
+                        curand(T, V1, V1),
+                        curand(T, W, W),
+                        curand(T, W, W)',
+                        CuDiagonalTensorMap(curand(T, reduceddim(V1)), V1),
+                    )
+
+                d, v = @constinferred eig_full(t)
+                @test t * v ≈ v * d
+
+                d′ = LinearAlgebra.diag(d)
+                for (c, b) in LinearAlgebra.eigvals(t)
+                    @test sort(b; by = abs) ≈ sort(d′[c]; by = abs)
+                end
+
+                vdv = v' * v
+                vdv = (vdv + vdv') / 2
+                @test @constinferred isposdef(vdv)
+                t isa CuDiagonalTensorMap || @test !isposdef(t) # unlikely for non-hermitian map
+
+                CUDA.@allowscalar begin # TODO
+                    d, v = @constinferred eig_trunc(t; trunc = truncrank(dim(domain(t)) ÷ 2))
+                end
+                @test t * v ≈ v * d
+                @test dim(domain(d)) ≤ dim(domain(t)) ÷ 2
+
+                t2 = (t + t')
+                D, V = eigen(t2)
+                @test isisometric(V)
+                D̃, Ṽ = @constinferred eigh_full(t2)
+                @test D ≈ D̃
+                @test V ≈ Ṽ
+                λ = minimum(
+                    minimum(real(LinearAlgebra.diag(b)))
+                        for (c, b) in blocks(D)
+                )
+                @test cond(Ṽ) ≈ one(real(T))
+                @test isposdef(t2) == isposdef(λ)
+                @test isposdef(t2 - λ * one(t2) + 0.1 * one(t2))
+                @test !isposdef(t2 - λ * one(t2) - 0.1 * one(t2))
+
+                add!(t, t')
+
+                d, v = @constinferred eigh_full(t)
+                @test t * v ≈ v * d
+                @test isunitary(v)
+
+                λ = minimum(minimum(real(LinearAlgebra.diag(b))) for (c, b) in blocks(d))
+                @test cond(v) ≈ one(real(T))
+                @test isposdef(t) == isposdef(λ)
+                @test isposdef(t - λ * one(t) + 0.1 * one(t))
+                @test !isposdef(t - λ * one(t) - 0.1 * one(t))
+
+                CUDA.@allowscalar begin
+                    d, v = @constinferred eigh_trunc(t; trunc = truncrank(dim(domain(t)) ÷ 2))
+                end
+                @test t * v ≈ v * d
+                @test dim(domain(d)) ≤ dim(domain(t)) ÷ 2
+            end
+        end
+
+        @testset "Condition number and rank" begin
+            for T in eltypes,
+                    t in (
+                        curand(T, W, W), curand(T, W, W)',
+                        curand(T, W, V1), curand(T, V1, W),
+                        curand(T, W, V1)', curand(T, V1, W)',
+                        CuDiagonalTensorMap(curand(T, reduceddim(V1)), V1),
+                    )
+
+                d1, d2 = dim(codomain(t)), dim(domain(t))
+                @test rank(t) == min(d1, d2)
+                M = left_null(t)
+                @test @constinferred(rank(M)) + rank(t) == d1
+                Mᴴ = right_null(t)
+                @test rank(Mᴴ) + rank(t) == d2
+            end
+            for T in eltypes
+                u = unitary(T, V1 ⊗ V2, V1 ⊗ V2)
+                @test @constinferred(cond(u)) ≈ one(real(T))
+                @test @constinferred(rank(u)) == dim(V1 ⊗ V2)
+
+                t = curand(T, zero(V1), W)
+                @test rank(t) == 0
+                t2 = curand(T, zero(V1) * zero(V2), zero(V1) * zero(V2))
+                @test rank(t2) == 0
+                @test cond(t2) == 0.0
+            end
+            for T in eltypes, t in (curand(T, W, W), curand(T, W, W)')
+                add!(t, t')
+                vals = @constinferred LinearAlgebra.eigvals(t)
+                λmax = maximum(s -> maximum(abs, s), values(vals))
+                λmin = minimum(s -> minimum(abs, s), values(vals))
+                @test cond(t) ≈ λmax / λmin
+            end
+        end
+
+        @testset "Hermitian projections" begin
+            for T in eltypes,
+                    t in (
+                        curand(T, V1, V1), curand(T, W, W), curand(T, W, W)',
+                        CuDiagonalTensorMap(curand(T, reduceddim(V1)), V1),
+                    )
+                normalize!(t)
+                noisefactor = eps(real(T))^(3 / 4)
+
+                th = (t + t') / 2
+                ta = (t - t') / 2
+                tc = copy(t)
+
+                th′ = @constinferred project_hermitian(t)
+                @test ishermitian(th′)
+                @test th′ ≈ th
+                @test t == tc
+                th_approx = th + noisefactor * ta
+                @test !ishermitian(th_approx) || (T <: Real && t isa CuDiagonalTensorMap)
+                @test ishermitian(th_approx; atol = 10 * noisefactor)
+
+                ta′ = project_antihermitian(t)
+                @test isantihermitian(ta′)
+                @test ta′ ≈ ta
+                @test t == tc
+                ta_approx = ta + noisefactor * th
+                @test !isantihermitian(ta_approx)
+                @test isantihermitian(ta_approx; atol = 10 * noisefactor) || (T <: Real && t isa CuDiagonalTensorMap)
+            end
+        end
+
+        @testset "Isometric projections" begin
+            for T in eltypes,
+                    t in (
+                        curandn(T, W, W), curandn(T, W, W)',
+                        curandn(T, W, V1), curandn(T, V1, W)',
+                    )
+                t2 = project_isometric(t)
+                @test isisometric(t2)
+                t3 = project_isometric(t2)
+                @test t3 ≈ t2 # stability of the projection
+                @test t2 * (t2' * t) ≈ t
+
+                tc = similar(t)
+                t3 = @constinferred project_isometric!(copy!(tc, t), t2)
+                @test t3 === t2
+                @test isisometric(t2)
+
+                # test that t2 is closer to A then any other isometry
+                for k in 1:10
+                    δt = randn!(similar(t))
+                    t3 = project_isometric(t + δt / 100)
+                    @test norm(t - t3) > norm(t - t2)
+                end
+            end
+        end
+    end
+end
diff --git a/test/cuda/tensors.jl b/test/cuda/tensors.jl
new file mode 100644
index 000000000..147d8c8af
--- /dev/null
+++ b/test/cuda/tensors.jl
@@ -0,0 +1,608 @@
+using Adapt, CUDA, cuTENSOR
+using Test, TestExtras
+using TensorKit, Combinatorics
+ad = adapt(Array)
+const CUDAExt = Base.get_extension(TensorKit, :TensorKitCUDAExt)
+@assert !isnothing(CUDAExt)
+const CuTensorMap = getglobal(CUDAExt, :CuTensorMap)
+const CuDiagonalTensorMap = getglobal(CUDAExt, :CuDiagonalTensorMap)
+
+@isdefined(TestSetup) || include("../setup.jl")
+using .TestSetup
+
+for V in (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VfSU₂) #, VSU₃)
+    V1, V2, V3, V4, V5 = V
+    @assert V3 * V4 * V2 ≿ V1' * V5' # necessary for leftorth tests
+    @assert V3 * V4 ≾ V1' * V2' * V5' # necessary for rightorth tests
+end
+
+spacelist = try
+    if ENV["CI"] == "true"
+        println("Detected running on CI")
+        if Sys.iswindows()
+            (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂)
+        elseif Sys.isapple()
+            (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VfU₁, VfSU₂) #, VSU₃)
+        else
+            (Vtr, Vℤ₂, Vfℤ₂, VU₁, VCU₁, VSU₂, VfSU₂) #, VSU₃)
+        end
+    else
+        (Vtr, VU₁, VSU₂, Vfℤ₂)
+    end
+catch
+    (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VfSU₂) #, VSU₃)
+end
+
+for V in spacelist
+    I = sectortype(first(V))
+    Istr = TensorKit.type_repr(I)
+    println("---------------------------------------")
+    println("CUDA Tensors with symmetry: $Istr")
+    println("---------------------------------------")
+    @timedtestset "Tensors with symmetry: $Istr" verbose = true begin
+        V1, V2, V3, V4, V5 = V
+        @timedtestset "Basic tensor properties" begin
+            W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5
+            for T in (Int, Float32, Float64, ComplexF32, ComplexF64)
+                t = @constinferred CUDA.zeros(T, W)
+                CUDA.@allowscalar begin
+                    @test @constinferred(hash(t)) == hash(deepcopy(t))
+                end
+                @test scalartype(t) == T
+                @test norm(t) == 0
+                @test codomain(t) == W
+                @test space(t) == (W ← one(W))
+                @test domain(t) == one(W)
+                @test typeof(t) == TensorMap{T, spacetype(t), 5, 0, CuVector{T, CUDA.DeviceMemory}}
+                # blocks
+                bs = @constinferred blocks(t)
+                (c, b1), state = @constinferred Nothing iterate(bs)
+                @test c == first(blocksectors(W))
+                next = @constinferred Nothing iterate(bs, state)
+                b2 = @constinferred block(t, first(blocksectors(t)))
+                @test b1 == b2
+                @test_broken eltype(bs) === Pair{typeof(c), typeof(b1)}
+                @test_broken typeof(b1) === TensorKit.blocktype(t)
+                @test typeof(c) === sectortype(t)
+            end
+        end
+        @timedtestset "Tensor Dict conversion" begin
+            W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
+            for T in (Int, Float32, ComplexF64)
+                t = @constinferred CUDA.rand(T, W)
+                d = convert(Dict, t)
+                @test t == convert(CuTensorMap, d)
+            end
+        end
+        if hasfusiontensor(I) || I == Trivial
+            @timedtestset "Tensor Array conversion" begin
+                W1 = V1 ← one(V1)
+                W2 = one(V2) ← V2
+                W3 = V1 ⊗ V2 ← one(V1)
+                W4 = V1 ← V2
+                W5 = one(V1) ← V1 ⊗ V2
+                W6 = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
+                for W in (W1, W2, W3, W4, W5, W6)
+                    for T in (Int, Float32, ComplexF64)
+                        if T == Int
+                            t = CuTensorMap{T}(undef, W)
+                            for (_, b) in blocks(t)
+                                CUDA.@allowscalar CUDA.rand!(b, -20:20)
+                            end
+                        else
+                            t = @constinferred CUDA.randn(T, W)
+                        end
+                        #=CUDA.@allowscalar begin
+                            a = @constinferred convert(CuArray, t)
+                        end
+                        b = reshape(a, dim(codomain(W)), dim(domain(W)))
+                        @test t ≈ @constinferred CuTensorMap(a, W)
+                        @test t ≈ @constinferred CuTensorMap(b, W)
+                        @test t === @constinferred CuTensorMap(t.data, W)=# # TODO convert(CuArray broken
+                    end
+                end
+                for T in (Int, Float32, ComplexF64)
+                    t = CUDA.randn(T, V1 ⊗ V2 ← zero(V1))
+                    CUDA.@allowscalar begin
+                        a = convert(CuArray, t)
+                    end
+                    @test norm(a) == 0
+                end
+            end
+        end
+        @timedtestset "Basic linear algebra" begin
+            W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
+            for T in (Float32, ComplexF64)
+                t = @constinferred CUDA.rand(T, W)
+                @test scalartype(t) == T
+                @test space(t) == W
+                @test space(t') == W'
+                @test dim(t) == dim(space(t))
+                @test codomain(t) == codomain(W)
+                @test domain(t) == domain(W)
+                # blocks for adjoint
+                bs = @constinferred blocks(t')
+                (c, b1), state = @constinferred Nothing iterate(bs)
+                @test c == first(blocksectors(W'))
+                next = @constinferred Nothing iterate(bs, state)
+                b2 = @constinferred block(t', first(blocksectors(t')))
+                @test b1 == b2
+                @test_broken eltype(bs) === Pair{typeof(c), typeof(b1)}
+                @test_broken typeof(b1) === TensorKit.blocktype(t')
+                @test typeof(c) === sectortype(t)
+                # linear algebra
+                @test isa(@constinferred(norm(t)), real(T))
+                @test norm(t)^2 ≈ dot(t, t)
+                α = rand(T)
+                @test norm(α * t) ≈ abs(α) * norm(t)
+                @test norm(t + t, 2) ≈ 2 * norm(t, 2)
+                @test norm(t + t, 1) ≈ 2 * norm(t, 1)
+                @test norm(t + t, Inf) ≈ 2 * norm(t, Inf)
+                p = 3 * rand(Float64)
+                CUDA.@allowscalar begin
+                    @test norm(t + t, p) ≈ 2 * norm(t, p)
+                end
+                @test norm(t) ≈ norm(t')
+
+                t2 = @constinferred rand!(similar(t))
+                β = rand(T)
+                #@test @constinferred(dot(β * t2, α * t)) ≈ conj(β) * α * conj(dot(t, t2)) # broken for Irrep[CU₁]
+                @test dot(β * t2, α * t) ≈ conj(β) * α * conj(dot(t, t2))
+                @test dot(t2, t) ≈ conj(dot(t, t2))
+                @test dot(t2, t) ≈ conj(dot(t2', t'))
+                @test dot(t2, t) ≈ dot(t', t2')
+
+                i1 = @constinferred(isomorphism(T, V1 ⊗ V2, V2 ⊗ V1))
+                i2 = @constinferred(isomorphism(CuVector{T}, V2 ⊗ V1, V1 ⊗ V2))
+                CUDA.@allowscalar begin
+                    #@test i1 * i2 == @constinferred(id(T, V1 ⊗ V2)) # TODO
+                    #@test i2 * i1 == @constinferred(id(CuVector{T}, V2 ⊗ V1)) # TODO
+                    w = @constinferred(isometry(T, V1 ⊗ (oneunit(V1) ⊕ oneunit(V1)), V1))
+                    @test dim(w) == 2 * dim(V1 ← V1)
+                    @test w' * w == id(CuVector{T}, V1)
+                    @test w * w' == (w * w')^2
+                end
+            end
+        end
+        @timedtestset "Trivial space insertion and removal" begin
+            W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
+            for T in (Float32, ComplexF64)
+                t = @constinferred CUDA.rand(T, W)
+                t2 = @constinferred insertleftunit(t)
+                @test t2 == @constinferred insertrightunit(t)
+                @test numind(t2) == numind(t) + 1
+                @test space(t2) == insertleftunit(space(t))
+                @test scalartype(t2) === T
+                @test t.data === t2.data
+                @test @constinferred(removeunit(t2, $(numind(t2)))) == t
+                t3 = @constinferred insertleftunit(t; copy = true)
+                @test t3 == @constinferred insertrightunit(t; copy = true)
+                @test t.data !== t3.data
+                for (c, b) in blocks(t)
+                    @test b == block(t3, c)
+                end
+                @test @constinferred(removeunit(t3, $(numind(t3)))) == t
+                t4 = @constinferred insertrightunit(t, 3; dual = true)
+                @test numin(t4) == numin(t) && numout(t4) == numout(t) + 1
+                for (c, b) in blocks(t)
+                    @test b == block(t4, c)
+                end
+                @test @constinferred(removeunit(t4, 4)) == t
+                t5 = @constinferred insertleftunit(t, 4; dual = true)
+                @test numin(t5) == numin(t) + 1 && numout(t5) == numout(t)
+                for (c, b) in blocks(t)
+                    @test b == block(t5, c)
+                end
+                @test @constinferred(removeunit(t5, 4)) == t
+            end
+        end
+        if hasfusiontensor(I)
+            #=@timedtestset "Basic linear algebra: test via conversion" begin
+                W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
+                for T in (Float32, ComplexF64)
+                    t = CUDA.rand(T, W)
+                    t2 = @constinferred CUDA.rand!(similar(t))
+                    @test norm(t, 2) ≈ norm(convert(CuArray, t), 2)
+                    @test dot(t2, t) ≈ dot(convert(CuArray, t2), convert(CuArray, t))
+                    α = rand(T)
+                    @test convert(CuArray, α * t) ≈ α * convert(CuArray, t)
+                    @test convert(CuArray, t + t) ≈ 2 * convert(CuArray, t)
+                end
+            end
+            @timedtestset "Real and imaginary parts" begin
+                W = V1 ⊗ V2
+                for T in (Float64, ComplexF64, ComplexF32)
+                    t = @constinferred CUDA.randn(T, W, W)
+
+                    tr = @constinferred real(t)
+                    @test scalartype(tr) <: Real
+                    @test real(convert(CuArray, t)) == convert(CuArray, tr)
+
+                    ti = @constinferred imag(t)
+                    @test scalartype(ti) <: Real
+                    @test imag(convert(CuArray, t)) == convert(CuArray, ti)
+
+                    tc = @inferred complex(t)
+                    @test scalartype(tc) <: Complex
+                    @test complex(convert(CuArray, t)) == convert(CuArray, tc)
+
+                    tc2 = @inferred complex(tr, ti)
+                    @test tc2 ≈ tc
+                end
+            end=# # TODO convert(CuArray is broken
+        end
+        @timedtestset "Tensor conversion" begin
+            W = V1 ⊗ V2
+            t = @constinferred CUDA.randn(W ← W)
+            @test typeof(convert(CuTensorMap, t')) == typeof(t)
+            tc = complex(t)
+            @test convert(typeof(tc), t) == tc
+            @test typeof(convert(typeof(tc), t)) == typeof(tc)
+            @test typeof(convert(typeof(tc), t')) == typeof(tc)
+            @test Base.promote_typeof(t, tc) == typeof(tc)
+            @test Base.promote_typeof(tc, t) == typeof(tc + t)
+        end
+        #=@timedtestset "diag/diagm" begin
+            W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
+            t = CUDA.randn(ComplexF64, W)
+            d = LinearAlgebra.diag(t)
+            # TODO find a way to use CUDA here
+            D = LinearAlgebra.diagm(codomain(t), domain(t), d)
+            @test LinearAlgebra.isdiag(D)
+            @test LinearAlgebra.diag(D) == d
+        end=#
+        @timedtestset "Permutations: test via inner product invariance" begin
+            W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5
+            t = CUDA.rand(ComplexF64, W)
+            t′ = CUDA.randn!(similar(t))
+            for k in 0:5
+                for p in permutations(1:5)
+                    p1 = ntuple(n -> p[n], k)
+                    p2 = ntuple(n -> p[k + n], 5 - k)
+                    CUDA.@allowscalar begin
+                        t2 = @constinferred permute(t, (p1, p2))
+                        t2 = permute(t, (p1, p2))
+                        @test norm(t2) ≈ norm(t)
+                        t2′ = permute(t′, (p1, p2))
+                        @test dot(t2′, t2) ≈ dot(t′, t) ≈ dot(transpose(t2′), transpose(t2))
+                    end
+                end
+
+                CUDA.@allowscalar begin
+                    t3 = @constinferred repartition(t, $k)
+                    t3 = repartition(t, k)
+                    @test norm(t3) ≈ norm(t)
+                    t3′ = @constinferred repartition!(similar(t3), t′)
+                    @test norm(t3′) ≈ norm(t′)
+                    @test dot(t′, t) ≈ dot(t3′, t3)
+                end
+            end
+        end
+        #=if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Permutations: test via conversion" begin
+                W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5
+                t = CUDA.rand(ComplexF64, W)
+                a = convert(CuArray, t)
+                for k in 0:5
+                    for p in permutations(1:5)
+                        p1 = ntuple(n -> p[n], k)
+                        p2 = ntuple(n -> p[k + n], 5 - k)
+                        t2 = permute(t, (p1, p2))
+                        a2 = convert(CuArray, t2)
+                        @test a2 ≈ permutedims(a, (p1..., p2...))
+                        @test convert(CuArray, transpose(t2)) ≈
+                            permutedims(a2, (5, 4, 3, 2, 1))
+                    end
+
+                    t3 = repartition(t, k)
+                    a3 = convert(CuArray, t3)
+                    @test a3 ≈ permutedims(
+                        a, (ntuple(identity, k)..., reverse(ntuple(i -> i + k, 5 - k))...)
+                    )
+                end
+            end
+        end=# # convert(CuArray broken
+        @timedtestset "Full trace: test self-consistency" begin
+            t = CUDA.rand(ComplexF64, V1 ⊗ V2' ⊗ V2 ⊗ V1')
+            CUDA.@allowscalar begin
+                t2 = permute(t, ((1, 2), (4, 3)))
+                s = @constinferred tr(t2)
+                @test conj(s) ≈ tr(t2')
+                if !isdual(V1)
+                    t2 = twist!(t2, 1)
+                end
+                if isdual(V2)
+                    t2 = twist!(t2, 2)
+                end
+                ss = tr(t2)
+                @tensor s2 = t[a, b, b, a]
+                @tensor t3[a, b] := t[a, c, c, b]
+                @tensor s3 = t3[a, a]
+            end
+            @test ss ≈ s2
+            @test ss ≈ s3
+        end
+        @timedtestset "Partial trace: test self-consistency" begin
+            t = CUDA.rand(ComplexF64, V1 ⊗ V2' ⊗ V3 ⊗ V2 ⊗ V1' ⊗ V3')
+            @tensor t2[a, b] := t[c, d, b, d, c, a]
+            @tensor t4[a, b, c, d] := t[d, e, b, e, c, a]
+            @tensor t5[a, b] := t4[a, b, c, c]
+            @test t2 ≈ t5
+        end
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Trace: test via conversion" begin
+                t = CUDA.rand(ComplexF64, V1 ⊗ V2' ⊗ V3 ⊗ V2 ⊗ V1' ⊗ V3')
+                CUDA.@allowscalar begin
+                    @tensor t2[a, b] := t[c, d, b, d, c, a]
+                    @tensor t3[a, b] := ad(t)[c, d, b, d, c, a]
+                end
+                @test t3 ≈ ad(t2)
+            end
+        end
+        @timedtestset "Trace and contraction" begin
+            t1 = CUDA.rand(ComplexF64, V1 ⊗ V2 ⊗ V3)
+            t2 = CUDA.rand(ComplexF64, V2' ⊗ V4 ⊗ V1')
+            CUDA.@allowscalar begin
+                t3 = t1 ⊗ t2
+                @tensor ta[a, b] := t1[x, y, a] * t2[y, b, x]
+                @tensor tb[a, b] := t3[x, y, a, y, b, x]
+            end
+            @test ta ≈ tb
+        end
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Tensor contraction: test via conversion" begin
+                A1 = CUDA.randn(ComplexF64, V1' * V2', V3')
+                A2 = CUDA.randn(ComplexF64, V3 * V4, V5)
+                rhoL = CUDA.randn(ComplexF64, V1, V1)
+                rhoR = CUDA.randn(ComplexF64, V5, V5)' # test adjoint tensor
+                H = CUDA.randn(ComplexF64, V2 * V4, V2 * V4)
+                CUDA.@allowscalar begin
+                    @tensor HrA12[a, s1, s2, c] := rhoL[a, a'] * conj(A1[a', t1, b]) *
+                        A2[b, t2, c'] * rhoR[c', c] *
+                        H[s1, s2, t1, t2]
+
+                    @tensor HrA12array[a, s1, s2, c] := ad(rhoL)[a, a'] *
+                        conj(ad(A1)[a', t1, b]) *
+                        ad(A2)[b, t2, c'] *
+                        ad(rhoR)[c', c] *
+                        ad(H)[s1, s2, t1, t2]
+                end
+                @test HrA12array ≈ ad(HrA12)
+            end
+        end
+        @timedtestset "Index flipping: test flipping inverse" begin
+            t = CUDA.rand(ComplexF64, V1 ⊗ V1' ← V1' ⊗ V1)
+            for i in 1:4
+                CUDA.@allowscalar begin
+                    @test t ≈ flip(flip(t, i), i; inv = true)
+                    @test t ≈ flip(flip(t, i; inv = true), i)
+                end
+            end
+        end
+        #=@timedtestset "Index flipping: test via explicit flip" begin
+            t = CUDA.rand(ComplexF64, V1 ⊗ V1' ← V1' ⊗ V1)
+            F1 = unitary(flip(V1), V1)
+
+            CUDA.@allowscalar begin
+                @tensor tf[a, b; c, d] := F1[a, a'] * t[a', b; c, d]
+                @test flip(t, 1) ≈ tf
+                @tensor tf[a, b; c, d] := conj(F1[b, b']) * t[a, b'; c, d]
+                @test twist!(flip(t, 2), 2) ≈ tf
+                @tensor tf[a, b; c, d] := F1[c, c'] * t[a, b; c', d]
+                @test flip(t, 3) ≈ tf
+                @tensor tf[a, b; c, d] := conj(F1[d, d']) * t[a, b; c, d']
+                @test twist!(flip(t, 4), 4) ≈ tf
+            end
+        end=# # TODO
+        @timedtestset "Index flipping: test via contraction" begin
+            t1 = CUDA.rand(ComplexF64, V1 ⊗ V2 ⊗ V3 ← V4)
+            t2 = CUDA.rand(ComplexF64, V2' ⊗ V5 ← V4' ⊗ V1)
+            CUDA.@allowscalar begin
+                @tensor ta[a, b] := t1[x, y, a, z] * t2[y, b, z, x]
+                @tensor tb[a, b] := flip(t1, 1)[x, y, a, z] * flip(t2, 4)[y, b, z, x]
+                @test ta ≈ tb
+                @tensor tb[a, b] := flip(t1, (2, 4))[x, y, a, z] * flip(t2, (1, 3))[y, b, z, x]
+                @test ta ≈ tb
+                @tensor tb[a, b] := flip(t1, (1, 2, 4))[x, y, a, z] * flip(t2, (1, 3, 4))[y, b, z, x]
+                @tensor tb[a, b] := flip(t1, (1, 3))[x, y, a, z] * flip(t2, (2, 4))[y, b, z, x]
+                @test flip(ta, (1, 2)) ≈ tb
+            end
+        end
+        @timedtestset "Multiplication of isometries: test properties" begin
+            W2 = V4 ⊗ V5
+            W1 = W2 ⊗ (oneunit(V1) ⊕ oneunit(V1))
+            for T in (Float64, ComplexF64)
+                t1 = randisometry(T, CuMatrix{T}, W1, W2)
+                t2 = randisometry(T, CuMatrix{T}, W2 ← W2)
+                @test isisometric(t1)
+                @test isunitary(t2)
+                P = t1 * t1'
+                @test P * P ≈ P
+            end
+        end
+        @timedtestset "Multiplication and inverse: test compatibility" begin
+            W1 = V1 ⊗ V2 ⊗ V3
+            W2 = V4 ⊗ V5
+            for T in (Float64, ComplexF64)
+                t1 = CUDA.rand(T, W1, W1)
+                t2 = CUDA.rand(T, W2, W2)
+                t = CUDA.rand(T, W1, W2)
+                @test t1 * (t1 \ t) ≈ t
+                @test (t / t2) * t2 ≈ t
+                @test t1 \ one(t1) ≈ inv(t1)
+                @test one(t1) / t1 ≈ pinv(t1)
+                @test_throws SpaceMismatch inv(t)
+                @test_throws SpaceMismatch t2 \ t
+                @test_throws SpaceMismatch t / t1
+                tp = pinv(t) * t
+                @test tp ≈ tp * tp
+            end
+        end
+        #=if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Multiplication and inverse: test via conversion" begin
+                W1 = V1 ⊗ V2 ⊗ V3
+                W2 = V4 ⊗ V5
+                for T in (Float32, Float64, ComplexF32, ComplexF64)
+                    t1 = CUDA.rand(T, W1, W1)
+                    t2 = CUDA.rand(T, W2, W2)
+                    t = CUDA.rand(T, W1, W2)
+                    d1 = dim(W1)
+                    d2 = dim(W2)
+                    At1 = reshape(convert(CuArray, t1), d1, d1)
+                    At2 = reshape(convert(CuArray, t2), d2, d2)
+                    At = reshape(convert(CuArray, t), d1, d2)
+                    @test reshape(convert(CuArray, t1 * t), d1, d2) ≈ At1 * At
+                    @test reshape(convert(CuArray, t1' * t), d1, d2) ≈ At1' * At
+                    @test reshape(convert(CuArray, t2 * t'), d2, d1) ≈ At2 * At'
+                    @test reshape(convert(CuArray, t2' * t'), d2, d1) ≈ At2' * At'
+
+                    @test reshape(convert(CuArray, inv(t1)), d1, d1) ≈ inv(At1)
+                    @test reshape(convert(CuArray, pinv(t)), d2, d1) ≈ pinv(At)
+
+                    if T == Float32 || T == ComplexF32
+                        continue
+                    end
+
+                    @test reshape(convert(CuArray, t1 \ t), d1, d2) ≈ At1 \ At
+                    @test reshape(convert(CuArray, t1' \ t), d1, d2) ≈ At1' \ At
+                    @test reshape(convert(CuArray, t2 \ t'), d2, d1) ≈ At2 \ At'
+                    @test reshape(convert(CuArray, t2' \ t'), d2, d1) ≈ At2' \ At'
+
+                    @test reshape(convert(CuArray, t2 / t), d2, d1) ≈ At2 / At
+                    @test reshape(convert(CuArray, t2' / t), d2, d1) ≈ At2' / At
+                    @test reshape(convert(CuArray, t1 / t'), d1, d2) ≈ At1 / At'
+                    @test reshape(convert(CuArray, t1' / t'), d1, d2) ≈ At1' / At'
+                end
+            end
+        end=# # convert(CuArray broken
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Tensor functions" begin
+                W = V1 ⊗ V2
+                for T in (Float64, ComplexF64)
+                    t = CUDA.randn(T, W, W)
+                    s = dim(W)
+                    CUDA.@allowscalar begin
+                        #=@test (@constinferred sqrt(t))^2 ≈ t
+                        @test ad(sqrt(t)) ≈ sqrt(ad(t))
+                        
+                        expt = @constinferred exp(t)
+                        @test ad(expt) ≈ exp(ad(t))
+
+                        @test exp(@constinferred log(expt)) ≈ expt
+                        @test ad(log(expt)) ≈ log(ad(expt))
+
+                        @test (@constinferred cos(t))^2 + (@constinferred sin(t))^2 ≈
+                              id(storagetype(t), W)
+                        @test (@constinferred tan(t)) ≈ sin(t) / cos(t)
+                        @test (@constinferred cot(t)) ≈ cos(t) / sin(t)
+                        @test (@constinferred cosh(t))^2 - (@constinferred sinh(t))^2 ≈
+                              id(storagetype(t), W)
+                        @test (@constinferred tanh(t)) ≈ sinh(t) / cosh(t)
+                        @test (@constinferred coth(t)) ≈ cosh(t) / sinh(t)
+
+                        t1 = sin(t)
+                        @test sin(@constinferred asin(t1)) ≈ t1
+                        t2 = cos(t)
+                        @test cos(@constinferred acos(t2)) ≈ t2
+                        t3 = sinh(t)
+                        @test sinh(@constinferred asinh(t3)) ≈ t3
+                        t4 = cosh(t)
+                        @test cosh(@constinferred acosh(t4)) ≈ t4
+                        t5 = tan(t)
+                        @test tan(@constinferred atan(t5)) ≈ t5
+                        t6 = cot(t)
+                        @test cot(@constinferred acot(t6)) ≈ t6
+                        t7 = tanh(t)
+                        @test tanh(@constinferred atanh(t7)) ≈ t7
+                        t8 = coth(t)
+                        @test coth(@constinferred acoth(t8)) ≈ t8
+                        =# #exp not supported for CuArray
+                    end
+                end
+            end
+        end
+        # Sylvester not defined for CUDA
+        # @timedtestset "Sylvester equation" begin
+        #     for T in (Float32, ComplexF64)
+        #         tA = CUDA.rand(T, V1 ⊗ V3, V1 ⊗ V3)
+        #         tB = CUDA.rand(T, V2 ⊗ V4, V2 ⊗ V4)
+        #         tA = 3 // 2 * leftorth(tA; alg=Polar())[1]
+        #         tB = 1 // 5 * leftorth(tB; alg=Polar())[1]
+        #         tC = CUDA.rand(T, V1 ⊗ V3, V2 ⊗ V4)
+        #         t = @constinferred sylvester(tA, tB, tC)
+        #         @test codomain(t) == V1 ⊗ V3
+        #         @test domain(t) == V2 ⊗ V4
+        #         @test norm(tA * t + t * tB + tC) <
+        #               (norm(tA) + norm(tB) + norm(tC)) * eps(real(T))^(2 / 3)
+        #         if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+        #             matrix(x) = reshape(convert(Array, x), dim(codomain(x)), dim(domain(x)))
+        #             @test matrix(t) ≈ sylvester(matrix(tA), matrix(tB), matrix(tC))
+        #         end
+        #     end
+        # end
+        #
+        # TODO
+        @timedtestset "Tensor product: test via norm preservation" begin
+            for T in (Float32, ComplexF64)
+                t1 = CUDA.rand(T, V2 ⊗ V3 ⊗ V1, V1 ⊗ V2)
+                t2 = CUDA.rand(T, V2 ⊗ V1 ⊗ V3, V1 ⊗ V1)
+                CUDA.@allowscalar begin
+                    t = @constinferred (t1 ⊗ t2)
+                end
+                @test norm(t) ≈ norm(t1) * norm(t2)
+            end
+        end
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Tensor product: test via conversion" begin
+                for T in (Float32, ComplexF64)
+                    t1 = CUDA.rand(T, V2 ⊗ V3 ⊗ V1, V1)
+                    t2 = CUDA.rand(T, V2 ⊗ V1 ⊗ V3, V2)
+                    d1 = dim(codomain(t1))
+                    d2 = dim(codomain(t2))
+                    d3 = dim(domain(t1))
+                    d4 = dim(domain(t2))
+                    CUDA.@allowscalar begin
+                        t = @constinferred (t1 ⊗ t2)
+                        At = ad(t)
+                        @test ad(t) ≈ ad(t1) ⊗ ad(t2)
+                    end
+                end
+            end
+        end
+        @timedtestset "Tensor product: test via tensor contraction" begin
+            for T in (Float32, ComplexF64)
+                t1 = CUDA.rand(T, V2 ⊗ V3 ⊗ V1)
+                t2 = CUDA.rand(T, V2 ⊗ V1 ⊗ V3)
+                CUDA.@allowscalar begin
+                    t = @constinferred (t1 ⊗ t2)
+                    @tensor t′[1, 2, 3, 4, 5, 6] := t1[1, 2, 3] * t2[4, 5, 6]
+                    # @test t ≈ t′ # TODO broken for symmetry: Irrep[ℤ₃]
+                end
+            end
+        end
+    end
+    TensorKit.empty_globalcaches!()
+end
+
+@timedtestset "Deligne tensor product: test via conversion" begin
+    Vlists1 = (Vtr,) # VSU₂)
+    Vlists2 = (Vtr,) # Vℤ₂)
+    @testset for Vlist1 in Vlists1, Vlist2 in Vlists2
+        V1, V2, V3, V4, V5 = Vlist1
+        W1, W2, W3, W4, W5 = Vlist2
+        for T in (Float32, ComplexF64)
+            t1 = CUDA.rand(T, V1 ⊗ V2, V3' ⊗ V4)
+            t2 = CUDA.rand(T, W2, W1 ⊗ W1')
+            CUDA.@allowscalar begin
+                t = @constinferred (t1 ⊠ t2)
+            end
+            d1 = dim(codomain(t1))
+            d2 = dim(codomain(t2))
+            d3 = dim(domain(t1))
+            d4 = dim(domain(t2))
+            CUDA.@allowscalar begin
+                @test ad(t1) ⊠ ad(t2) ≈ ad(t1 ⊠ t2)
+            end
+        end
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 854991a7f..2c63a8dde 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -2,8 +2,6 @@
 # ------------
 using ArgParse: ArgParse
 using SafeTestsets: @safetestset
-# using CUDA: CUDA
-# using AMDGPU: AMDGPU
 
 function parse_commandline(args = ARGS)
     s = ArgParse.ArgParseSettings()
@@ -49,9 +47,15 @@ istestfile(fn) = endswith(fn, ".jl") && !contains(fn, "setup")
 
     # handle GPU cases separately
     if group == "cuda"
-        # CUDA.functional() || continue
+        using CUDA
+        CUDA.functional() || continue
+        @time include("cuda/tensors.jl")
+        @time include("cuda/factorizations.jl")
     elseif group == "amd"
-        # AMDGPU.functional() || continue
+        using AMDGPU
+        AMDGPU.functional() || continue
+        #@time include("amd/tensors.jl")
+        @time include("amd/factorizations.jl")
     elseif is_buildkite
         continue
     end