From aebbe3618c84ec2f9f2e1c5627ce3eaebdfcce45 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Wed, 25 Jun 2025 09:01:42 -0400 Subject: [PATCH 01/31] Change Treetransformer structures Rewrite treetransformers Introduce `StridedStructure` --- src/spaces/homspace.jl | 6 +- src/tensors/indexmanipulations.jl | 68 ++++++++++++---- src/tensors/treetransformers.jl | 129 +++++++++++++++++++++++++----- 3 files changed, 170 insertions(+), 33 deletions(-) diff --git a/src/spaces/homspace.jl b/src/spaces/homspace.jl index bdb534663..b2ac5f894 100644 --- a/src/spaces/homspace.jl +++ b/src/spaces/homspace.jl @@ -225,11 +225,15 @@ end # Block and fusion tree ranges: structure information for building tensors #-------------------------------------------------------------------------- + +# sizes, strides, offset +const StridedStructure{N} = Tuple{NTuple{N,Int},NTuple{N,Int},Int} + struct FusionBlockStructure{I,N,F₁,F₂} totaldim::Int blockstructure::SectorDict{I,Tuple{Tuple{Int,Int},UnitRange{Int}}} fusiontreelist::Vector{Tuple{F₁,F₂}} - fusiontreestructure::Vector{Tuple{NTuple{N,Int},NTuple{N,Int},Int}} + fusiontreestructure::Vector{StridedStructure{N}} fusiontreeindices::FusionTreeDict{Tuple{F₁,F₂},Int} end diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index 24478e3f7..c5840ed3a 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -485,23 +485,16 @@ end function add_transform_kernel!(tdst::TensorMap, tsrc::TensorMap, - (p₁, p₂)::Index2Tuple, + p::Index2Tuple, transformer::AbelianTreeTransformer, α::Number, β::Number, backend::AbstractBackend...) - structure_dst = transformer.structure_dst.fusiontreestructure - structure_src = transformer.structure_src.fusiontreestructure - # TODO: this could be multithreaded - for (row, col, val) in zip(transformer.rows, transformer.cols, transformer.vals) - sz_dst, str_dst, offset_dst = structure_dst[col] - subblock_dst = StridedView(tdst.data, sz_dst, str_dst, offset_dst) - - sz_src, str_src, offset_src = structure_src[row] - subblock_src = StridedView(tsrc.data, sz_src, str_src, offset_src) - - TO.tensoradd!(subblock_dst, subblock_src, (p₁, p₂), false, α * val, β, backend...) + for (stridestructure_dst, stridestructure_src, coeff) in transformer.data + subblock_dst = StridedView(tdst.data, stridestructure_dst...) + subblock_src = StridedView(tsrc.data, stridestructure_src...) + TO.tensoradd!(subblock_dst, subblock_src, p, false, α * coeff, β, backend...) end return nothing @@ -514,8 +507,8 @@ function add_transform_kernel!(tdst::TensorMap, α::Number, β::Number, backend::AbstractBackend...) - structure_dst = transformer.structure_dst.fusiontreestructure - structure_src = transformer.structure_src.fusiontreestructure + structure_dst = transformer.structure_dst + structure_src = transformer.structure_src rows = rowvals(transformer.matrix) vals = nonzeros(transformer.matrix) @@ -545,6 +538,53 @@ function add_transform_kernel!(tdst::TensorMap, return tdst end +function add_transform_kernel!(tdst::TensorMap, + tsrc::TensorMap, + p::Index2Tuple, + transformer::OuterTreeTransformer, + α::Number, + β::Number, + backend::AbstractBackend...) + # preallocate buffers - for now overshooting the size by assuming maximal value + buffer1 = similar(tsrc.data) + buffer2 = similar(buffer1) + + # TODO: this could be multithreaded + for (basistransform, structures_dst, structures_src) in transformer.data + # Special case without intermediate buffers whenever there is only a single block + if length(basistransform) == 1 + subblock_dst = StridedView(tdst.data, only(structures_dst)...) + subblock_src = StridedView(tsrc.data, only(structures_src)...) + TO.tensoradd!(subblock_dst, subblock_src, p, false, α * only(basistransform), β, + backend...) + continue + end + + rows, cols = size(basistransform) + sz_src = first(first(structures_src)) + blocksize = prod(sz_src) + + # Filling up a buffer with contiguous data + buffer_src = StridedView(buffer1, (blocksize, cols), (1, blocksize), 1) + for (i, structure_src) in enumerate(structures_src) + subblock_src = StridedView(tsrc.data, structure_src...) + copyto!(@view(buffer_src[:, i]), subblock_src) + end + + # Resummation into a second buffer using BLAS + buffer_dst = StridedView(buffer2, (blocksize, rows), (1, blocksize), 1) + mul!(buffer_dst, buffer_src, basistransform) + + # Filling up the output + for (i, structure_dst) in enumerate(structures_dst) + subblock_dst = StridedView(tdst.data, structure_dst...) + bufblock_dst = sreshape(@view(buffer_dst[:, i]), sz_src) + TO.tensoradd!(subblock_dst, bufblock_dst, p, false, One(), β) + end + end + return tdst +end + function add_transform_kernel!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple, diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl index 053c5f3ba..048377f13 100644 --- a/src/tensors/treetransformers.jl +++ b/src/tensors/treetransformers.jl @@ -7,18 +7,36 @@ abstract type TreeTransformer end struct TrivialTreeTransformer <: TreeTransformer end -struct AbelianTreeTransformer{T,I,N,F1,F2,F3,F4} <: TreeTransformer - rows::Vector{Int} - cols::Vector{Int} - vals::Vector{T} - structure_dst::FusionBlockStructure{I,N,F1,F2} - structure_src::FusionBlockStructure{I,N,F3,F4} +struct AbelianTreeTransformer{T,N} <: TreeTransformer + data::Vector{Tuple{StridedStructure{N},StridedStructure{N},T}} end -struct GenericTreeTransformer{T,I,N,F1,F2,F3,F4} <: TreeTransformer +function AbelianTreeTransformer(transform, p, Vsrc, Vdst) + permute(Vsrc, p) == Vdst || throw(SpaceMismatch("Incompatible spaces for permuting.")) + structure_dst = fusionblockstructure(Vdst) + structure_src = fusionblockstructure(Vsrc) + + L = length(structure_src.fusiontreelist) + T = sectorscalartype(sectortype(Vdst)) + N = numind(Vsrc) + data = Vector{Tuple{StridedStructure{N},StridedStructure{N},T}}(undef, L) + + for i in 1:L + f₁, f₂ = structure_src.fusiontreelist[i] + (f₃, f₄), coeff = only(transform(f₁, f₂)) + j = structure_dst.fusiontreeindices[(f₃, f₄)] + stridestructure_dst = structure_dst.fusiontreestructure[j] + stridestructure_src = structure_src.fusiontreestructure[i] + data[i] = (stridestructure_dst, stridestructure_src, coeff) + end + + return AbelianTreeTransformer(data) +end + +struct GenericTreeTransformer{T,N} <: TreeTransformer matrix::SparseMatrixCSC{T,Int} - structure_dst::FusionBlockStructure{I,N,F1,F2} - structure_src::FusionBlockStructure{I,N,F3,F4} + structure_dst::Vector{StridedStructure{N}} + structure_src::Vector{StridedStructure{N}} end function treetransformertype(Vdst, Vsrc) @@ -38,8 +56,8 @@ function treetransformertype(Vdst, Vsrc) end end -function TreeTransformer(transform::Function, Vdst::HomSpace{S}, - Vsrc::HomSpace{S}) where {S} +function TreeTransformer(transform::Function, Vsrc::HomSpace{S}, + Vdst::HomSpace{S}) where {S} I = sectortype(Vdst) I === Trivial && return TrivialTreeTransformer() @@ -58,14 +76,89 @@ function TreeTransformer(transform::Function, Vdst::HomSpace{S}, push!(vals, coeff) end end + ldst = length(structure_dst.fusiontreelist) + lsrc = length(structure_src.fusiontreelist) + matrix = sparse(rows, cols, vals, ldst, lsrc) - if FusionStyle(I) isa UniqueFusion - return AbelianTreeTransformer(rows, cols, vals, structure_dst, structure_src) + return GenericTreeTransformer(matrix, + structure_dst.fusiontreestructure, + structure_src.fusiontreestructure) +end + +struct OuterTreeTransformer{T,N} <: TreeTransformer + data::Vector{Tuple{Matrix{T},Vector{StridedStructure{N}},Vector{StridedStructure{N}}}} +end + +function OuterTreeTransformer(transform, p, Vsrc, Vdst) + permute(Vsrc, p) == Vdst || throw(SpaceMismatch("Incompatible spaces for permuting.")) + structure_dst = fusionblockstructure(Vdst) + structure_src = fusionblockstructure(Vsrc) + I = sectortype(Vsrc) + + uncoupleds_src = map(structure_src.fusiontreelist) do (f₁, f₂) + return TupleTools.vcat(f₁.uncoupled, f₂.uncoupled) + end + uncoupleds_src_unique = unique(uncoupleds_src) + + uncoupleds_dst = map(structure_dst.fusiontreelist) do (f₁, f₂) + return TupleTools.vcat(f₁.uncoupled, f₂.uncoupled) + end + + outer_data = map(uncoupleds_src_unique) do uncoupled + ids_src = findall(==(uncoupled), uncoupleds_src) + fusiontrees_outer_src = structure_src.fusiontreelist[ids_src] + + uncoupled_dst = TupleTools.getindices(uncoupled, (p[1]..., p[2]...)) + ids_dst = findall(==(uncoupled_dst), uncoupleds_dst) + + fusiontrees_outer_dst = structure_dst.fusiontreelist[ids_dst] + + matrix = zeros(sectorscalartype(I), length(ids_dst), length(ids_src)) + for (col, (f₁, f₂)) in enumerate(fusiontrees_outer_src) + for ((f₃, f₄), coeff) in transform(f₁, f₂) + row = findfirst(==((f₃, f₄)), fusiontrees_outer_dst)::Int + matrix[row, col] = coeff + end + end + + return (matrix, + structure_dst.fusiontreestructure[ids_dst], + structure_src.fusiontreestructure[ids_src]) + end + return OuterTreeTransformer(outer_data) +end + +useouter() = true + +function treetransformertype(Vdst, Vsrc) + I = sectortype(Vdst) + I === Trivial && return TrivialTreeTransformer + + T = sectorscalartype(I) + N = numind(Vdst) + if useouter() + return FusionStyle(I) == UniqueFusion() ? AbelianTreeTransformer{T,N} : + OuterTreeTransformer{T,N} + else + return FusionStyle(I) == UniqueFusion() ? AbelianTreeTransformer{T,N} : + GenericTreeTransformer{T,N} + end +end + +function TreeTransformer(transform::Function, p, Vsrc::HomSpace{S}, + Vdst::HomSpace{S}) where {S} + permute(Vsrc, p) == Vdst || throw(SpaceMismatch("Incompatible spaces for permuting.")) + + I = sectortype(Vdst) + I === Trivial && return TrivialTreeTransformer() + + FusionStyle(I) == UniqueFusion() && + return AbelianTreeTransformer(transform, p, Vsrc, Vdst) + + if useouter() + return OuterTreeTransformer(transform, p, Vsrc, Vdst) else - ldst = length(structure_dst.fusiontreelist) - lsrc = length(structure_src.fusiontreelist) - matrix = sparse(rows, cols, vals, ldst, lsrc) - return GenericTreeTransformer(matrix, structure_dst, structure_src) + return GenericTreeTransformer(transform, p, Vsrc, Vdst) end end @@ -94,7 +187,7 @@ for (transform, treetransformer) in @cached function $treetransformer(Vdst::TensorMapSpace, Vsrc::TensorMapSpace, p::Index2Tuple)::treetransformertype(Vdst, Vsrc) fusiontreetransform(f1, f2) = $transform(f1, f2, p...) - return TreeTransformer(fusiontreetransform, Vdst, Vsrc) + return TreeTransformer(fusiontreetransform, Vsrc, Vdst) end end end From cbd0c73686a23cbc3e80adafda3411aa787ae710 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Wed, 25 Jun 2025 19:59:22 -0400 Subject: [PATCH 02/31] small fixes --- src/tensors/treetransformers.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl index 048377f13..3695b972d 100644 --- a/src/tensors/treetransformers.jl +++ b/src/tensors/treetransformers.jl @@ -114,9 +114,9 @@ function OuterTreeTransformer(transform, p, Vsrc, Vdst) fusiontrees_outer_dst = structure_dst.fusiontreelist[ids_dst] matrix = zeros(sectorscalartype(I), length(ids_dst), length(ids_src)) - for (col, (f₁, f₂)) in enumerate(fusiontrees_outer_src) + for (row, (f₁, f₂)) in enumerate(fusiontrees_outer_src) for ((f₃, f₄), coeff) in transform(f₁, f₂) - row = findfirst(==((f₃, f₄)), fusiontrees_outer_dst)::Int + col = findfirst(==((f₃, f₄)), fusiontrees_outer_dst)::Int matrix[row, col] = coeff end end @@ -172,7 +172,7 @@ end @cached function treebraider(Vdst::TensorMapSpace, Vsrc::TensorMapSpace, p::Index2Tuple, levels)::treetransformertype(Vdst, Vsrc) fusiontreebraider(f1, f2) = braid(f1, f2, levels..., p...) - return TreeTransformer(fusiontreebraider, Vdst, Vsrc) + return TreeTransformer(fusiontreebraider, p, Vdst, Vsrc) end for (transform, treetransformer) in @@ -187,7 +187,7 @@ for (transform, treetransformer) in @cached function $treetransformer(Vdst::TensorMapSpace, Vsrc::TensorMapSpace, p::Index2Tuple)::treetransformertype(Vdst, Vsrc) fusiontreetransform(f1, f2) = $transform(f1, f2, p...) - return TreeTransformer(fusiontreetransform, Vsrc, Vdst) + return TreeTransformer(fusiontreetransform, p, Vsrc, Vdst) end end end From 1b95c4ad50ea76310d316946e3a4c8e8fe38d520 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Wed, 25 Jun 2025 20:05:39 -0400 Subject: [PATCH 03/31] fix overestimating the sizes of the buffers --- src/tensors/indexmanipulations.jl | 10 ++++++---- src/tensors/treetransformers.jl | 9 +++++++++ 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index c5840ed3a..305abde0d 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -546,8 +546,8 @@ function add_transform_kernel!(tdst::TensorMap, β::Number, backend::AbstractBackend...) # preallocate buffers - for now overshooting the size by assuming maximal value - buffer1 = similar(tsrc.data) - buffer2 = similar(buffer1) + buffer1 = eltype(tsrc.data)[] + buffer2 = eltype(tsrc.data)[] # TODO: this could be multithreaded for (basistransform, structures_dst, structures_src) in transformer.data @@ -565,14 +565,16 @@ function add_transform_kernel!(tdst::TensorMap, blocksize = prod(sz_src) # Filling up a buffer with contiguous data - buffer_src = StridedView(buffer1, (blocksize, cols), (1, blocksize), 1) + resize!(buffer1, blocksize * cols) # ensure large enough + buffer_src = StridedView(buffer1, (blocksize, cols), (1, blocksize), 0) for (i, structure_src) in enumerate(structures_src) subblock_src = StridedView(tsrc.data, structure_src...) copyto!(@view(buffer_src[:, i]), subblock_src) end # Resummation into a second buffer using BLAS - buffer_dst = StridedView(buffer2, (blocksize, rows), (1, blocksize), 1) + resize!(buffer2, blocksize * rows) # ensure large enough + buffer_dst = StridedView(buffer2, (blocksize, rows), (1, blocksize), 0) mul!(buffer_dst, buffer_src, basistransform) # Filling up the output diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl index 3695b972d..110f75214 100644 --- a/src/tensors/treetransformers.jl +++ b/src/tensors/treetransformers.jl @@ -125,9 +125,18 @@ function OuterTreeTransformer(transform, p, Vsrc, Vdst) structure_dst.fusiontreestructure[ids_dst], structure_src.fusiontreestructure[ids_src]) end + + # sort by (approximate) weight to make the buffers happy + # and use round-robin strategy for multi-threading + sort!(outer_data; by=_transformer_weight, rev=true) + return OuterTreeTransformer(outer_data) end +function _transformer_weight((matrix, structures_dst, structures_src)) + return size(matrix, 1) * prod(structures_dst[1][1]) +end + useouter() = true function treetransformertype(Vdst, Vsrc) From 7b064f3f504855864edfa3351c3347e1dc9de825 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Tue, 1 Jul 2025 13:59:32 +0200 Subject: [PATCH 04/31] Add some debug statements --- src/tensors/treetransformers.jl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl index 110f75214..9276b95cf 100644 --- a/src/tensors/treetransformers.jl +++ b/src/tensors/treetransformers.jl @@ -120,6 +120,8 @@ function OuterTreeTransformer(transform, p, Vsrc, Vdst) matrix[row, col] = coeff end end + @debug("Created recoupling block for uncoupled: $uncoupled", + sz = size(matrix), sparsity = count(!iszero, matrix) / length(matrix)) return (matrix, structure_dst.fusiontreestructure[ids_dst], @@ -130,6 +132,11 @@ function OuterTreeTransformer(transform, p, Vsrc, Vdst) # and use round-robin strategy for multi-threading sort!(outer_data; by=_transformer_weight, rev=true) + @debug("TreeTransformer for $Vsrc to $Vdst via $p", + nblocks = length(outer_data), + sz_median = size(outer_data[end ÷ 2][1], 1), + sz_max = size(outer_data[1][1], 1)) + return OuterTreeTransformer(outer_data) end From bf90fe5e3f4f103355ab9000a8199d0645754253 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Tue, 1 Jul 2025 14:06:00 +0200 Subject: [PATCH 05/31] avoid resizing buffers --- src/tensors/indexmanipulations.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index 305abde0d..c269c0209 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -545,9 +545,11 @@ function add_transform_kernel!(tdst::TensorMap, α::Number, β::Number, backend::AbstractBackend...) - # preallocate buffers - for now overshooting the size by assuming maximal value - buffer1 = eltype(tsrc.data)[] - buffer2 = eltype(tsrc.data)[] + # preallocate buffers + basistransform, structures_dst, _ = first(transformer.data) + buffersize = size(basistransform, 1) * prod(structures_dst[1][1]) + buffer1 = similar(tsrc.data, buffersize) + buffer2 = similar(tdst.data, buffersize) # TODO: this could be multithreaded for (basistransform, structures_dst, structures_src) in transformer.data @@ -565,7 +567,6 @@ function add_transform_kernel!(tdst::TensorMap, blocksize = prod(sz_src) # Filling up a buffer with contiguous data - resize!(buffer1, blocksize * cols) # ensure large enough buffer_src = StridedView(buffer1, (blocksize, cols), (1, blocksize), 0) for (i, structure_src) in enumerate(structures_src) subblock_src = StridedView(tsrc.data, structure_src...) @@ -573,7 +574,6 @@ function add_transform_kernel!(tdst::TensorMap, end # Resummation into a second buffer using BLAS - resize!(buffer2, blocksize * rows) # ensure large enough buffer_dst = StridedView(buffer2, (blocksize, rows), (1, blocksize), 0) mul!(buffer_dst, buffer_src, basistransform) @@ -581,7 +581,7 @@ function add_transform_kernel!(tdst::TensorMap, for (i, structure_dst) in enumerate(structures_dst) subblock_dst = StridedView(tdst.data, structure_dst...) bufblock_dst = sreshape(@view(buffer_dst[:, i]), sz_src) - TO.tensoradd!(subblock_dst, bufblock_dst, p, false, One(), β) + TO.tensoradd!(subblock_dst, bufblock_dst, p, false, One(), β, backend...) end end return tdst From 0651c683551650b7eeea762ae39672e733ccc7bc Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Tue, 1 Jul 2025 14:11:04 +0200 Subject: [PATCH 06/31] correctly compute size of buffers --- src/tensors/indexmanipulations.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index c269c0209..c47b9af93 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -546,8 +546,9 @@ function add_transform_kernel!(tdst::TensorMap, β::Number, backend::AbstractBackend...) # preallocate buffers - basistransform, structures_dst, _ = first(transformer.data) - buffersize = size(basistransform, 1) * prod(structures_dst[1][1]) + buffersize = maximum(transformer.data) do (_, structures_dst, _) + return prod(structures_dst[1][1]) + end buffer1 = similar(tsrc.data, buffersize) buffer2 = similar(tdst.data, buffersize) From a211b1b38fbb17095ad27d7fd8e8a1339e5b57a0 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Tue, 1 Jul 2025 14:15:46 +0200 Subject: [PATCH 07/31] Remove previous implementation --- src/tensors/indexmanipulations.jl | 40 +--------------- src/tensors/treetransformers.jl | 78 +++---------------------------- 2 files changed, 8 insertions(+), 110 deletions(-) diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index c47b9af93..538191534 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -500,48 +500,10 @@ function add_transform_kernel!(tdst::TensorMap, return nothing end -function add_transform_kernel!(tdst::TensorMap, - tsrc::TensorMap, - (p₁, p₂)::Index2Tuple, - transformer::GenericTreeTransformer, - α::Number, - β::Number, - backend::AbstractBackend...) - structure_dst = transformer.structure_dst - structure_src = transformer.structure_src - - rows = rowvals(transformer.matrix) - vals = nonzeros(transformer.matrix) - - # TODO: this could be multithreaded - for j in axes(transformer.matrix, 2) - sz_dst, str_dst, offset_dst = structure_dst[j] - subblock_dst = StridedView(tdst.data, sz_dst, str_dst, offset_dst) - nzrows = nzrange(transformer.matrix, j) - - # treat first entry - sz_src, str_src, offset_src = structure_src[rows[first(nzrows)]] - subblock_src = StridedView(tsrc.data, sz_src, str_src, offset_src) - TO.tensoradd!(subblock_dst, subblock_src, (p₁, p₂), false, α * vals[first(nzrows)], - β, - backend...) - - # treat remaining entries - for i in @view(nzrows[2:end]) - sz_src, str_src, offset_src = structure_src[rows[i]] - subblock_src = StridedView(tsrc.data, sz_src, str_src, offset_src) - TO.tensoradd!(subblock_dst, subblock_src, (p₁, p₂), false, α * vals[i], One(), - backend...) - end - end - - return tdst -end - function add_transform_kernel!(tdst::TensorMap, tsrc::TensorMap, p::Index2Tuple, - transformer::OuterTreeTransformer, + transformer::GenericTreeTransformer, α::Number, β::Number, backend::AbstractBackend...) diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl index 9276b95cf..73e0b2d83 100644 --- a/src/tensors/treetransformers.jl +++ b/src/tensors/treetransformers.jl @@ -34,62 +34,10 @@ function AbelianTreeTransformer(transform, p, Vsrc, Vdst) end struct GenericTreeTransformer{T,N} <: TreeTransformer - matrix::SparseMatrixCSC{T,Int} - structure_dst::Vector{StridedStructure{N}} - structure_src::Vector{StridedStructure{N}} -end - -function treetransformertype(Vdst, Vsrc) - I = sectortype(Vdst) - I === Trivial && return TrivialTreeTransformer - - N = numind(Vdst) - F1 = fusiontreetype(I, numout(Vdst)) - F2 = fusiontreetype(I, numin(Vdst)) - F3 = fusiontreetype(I, numout(Vsrc)) - F4 = fusiontreetype(I, numin(Vsrc)) - - if FusionStyle(I) isa UniqueFusion - return AbelianTreeTransformer{sectorscalartype(I),I,N,F1,F2,F3,F4} - else - return GenericTreeTransformer{sectorscalartype(I),I,N,F1,F2,F3,F4} - end -end - -function TreeTransformer(transform::Function, Vsrc::HomSpace{S}, - Vdst::HomSpace{S}) where {S} - I = sectortype(Vdst) - I === Trivial && return TrivialTreeTransformer() - - structure_dst = fusionblockstructure(Vdst) - structure_src = fusionblockstructure(Vsrc) - - rows = Int[] - cols = Int[] - vals = sectorscalartype(sectortype(Vdst))[] - - for (row, (f1, f2)) in enumerate(structure_src.fusiontreelist) - for ((f3, f4), coeff) in transform(f1, f2) - col = structure_dst.fusiontreeindices[(f3, f4)] - push!(rows, row) - push!(cols, col) - push!(vals, coeff) - end - end - ldst = length(structure_dst.fusiontreelist) - lsrc = length(structure_src.fusiontreelist) - matrix = sparse(rows, cols, vals, ldst, lsrc) - - return GenericTreeTransformer(matrix, - structure_dst.fusiontreestructure, - structure_src.fusiontreestructure) -end - -struct OuterTreeTransformer{T,N} <: TreeTransformer data::Vector{Tuple{Matrix{T},Vector{StridedStructure{N}},Vector{StridedStructure{N}}}} end -function OuterTreeTransformer(transform, p, Vsrc, Vdst) +function GenericTreeTransformer(transform, p, Vsrc, Vdst) permute(Vsrc, p) == Vdst || throw(SpaceMismatch("Incompatible spaces for permuting.")) structure_dst = fusionblockstructure(Vdst) structure_src = fusionblockstructure(Vsrc) @@ -137,28 +85,21 @@ function OuterTreeTransformer(transform, p, Vsrc, Vdst) sz_median = size(outer_data[end ÷ 2][1], 1), sz_max = size(outer_data[1][1], 1)) - return OuterTreeTransformer(outer_data) + return GenericTreeTransformer(outer_data) end function _transformer_weight((matrix, structures_dst, structures_src)) return size(matrix, 1) * prod(structures_dst[1][1]) end -useouter() = true - function treetransformertype(Vdst, Vsrc) I = sectortype(Vdst) I === Trivial && return TrivialTreeTransformer T = sectorscalartype(I) N = numind(Vdst) - if useouter() - return FusionStyle(I) == UniqueFusion() ? AbelianTreeTransformer{T,N} : - OuterTreeTransformer{T,N} - else - return FusionStyle(I) == UniqueFusion() ? AbelianTreeTransformer{T,N} : - GenericTreeTransformer{T,N} - end + return FusionStyle(I) == UniqueFusion() ? AbelianTreeTransformer{T,N} : + GenericTreeTransformer{T,N} end function TreeTransformer(transform::Function, p, Vsrc::HomSpace{S}, @@ -168,14 +109,9 @@ function TreeTransformer(transform::Function, p, Vsrc::HomSpace{S}, I = sectortype(Vdst) I === Trivial && return TrivialTreeTransformer() - FusionStyle(I) == UniqueFusion() && - return AbelianTreeTransformer(transform, p, Vsrc, Vdst) - - if useouter() - return OuterTreeTransformer(transform, p, Vsrc, Vdst) - else - return GenericTreeTransformer(transform, p, Vsrc, Vdst) - end + return FusionStyle(I) == UniqueFusion() ? + AbelianTreeTransformer(transform, p, Vsrc, Vdst) : + GenericTreeTransformer(transform, p, Vsrc, Vdst) end # braid is special because it has levels From c60960b01b0429cbca5f13bdb2db7521931de9ac Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Tue, 1 Jul 2025 14:30:25 +0200 Subject: [PATCH 08/31] refactor implementation --- src/tensors/indexmanipulations.jl | 67 +++++++++++++++++++------------ 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index 538191534..284ad60b2 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -515,39 +515,54 @@ function add_transform_kernel!(tdst::TensorMap, buffer2 = similar(tdst.data, buffersize) # TODO: this could be multithreaded - for (basistransform, structures_dst, structures_src) in transformer.data + for subtransformer in transformer.data # Special case without intermediate buffers whenever there is only a single block - if length(basistransform) == 1 - subblock_dst = StridedView(tdst.data, only(structures_dst)...) - subblock_src = StridedView(tsrc.data, only(structures_src)...) - TO.tensoradd!(subblock_dst, subblock_src, p, false, α * only(basistransform), β, - backend...) - continue + if length(subtransformer[1]) == 1 + _add_transform_single!(tdst, tsrc, p, α, β, subtransformer, backend...) + else + _add_transform_multi!(tdst, tsrc, p, α, β, subtransformer, + (buffer1, buffer2), backend...) end + end + return tdst +end - rows, cols = size(basistransform) - sz_src = first(first(structures_src)) - blocksize = prod(sz_src) +function _add_transform_single!(tdst, tsrc, p, α, β, + (basistransform, structures_dst, structures_src), + backend...) + subblock_dst = StridedView(tdst.data, only(structures_dst)...) + subblock_src = StridedView(tsrc.data, only(structures_src)...) + TO.tensoradd!(subblock_dst, subblock_src, p, false, α * only(basistransform), β, + backend...) + return nothing +end - # Filling up a buffer with contiguous data - buffer_src = StridedView(buffer1, (blocksize, cols), (1, blocksize), 0) - for (i, structure_src) in enumerate(structures_src) - subblock_src = StridedView(tsrc.data, structure_src...) - copyto!(@view(buffer_src[:, i]), subblock_src) - end +function _add_transform_multi!(tdst, tsrc, p, α, β, + (basistransform, structures_dst, structures_src), + (buffer1, buffer2), backend...) + rows, cols = size(basistransform) + sz_src = first(first(structures_src)) + blocksize = prod(sz_src) - # Resummation into a second buffer using BLAS - buffer_dst = StridedView(buffer2, (blocksize, rows), (1, blocksize), 0) - mul!(buffer_dst, buffer_src, basistransform) + # Filling up a buffer with contiguous data + buffer_src = StridedView(buffer1, (blocksize, cols), (1, blocksize), 0) + for (i, structure_src) in enumerate(structures_src) + subblock_src = StridedView(tsrc.data, structure_src...) + copyto!(@view(buffer_src[:, i]), subblock_src) + end - # Filling up the output - for (i, structure_dst) in enumerate(structures_dst) - subblock_dst = StridedView(tdst.data, structure_dst...) - bufblock_dst = sreshape(@view(buffer_dst[:, i]), sz_src) - TO.tensoradd!(subblock_dst, bufblock_dst, p, false, One(), β, backend...) - end + # Resummation into a second buffer using BLAS + buffer_dst = StridedView(buffer2, (blocksize, rows), (1, blocksize), 0) + mul!(buffer_dst, buffer_src, basistransform, α) + + # Filling up the output + for (i, structure_dst) in enumerate(structures_dst) + subblock_dst = StridedView(tdst.data, structure_dst...) + bufblock_dst = sreshape(@view(buffer_dst[:, i]), sz_src) + TO.tensoradd!(subblock_dst, bufblock_dst, p, false, One(), β, backend...) end - return tdst + + return nothing end function add_transform_kernel!(tdst::AbstractTensorMap, From 227d7bd97c09dd960a494c8e9e42366185e06a3d Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Tue, 1 Jul 2025 14:35:02 +0200 Subject: [PATCH 09/31] reuse code --- src/tensors/indexmanipulations.jl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index 284ad60b2..99f3ad32a 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -491,10 +491,8 @@ function add_transform_kernel!(tdst::TensorMap, β::Number, backend::AbstractBackend...) # TODO: this could be multithreaded - for (stridestructure_dst, stridestructure_src, coeff) in transformer.data - subblock_dst = StridedView(tdst.data, stridestructure_dst...) - subblock_src = StridedView(tsrc.data, stridestructure_src...) - TO.tensoradd!(subblock_dst, subblock_src, p, false, α * coeff, β, backend...) + for subtransformer in transformer.data + _add_transform_single!(tdst, tsrc, p, α, β, subtransformer, backend...) end return nothing @@ -530,10 +528,12 @@ end function _add_transform_single!(tdst, tsrc, p, α, β, (basistransform, structures_dst, structures_src), backend...) - subblock_dst = StridedView(tdst.data, only(structures_dst)...) - subblock_src = StridedView(tsrc.data, only(structures_src)...) - TO.tensoradd!(subblock_dst, subblock_src, p, false, α * only(basistransform), β, - backend...) + structure_dst = structures_dst isa Vector ? only(structures_dst) : structures_dst + structure_src = structures_src isa Vector ? only(structures_src) : structures_src + coeff = basistransform isa Number ? basistransform : only(basistransform) + subblock_dst = StridedView(tdst.data, structure_dst...) + subblock_src = StridedView(tsrc.data, structure_src...) + TO.tensoradd!(subblock_dst, subblock_src, p, false, α * coeff, β, backend...) return nothing end From ca81bc1488bec315ce184d07f204314b07fd5c73 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Tue, 1 Jul 2025 14:37:02 +0200 Subject: [PATCH 10/31] remove SparseArrays dependency --- Project.toml | 2 -- src/TensorKit.jl | 2 -- 2 files changed, 4 deletions(-) diff --git a/Project.toml b/Project.toml index b148cea6a..e7ff1722c 100644 --- a/Project.toml +++ b/Project.toml @@ -8,7 +8,6 @@ LRUCache = "8ac3fa9e-de4c-5943-b1dc-09c6b5f20637" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" PackageExtensionCompat = "65ce6f38-6b18-4e1d-a461-8949797d7930" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Strided = "5e0ebb24-38b0-5f93-81fe-25c709ecae67" TensorKitSectors = "13a9c161-d5da-41f0-bcbd-e1a08ae0647f" TensorOperations = "6aa20fa7-93e2-5fca-9bc0-fbd0db3c71a2" @@ -33,7 +32,6 @@ LRUCache = "1.0.2" LinearAlgebra = "1" PackageExtensionCompat = "1" Random = "1" -SparseArrays = "1" Strided = "2" TensorKitSectors = "0.1" TensorOperations = "5.1" diff --git a/src/TensorKit.jl b/src/TensorKit.jl index d94245530..b338791a3 100644 --- a/src/TensorKit.jl +++ b/src/TensorKit.jl @@ -126,8 +126,6 @@ using LinearAlgebra: norm, dot, normalize, normalize!, tr, isposdef, isposdef!, ishermitian, rank, cond, Diagonal, Hermitian -using SparseArrays: SparseMatrixCSC, sparse, nzrange, rowvals, nonzeros - import Base.Meta using Random: Random, rand!, randn! From b992e70ad56bf8c97fd70d84eff04b5405afb0d2 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Tue, 1 Jul 2025 17:07:12 +0200 Subject: [PATCH 11/31] Refactor and add multithreading --- src/TensorKit.jl | 17 +++++ src/tensors/indexmanipulations.jl | 116 +++++++++++++++++++++++------- src/tensors/treetransformers.jl | 6 +- 3 files changed, 112 insertions(+), 27 deletions(-) diff --git a/src/TensorKit.jl b/src/TensorKit.jl index b338791a3..9fa6237fb 100644 --- a/src/TensorKit.jl +++ b/src/TensorKit.jl @@ -183,6 +183,21 @@ include("fusiontrees/fusiontrees.jl") #------------------------------------------- include("spaces/vectorspaces.jl") +# Multithreading settings +#------------------------- +const TRANSFORMER_THREADS = Ref(1) + +get_num_transformer_threads() = TRANSFORMER_THREADS[] + +function set_num_transformer_threads(n::Int) + N = Base.Threads.nthreads() + if n > N + n = N + Strided._set_num_threads_warn(n) + end + return TRANSFORMER_THREADS[] = n +end + # Definitions and methods for tensors #------------------------------------- # general definitions @@ -218,6 +233,8 @@ include("auxiliary/deprecate.jl") # ---------- function __init__() @require_extensions + set_num_transformer_threads(Threads.nthreads()) + return nothing end end diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index 99f3ad32a..f513bc8ce 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -473,38 +473,68 @@ function add_transform!(tdst::AbstractTensorMap, return tdst end -function add_transform_kernel!(tdst::TensorMap, - tsrc::TensorMap, - (p₁, p₂)::Index2Tuple, - ::TrivialTreeTransformer, - α::Number, - β::Number, - backend::AbstractBackend...) - return TO.tensoradd!(tdst[], tsrc[], (p₁, p₂), false, α, β, backend...) +function use_threaded_transform(t::TensorMap, transformer::TreeTransformer) + # TODO: heuristic for not threading over small tensors + return get_num_transformer_threads() > 1 end function add_transform_kernel!(tdst::TensorMap, tsrc::TensorMap, p::Index2Tuple, - transformer::AbelianTreeTransformer, + transformer::TreeTransformer, α::Number, β::Number, backend::AbstractBackend...) - # TODO: this could be multithreaded + if use_threaded_transform(tsrc, transformer) + _add_transform_threaded!(tdst, tsrc, p, transformer, α, β, backend...) + else + _add_transform_nonthreaded!(tdst, tsrc, p, transformer, α, β, backend...) + end + + return nothing +end + +# Trivial implementation +# ---------------------- +# Hijack before threading is used +function add_transform_kernel!(tdst::TensorMap, tsrc::TensorMap, (p₁, p₂)::Index2Tuple, + ::TrivialTreeTransformer, + α::Number, β::Number, backend::AbstractBackend...) + TO.tensoradd!(tdst[], tsrc[], (p₁, p₂), false, α, β, backend...) + return nothing +end + +# Abelian implementations +# ----------------------- +function _add_transform_nonthreaded!(tdst, tsrc, p, transformer::AbelianTreeTransformer, + α, β, backend...) for subtransformer in transformer.data - _add_transform_single!(tdst, tsrc, p, α, β, subtransformer, backend...) + _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend...) end + return nothing +end +function _add_transform_threaded!(tdst, tsrc, p, transformer::AbelianTreeTransformer, α, β, + backend...; ntasks::Int=get_num_transformer_threads()) + nblocks = length(transformer.data) + counter = Threads.Atomic{Int}(1) + Threads.@sync for _ in 1:min(ntasks, nblocks) + Threads.@spawn begin + while true + local_counter = Threads.atomic_add!(counter, 1) + local_counter > nblocks && break + @inbounds subtransformer = transformer.data[local_counter] + _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend...) + end + end + end return nothing end -function add_transform_kernel!(tdst::TensorMap, - tsrc::TensorMap, - p::Index2Tuple, - transformer::GenericTreeTransformer, - α::Number, - β::Number, - backend::AbstractBackend...) +# Non-abelian implementations +# --------------------------- +function _add_transform_nonthreaded!(tdst, tsrc, p, transformer::GenericTreeTransformer, + α, β, backend...) # preallocate buffers buffersize = maximum(transformer.data) do (_, structures_dst, _) return prod(structures_dst[1][1]) @@ -522,24 +552,59 @@ function add_transform_kernel!(tdst::TensorMap, (buffer1, buffer2), backend...) end end - return tdst + return nothing end -function _add_transform_single!(tdst, tsrc, p, α, β, +function _add_transform_threaded!(tdst, tsrc, p, transformer::GenericTreeTransformer, + α, β, backend...; + ntasks::Int=get_num_transformer_threads()) + buffersize = maximum(transformer.data) do (_, structures_dst, _) + return prod(structures_dst[1][1]) + end + nblocks = length(transformer.data) + + counter = Threads.Atomic{Int}(1) + Threads.@sync for _ in 1:min(ntasks, nblocks) + Threads.@spawn begin + # preallocate buffers for each task + buffer1 = similar(tsrc.data, buffersize) + buffer2 = similar(tdst.data, buffersize) + + while true + local_counter = Threads.atomic_add!(counter, 1) + local_counter > nblocks && break + @inbounds subtransformer = transformer.data[local_counter] + if length(subtransformer[1]) == 1 + _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend...) + else + _add_transform_multi!(tdst, tsrc, p, subtransformer, (buffer1, buffer2), + α, β, backend...) + end + end + end + end + + return nothing +end + +# Kernels +# ------- +function _add_transform_single!(tdst, tsrc, p, (basistransform, structures_dst, structures_src), - backend...) + α, β, backend...) structure_dst = structures_dst isa Vector ? only(structures_dst) : structures_dst structure_src = structures_src isa Vector ? only(structures_src) : structures_src coeff = basistransform isa Number ? basistransform : only(basistransform) + subblock_dst = StridedView(tdst.data, structure_dst...) subblock_src = StridedView(tsrc.data, structure_src...) TO.tensoradd!(subblock_dst, subblock_src, p, false, α * coeff, β, backend...) return nothing end -function _add_transform_multi!(tdst, tsrc, p, α, β, +function _add_transform_multi!(tdst, tsrc, p, (basistransform, structures_dst, structures_src), - (buffer1, buffer2), backend...) + (buffer1, buffer2), α, β, backend...) rows, cols = size(basistransform) sz_src = first(first(structures_src)) blocksize = prod(sz_src) @@ -553,7 +618,7 @@ function _add_transform_multi!(tdst, tsrc, p, α, β, # Resummation into a second buffer using BLAS buffer_dst = StridedView(buffer2, (blocksize, rows), (1, blocksize), 0) - mul!(buffer_dst, buffer_src, basistransform, α) + mul!(buffer_dst, buffer_src, basistransform, α, Zero()) # Filling up the output for (i, structure_dst) in enumerate(structures_dst) @@ -565,6 +630,9 @@ function _add_transform_multi!(tdst, tsrc, p, α, β, return nothing end +# Other implementations +# --------------------- + function add_transform_kernel!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, (p₁, p₂)::Index2Tuple, diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl index 73e0b2d83..58d453d91 100644 --- a/src/tensors/treetransformers.jl +++ b/src/tensors/treetransformers.jl @@ -8,7 +8,7 @@ abstract type TreeTransformer end struct TrivialTreeTransformer <: TreeTransformer end struct AbelianTreeTransformer{T,N} <: TreeTransformer - data::Vector{Tuple{StridedStructure{N},StridedStructure{N},T}} + data::Vector{Tuple{T,StridedStructure{N},StridedStructure{N}}} end function AbelianTreeTransformer(transform, p, Vsrc, Vdst) @@ -19,7 +19,7 @@ function AbelianTreeTransformer(transform, p, Vsrc, Vdst) L = length(structure_src.fusiontreelist) T = sectorscalartype(sectortype(Vdst)) N = numind(Vsrc) - data = Vector{Tuple{StridedStructure{N},StridedStructure{N},T}}(undef, L) + data = Vector{Tuple{T,StridedStructure{N},StridedStructure{N}}}(undef, L) for i in 1:L f₁, f₂ = structure_src.fusiontreelist[i] @@ -27,7 +27,7 @@ function AbelianTreeTransformer(transform, p, Vsrc, Vdst) j = structure_dst.fusiontreeindices[(f₃, f₄)] stridestructure_dst = structure_dst.fusiontreestructure[j] stridestructure_src = structure_src.fusiontreestructure[i] - data[i] = (stridestructure_dst, stridestructure_src, coeff) + data[i] = (coeff, stridestructure_dst, stridestructure_src) end return AbelianTreeTransformer(data) From f1a641723509c7951b860604ec576129dda4a58d Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Tue, 1 Jul 2025 17:28:05 +0200 Subject: [PATCH 12/31] Refactor buffer allocation --- src/tensors/indexmanipulations.jl | 21 +++++++-------------- src/tensors/treetransformers.jl | 12 ++++++++++++ 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index f513bc8ce..30baf9ecf 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -536,11 +536,7 @@ end function _add_transform_nonthreaded!(tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend...) # preallocate buffers - buffersize = maximum(transformer.data) do (_, structures_dst, _) - return prod(structures_dst[1][1]) - end - buffer1 = similar(tsrc.data, buffersize) - buffer2 = similar(tdst.data, buffersize) + buffers = allocate_buffers(tdst, tsrc, transformer) # TODO: this could be multithreaded for subtransformer in transformer.data @@ -549,7 +545,7 @@ function _add_transform_nonthreaded!(tdst, tsrc, p, transformer::GenericTreeTran _add_transform_single!(tdst, tsrc, p, α, β, subtransformer, backend...) else _add_transform_multi!(tdst, tsrc, p, α, β, subtransformer, - (buffer1, buffer2), backend...) + buffers, backend...) end end return nothing @@ -558,17 +554,14 @@ end function _add_transform_threaded!(tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend...; ntasks::Int=get_num_transformer_threads()) - buffersize = maximum(transformer.data) do (_, structures_dst, _) - return prod(structures_dst[1][1]) - end + buffersz = buffersize(transformer) nblocks = length(transformer.data) counter = Threads.Atomic{Int}(1) Threads.@sync for _ in 1:min(ntasks, nblocks) Threads.@spawn begin # preallocate buffers for each task - buffer1 = similar(tsrc.data, buffersize) - buffer2 = similar(tdst.data, buffersize) + buffers = allocate_buffers(tdst, tsrc, transformer) while true local_counter = Threads.atomic_add!(counter, 1) @@ -577,7 +570,7 @@ function _add_transform_threaded!(tdst, tsrc, p, transformer::GenericTreeTransfo if length(subtransformer[1]) == 1 _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend...) else - _add_transform_multi!(tdst, tsrc, p, subtransformer, (buffer1, buffer2), + _add_transform_multi!(tdst, tsrc, p, subtransformer, buffers, α, β, backend...) end end @@ -610,14 +603,14 @@ function _add_transform_multi!(tdst, tsrc, p, blocksize = prod(sz_src) # Filling up a buffer with contiguous data - buffer_src = StridedView(buffer1, (blocksize, cols), (1, blocksize), 0) + buffer_src = StridedView(buffer2, (blocksize, cols), (1, blocksize), 0) for (i, structure_src) in enumerate(structures_src) subblock_src = StridedView(tsrc.data, structure_src...) copyto!(@view(buffer_src[:, i]), subblock_src) end # Resummation into a second buffer using BLAS - buffer_dst = StridedView(buffer2, (blocksize, rows), (1, blocksize), 0) + buffer_dst = StridedView(buffer1, (blocksize, rows), (1, blocksize), 0) mul!(buffer_dst, buffer_src, basistransform, α, Zero()) # Filling up the output diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl index 58d453d91..288b43f87 100644 --- a/src/tensors/treetransformers.jl +++ b/src/tensors/treetransformers.jl @@ -92,6 +92,18 @@ function _transformer_weight((matrix, structures_dst, structures_src)) return size(matrix, 1) * prod(structures_dst[1][1]) end +function buffersize(transformer::GenericTreeTransformer) + return maximum(transformer.data) do (basistransform, structures_dst, _) + return prod(structures_dst[1][1]) * max(size(basistransform)...) + end +end + +function allocate_buffers(tdst::TensorMap, tsrc::TensorMap, + transformer::GenericTreeTransformer) + sz = buffersize(transformer) + return similar(tdst.data, sz), similar(tsrc.data, sz) +end + function treetransformertype(Vdst, Vsrc) I = sectortype(Vdst) I === Trivial && return TrivialTreeTransformer From 9652268d71746c8704590038bd4b7a5f257f305d Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Tue, 1 Jul 2025 20:50:34 +0200 Subject: [PATCH 13/31] Consistent argument order --- src/tensors/treetransformers.jl | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl index 288b43f87..0af04caed 100644 --- a/src/tensors/treetransformers.jl +++ b/src/tensors/treetransformers.jl @@ -11,7 +11,7 @@ struct AbelianTreeTransformer{T,N} <: TreeTransformer data::Vector{Tuple{T,StridedStructure{N},StridedStructure{N}}} end -function AbelianTreeTransformer(transform, p, Vsrc, Vdst) +function AbelianTreeTransformer(transform, p, Vdst, Vsrc) permute(Vsrc, p) == Vdst || throw(SpaceMismatch("Incompatible spaces for permuting.")) structure_dst = fusionblockstructure(Vdst) structure_src = fusionblockstructure(Vsrc) @@ -37,7 +37,7 @@ struct GenericTreeTransformer{T,N} <: TreeTransformer data::Vector{Tuple{Matrix{T},Vector{StridedStructure{N}},Vector{StridedStructure{N}}}} end -function GenericTreeTransformer(transform, p, Vsrc, Vdst) +function GenericTreeTransformer(transform, p, Vdst, Vsrc) permute(Vsrc, p) == Vdst || throw(SpaceMismatch("Incompatible spaces for permuting.")) structure_dst = fusionblockstructure(Vdst) structure_src = fusionblockstructure(Vsrc) @@ -114,16 +114,17 @@ function treetransformertype(Vdst, Vsrc) GenericTreeTransformer{T,N} end -function TreeTransformer(transform::Function, p, Vsrc::HomSpace{S}, - Vdst::HomSpace{S}) where {S} - permute(Vsrc, p) == Vdst || throw(SpaceMismatch("Incompatible spaces for permuting.")) +function TreeTransformer(transform::Function, p, Vdst::HomSpace{S}, + Vsrc::HomSpace{S}) where {S} + permute(Vsrc, p) == Vdst || + throw(SpaceMismatch("Incompatible spaces for permuting")) I = sectortype(Vdst) I === Trivial && return TrivialTreeTransformer() return FusionStyle(I) == UniqueFusion() ? - AbelianTreeTransformer(transform, p, Vsrc, Vdst) : - GenericTreeTransformer(transform, p, Vsrc, Vdst) + AbelianTreeTransformer(transform, p, Vdst, Vsrc) : + GenericTreeTransformer(transform, p, Vdst, Vsrc) end # braid is special because it has levels @@ -151,7 +152,7 @@ for (transform, treetransformer) in @cached function $treetransformer(Vdst::TensorMapSpace, Vsrc::TensorMapSpace, p::Index2Tuple)::treetransformertype(Vdst, Vsrc) fusiontreetransform(f1, f2) = $transform(f1, f2, p...) - return TreeTransformer(fusiontreetransform, p, Vsrc, Vdst) + return TreeTransformer(fusiontreetransform, p, Vdst, Vsrc) end end end From 3889d0545d92bdeb59c38a4abf85a8f7f2c88706 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Tue, 1 Jul 2025 20:50:46 +0200 Subject: [PATCH 14/31] Small fixes --- src/tensors/indexmanipulations.jl | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index 30baf9ecf..559cb0038 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -542,10 +542,9 @@ function _add_transform_nonthreaded!(tdst, tsrc, p, transformer::GenericTreeTran for subtransformer in transformer.data # Special case without intermediate buffers whenever there is only a single block if length(subtransformer[1]) == 1 - _add_transform_single!(tdst, tsrc, p, α, β, subtransformer, backend...) + _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend...) else - _add_transform_multi!(tdst, tsrc, p, α, β, subtransformer, - buffers, backend...) + _add_transform_multi!(tdst, tsrc, p, subtransformer, buffers, α, β, backend...) end end return nothing @@ -554,7 +553,6 @@ end function _add_transform_threaded!(tdst, tsrc, p, transformer::GenericTreeTransformer, α, β, backend...; ntasks::Int=get_num_transformer_threads()) - buffersz = buffersize(transformer) nblocks = length(transformer.data) counter = Threads.Atomic{Int}(1) From f5cd7bfe4c6f57b682a9b1a06baad2eb2fdd2a1e Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Wed, 2 Jul 2025 11:06:02 +0200 Subject: [PATCH 15/31] improve type stability --- src/tensors/treetransformers.jl | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl index 0af04caed..50fc7e6d0 100644 --- a/src/tensors/treetransformers.jl +++ b/src/tensors/treetransformers.jl @@ -52,7 +52,14 @@ function GenericTreeTransformer(transform, p, Vdst, Vsrc) return TupleTools.vcat(f₁.uncoupled, f₂.uncoupled) end - outer_data = map(uncoupleds_src_unique) do uncoupled + T = sectorscalartype(I) + N = numind(Vdst) + L = length(uncoupleds_src_unique) + TStrided = StridedStructure{N} + data = Vector{Tuple{Matrix{T},Vector{TStrided},Vector{TStrided}}}(undef, L) + + # TODO: this can be multithreaded + for (i, uncoupled) in enumerate(uncoupleds_src_unique) ids_src = findall(==(uncoupled), uncoupleds_src) fusiontrees_outer_src = structure_src.fusiontreelist[ids_src] @@ -71,21 +78,21 @@ function GenericTreeTransformer(transform, p, Vdst, Vsrc) @debug("Created recoupling block for uncoupled: $uncoupled", sz = size(matrix), sparsity = count(!iszero, matrix) / length(matrix)) - return (matrix, - structure_dst.fusiontreestructure[ids_dst], - structure_src.fusiontreestructure[ids_src]) + data[i] = (matrix, + structure_dst.fusiontreestructure[ids_dst], + structure_src.fusiontreestructure[ids_src]) end # sort by (approximate) weight to make the buffers happy # and use round-robin strategy for multi-threading - sort!(outer_data; by=_transformer_weight, rev=true) + sort!(data; by=_transformer_weight, rev=true) @debug("TreeTransformer for $Vsrc to $Vdst via $p", - nblocks = length(outer_data), - sz_median = size(outer_data[end ÷ 2][1], 1), - sz_max = size(outer_data[1][1], 1)) + nblocks = length(data), + sz_median = size(data[end ÷ 2][1], 1), + sz_max = size(data[1][1], 1)) - return GenericTreeTransformer(outer_data) + return GenericTreeTransformer{T,N}(data) end function _transformer_weight((matrix, structures_dst, structures_src)) From 0b2210477787c43299b7ca36df52a5c045efaccd Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Wed, 2 Jul 2025 14:51:12 +0200 Subject: [PATCH 16/31] Add multithreading heuristic --- src/tensors/indexmanipulations.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index 559cb0038..ec2bab6a4 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -474,8 +474,7 @@ function add_transform!(tdst::AbstractTensorMap, end function use_threaded_transform(t::TensorMap, transformer::TreeTransformer) - # TODO: heuristic for not threading over small tensors - return get_num_transformer_threads() > 1 + return get_num_transformer_threads() > 1 && length(t.data) > Strided.MINTHREADLENGTH end function add_transform_kernel!(tdst::TensorMap, From 00bfd048901d51f654c7634fe10f2b359a3c05e6 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Wed, 2 Jul 2025 14:52:12 +0200 Subject: [PATCH 17/31] Update benchmarks to include non-abelian symmetries --- .../TensorKitBenchmarks/TensorKitBenchmarks.jl | 2 +- .../tensornetworks/TensorNetworkBenchmarks.jl | 9 +++------ .../tensornetworks/benchparams.toml | 18 ++++++++++++++++++ .../TensorKitBenchmarks/utils/BenchUtils.jl | 6 ++++-- 4 files changed, 26 insertions(+), 9 deletions(-) diff --git a/benchmark/TensorKitBenchmarks/TensorKitBenchmarks.jl b/benchmark/TensorKitBenchmarks/TensorKitBenchmarks.jl index e276c1704..f93fd9996 100644 --- a/benchmark/TensorKitBenchmarks/TensorKitBenchmarks.jl +++ b/benchmark/TensorKitBenchmarks/TensorKitBenchmarks.jl @@ -4,7 +4,7 @@ using BenchmarkTools using TensorKit using TOML -BenchmarkTools.DEFAULT_PARAMETERS.seconds = 1.0 +BenchmarkTools.DEFAULT_PARAMETERS.seconds = 20.0 BenchmarkTools.DEFAULT_PARAMETERS.samples = 10000 BenchmarkTools.DEFAULT_PARAMETERS.time_tolerance = 0.15 BenchmarkTools.DEFAULT_PARAMETERS.memory_tolerance = 0.01 diff --git a/benchmark/TensorKitBenchmarks/tensornetworks/TensorNetworkBenchmarks.jl b/benchmark/TensorKitBenchmarks/tensornetworks/TensorNetworkBenchmarks.jl index becf3935c..4fcd27b22 100644 --- a/benchmark/TensorKitBenchmarks/tensornetworks/TensorNetworkBenchmarks.jl +++ b/benchmark/TensorKitBenchmarks/tensornetworks/TensorNetworkBenchmarks.jl @@ -45,9 +45,8 @@ function benchmark_mpo!(bench; sigmas=nothing, T="Float64", I="Trivial", dims) end if haskey(all_parameters, "mpo") - g = addgroup!(SUITE, "mpo") for params in all_parameters["mpo"] - benchmark_mpo!(g, params) + benchmark_mpo!(SUITE, params) end end @@ -90,9 +89,8 @@ function benchmark_pepo!(bench; sigmas=nothing, T="Float64", I="Trivial", dims) end if haskey(all_parameters, "pepo") - g = addgroup!(SUITE, "pepo") for params in all_parameters["pepo"] - benchmark_pepo!(g, params) + benchmark_pepo!(SUITE, params) end end @@ -136,9 +134,8 @@ function benchmark_mera!(bench; sigmas=nothing, T="Float64", I="Trivial", dims) end if haskey(all_parameters, "mera") - g = addgroup!(SUITE, "mera") for params in all_parameters["mera"] - benchmark_mera!(g, params) + benchmark_mera!(SUITE, params) end end diff --git a/benchmark/TensorKitBenchmarks/tensornetworks/benchparams.toml b/benchmark/TensorKitBenchmarks/tensornetworks/benchparams.toml index 8d10eb109..da20145cd 100644 --- a/benchmark/TensorKitBenchmarks/tensornetworks/benchparams.toml +++ b/benchmark/TensorKitBenchmarks/tensornetworks/benchparams.toml @@ -19,6 +19,12 @@ I = "U1Irrep" dims = [[40, 5, 3], [160, 5, 3], [640, 5, 3], [2560, 5, 3], [6120, 5, 3], [200, 20, 20], [400, 20, 20], [400, 40, 40]] sigmas = [0.5, 0.5, 0.5] +[[mpo]] +T = ["Float64"] +I = "SU2Irrep" +dims = [[40, 5, 3], [160, 5, 3], [640, 5, 3], [2560, 5, 3], [6120, 5, 3], [200, 20, 20], [400, 20, 20], [400, 40, 40]] +sigmas = [2, 2, 2] + # PEPO # ---- # dims = [peps, pepo, phys, env] @@ -40,6 +46,12 @@ I = "U1Irrep" dims = [[4, 2, 2, 100], [4, 4, 4, 200], [6, 2, 2, 100], [6, 3, 4, 200], [8, 2, 2, 100], [8, 2, 4, 200], [10, 2, 2, 50], [10, 3, 2, 100]] sigmas = [0.5, 0.5, 0.5, 0.5] +[[pepo]] +T = ["Float64"] +I = "SU2Irrep" +dims = [[4, 2, 2, 100], [4, 4, 4, 200], [6, 2, 2, 100], [6, 3, 4, 200], [8, 2, 2, 100], [8, 2, 4, 200], [10, 2, 2, 50], [10, 3, 2, 100]] +sigmas = [2.0, 2.0, 2.0, 2.0] + # MERA # ---- # dims = mera @@ -60,3 +72,9 @@ T = ["Float64"] I = "U1Irrep" dims = [4, 8, 12, 16, 22, 28] sigmas = [0.5] + +[[mera]] +T = ["Float64"] +I = "SU2Irrep" +dims = [4, 8, 12, 16, 22, 28] +sigmas = [2.0] diff --git a/benchmark/TensorKitBenchmarks/utils/BenchUtils.jl b/benchmark/TensorKitBenchmarks/utils/BenchUtils.jl index cefb02c98..ce22c763a 100644 --- a/benchmark/TensorKitBenchmarks/utils/BenchUtils.jl +++ b/benchmark/TensorKitBenchmarks/utils/BenchUtils.jl @@ -55,13 +55,15 @@ function generate_space(::Type{U1Irrep}, D::Int, sigma::Real=0.5) return U1Space((s => d for (s, d) in zip(sectors, dims))...) end function generate_space(::Type{SU2Irrep}, D::Int, sigma::Real=0.5) - poisson_pdf(x) = ceil(Int, D * exp(-sigma) * sigma^x / factorial(x + 1)) + normal_pdf = let D = D + x -> D * exp(-0.5 * (x / sigma)^2) / (sigma * sqrt(2π)) + end sectors = SU2Irrep[] dims = Int[] for sector in values(SU2Irrep) - d = poisson_pdf(Int(sector.j * 2)) + d = ceil(Int, normal_pdf(sector.j) / dim(sector)) push!(sectors, sector) push!(dims, d) D -= d * dim(sector) From d90034bc276126be32cdf29323f58db846fea1f5 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Thu, 3 Jul 2025 13:40:43 +0200 Subject: [PATCH 18/31] Fix transformer weight --- src/tensors/treetransformers.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl index 50fc7e6d0..f0205cc6a 100644 --- a/src/tensors/treetransformers.jl +++ b/src/tensors/treetransformers.jl @@ -95,8 +95,11 @@ function GenericTreeTransformer(transform, p, Vdst, Vsrc) return GenericTreeTransformer{T,N}(data) end +# Cost model for transforming a set of subblocks with fixed uncoupled sectors: +# L x L x size(subblock) where L is the number of subblocks +# this is L input blocks each going to L output blocks of given size function _transformer_weight((matrix, structures_dst, structures_src)) - return size(matrix, 1) * prod(structures_dst[1][1]) + return length(matrix) * prod(structures_dst[1][1]) end function buffersize(transformer::GenericTreeTransformer) From c9080150ae8c74ebfab2d520a0f6d3c6ab7463d5 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Thu, 3 Jul 2025 13:40:53 +0200 Subject: [PATCH 19/31] Handle empty tensors --- src/tensors/treetransformers.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl index f0205cc6a..06966c592 100644 --- a/src/tensors/treetransformers.jl +++ b/src/tensors/treetransformers.jl @@ -103,7 +103,7 @@ function _transformer_weight((matrix, structures_dst, structures_src)) end function buffersize(transformer::GenericTreeTransformer) - return maximum(transformer.data) do (basistransform, structures_dst, _) + return maximum(transformer.data; init=0) do (basistransform, structures_dst, _) return prod(structures_dst[1][1]) * max(size(basistransform)...) end end From 9e1d2e898b6db3638ed0fefbdec4bbcf5a4c3e31 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Mon, 7 Jul 2025 12:11:26 -0400 Subject: [PATCH 20/31] update transformer weight --- src/tensors/treetransformers.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl index 06966c592..f921e8002 100644 --- a/src/tensors/treetransformers.jl +++ b/src/tensors/treetransformers.jl @@ -96,8 +96,10 @@ function GenericTreeTransformer(transform, p, Vdst, Vsrc) end # Cost model for transforming a set of subblocks with fixed uncoupled sectors: -# L x L x size(subblock) where L is the number of subblocks -# this is L input blocks each going to L output blocks of given size +# L x L x length(subblock) where L is the number of subblocks +# this is L input blocks each going to L output blocks of given length +# Note that it might be the case that the permutations are dominant, in which case the +# actual cost model would scale like L x length(subblock) function _transformer_weight((matrix, structures_dst, structures_src)) return length(matrix) * prod(structures_dst[1][1]) end From 368001a57642961fc63c1111787846867e413af7 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Mon, 7 Jul 2025 12:13:28 -0400 Subject: [PATCH 21/31] simplify buffersize --- src/tensors/treetransformers.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl index f921e8002..b1b9474b4 100644 --- a/src/tensors/treetransformers.jl +++ b/src/tensors/treetransformers.jl @@ -106,7 +106,7 @@ end function buffersize(transformer::GenericTreeTransformer) return maximum(transformer.data; init=0) do (basistransform, structures_dst, _) - return prod(structures_dst[1][1]) * max(size(basistransform)...) + return prod(structures_dst[1][1]) * size(basistransform, 1) end end From d07ca899c88feea222baaecd2f2668515e8519ab Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Mon, 7 Jul 2025 12:27:13 -0400 Subject: [PATCH 22/31] use type alias --- src/tensors/treetransformers.jl | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl index b1b9474b4..60684feb1 100644 --- a/src/tensors/treetransformers.jl +++ b/src/tensors/treetransformers.jl @@ -7,8 +7,10 @@ abstract type TreeTransformer end struct TrivialTreeTransformer <: TreeTransformer end +const _AbelianTransformerData{T,N} = Tuple{T,StridedStructure{N},StridedStructure{N}} + struct AbelianTreeTransformer{T,N} <: TreeTransformer - data::Vector{Tuple{T,StridedStructure{N},StridedStructure{N}}} + data::Vector{_AbelianTransformerData{T,N}} end function AbelianTreeTransformer(transform, p, Vdst, Vsrc) @@ -33,8 +35,11 @@ function AbelianTreeTransformer(transform, p, Vdst, Vsrc) return AbelianTreeTransformer(data) end +const _GenericTransformerData{T,N} = Tuple{Matrix{T},Vector{StridedStructure{N}}, + Vector{StridedStructure{N}}} + struct GenericTreeTransformer{T,N} <: TreeTransformer - data::Vector{Tuple{Matrix{T},Vector{StridedStructure{N}},Vector{StridedStructure{N}}}} + data::Vector{_GenericTransformerData{T,N}} end function GenericTreeTransformer(transform, p, Vdst, Vsrc) @@ -55,8 +60,7 @@ function GenericTreeTransformer(transform, p, Vdst, Vsrc) T = sectorscalartype(I) N = numind(Vdst) L = length(uncoupleds_src_unique) - TStrided = StridedStructure{N} - data = Vector{Tuple{Matrix{T},Vector{TStrided},Vector{TStrided}}}(undef, L) + data = Vector{_GenericTransformerData{T,N}}(undef, L) # TODO: this can be multithreaded for (i, uncoupled) in enumerate(uncoupleds_src_unique) From 84f04abbe81d81e850fe9a0de42326c6c7d2d2b1 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Mon, 7 Jul 2025 12:27:32 -0400 Subject: [PATCH 23/31] centralize sorting --- src/tensors/treetransformers.jl | 46 +++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl index 60684feb1..b112bee60 100644 --- a/src/tensors/treetransformers.jl +++ b/src/tensors/treetransformers.jl @@ -32,7 +32,12 @@ function AbelianTreeTransformer(transform, p, Vdst, Vsrc) data[i] = (coeff, stridestructure_dst, stridestructure_src) end - return AbelianTreeTransformer(data) + transformer = AbelianTreeTransformer(data) + + # sort by (approximate) weight to facilitate multi-threading strategies + # sort!(transformer) + + return transformer end const _GenericTransformerData{T,N} = Tuple{Matrix{T},Vector{StridedStructure{N}}, @@ -87,25 +92,17 @@ function GenericTreeTransformer(transform, p, Vdst, Vsrc) structure_src.fusiontreestructure[ids_src]) end - # sort by (approximate) weight to make the buffers happy - # and use round-robin strategy for multi-threading - sort!(data; by=_transformer_weight, rev=true) - @debug("TreeTransformer for $Vsrc to $Vdst via $p", nblocks = length(data), sz_median = size(data[end ÷ 2][1], 1), sz_max = size(data[1][1], 1)) - return GenericTreeTransformer{T,N}(data) -end + transformer = GenericTreeTransformer{T,N}(data) -# Cost model for transforming a set of subblocks with fixed uncoupled sectors: -# L x L x length(subblock) where L is the number of subblocks -# this is L input blocks each going to L output blocks of given length -# Note that it might be the case that the permutations are dominant, in which case the -# actual cost model would scale like L x length(subblock) -function _transformer_weight((matrix, structures_dst, structures_src)) - return length(matrix) * prod(structures_dst[1][1]) + # sort by (approximate) weight to facilitate multi-threading strategies + # sort!(transformer) + + return transformer end function buffersize(transformer::GenericTreeTransformer) @@ -174,3 +171,24 @@ for (transform, treetransformer) in end # default cachestyle is GlobalLRUCache + +# Sorting based on cost model +# --------------------------- +function Base.sort!(transformer::Union{AbelianTreeTransformer,GenericTreeTransformer}; + by=_transformer_weight, rev::Bool=true) + sort!(transformer.data; by, rev) + return transformer +end + +function _transformer_weight((coeff, struct_dst, struct_src)::_AbelianTransformerData) + return prod(struct_dst[1]) +end + +# Cost model for transforming a set of subblocks with fixed uncoupled sectors: +# L x L x length(subblock) where L is the number of subblocks +# this is L input blocks each going to L output blocks of given length +# Note that it might be the case that the permutations are dominant, in which case the +# actual cost model would scale like L x length(subblock) +function _transformer_weight((mat, structs_dst, structs_src)::_GenericTransformerData) + return length(mat) * prod(structs_dst[1][1]) +end From e141021652503fd785ba94317020c86774c9dbf5 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Mon, 7 Jul 2025 12:33:34 -0400 Subject: [PATCH 24/31] Add timing information to debug messages --- src/tensors/treetransformers.jl | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl index b112bee60..60df17e3f 100644 --- a/src/tensors/treetransformers.jl +++ b/src/tensors/treetransformers.jl @@ -14,6 +14,7 @@ struct AbelianTreeTransformer{T,N} <: TreeTransformer end function AbelianTreeTransformer(transform, p, Vdst, Vsrc) + t₀ = Base.time() permute(Vsrc, p) == Vdst || throw(SpaceMismatch("Incompatible spaces for permuting.")) structure_dst = fusionblockstructure(Vdst) structure_src = fusionblockstructure(Vsrc) @@ -37,6 +38,10 @@ function AbelianTreeTransformer(transform, p, Vdst, Vsrc) # sort by (approximate) weight to facilitate multi-threading strategies # sort!(transformer) + Δt = Base.time() - t₀ + + @debug("Treetransformer for $Vsrc to $Vdst via $p", nblocks = L, Δt) + return transformer end @@ -48,6 +53,7 @@ struct GenericTreeTransformer{T,N} <: TreeTransformer end function GenericTreeTransformer(transform, p, Vdst, Vsrc) + t₀ = Base.time() permute(Vsrc, p) == Vdst || throw(SpaceMismatch("Incompatible spaces for permuting.")) structure_dst = fusionblockstructure(Vdst) structure_src = fusionblockstructure(Vsrc) @@ -92,15 +98,18 @@ function GenericTreeTransformer(transform, p, Vdst, Vsrc) structure_src.fusiontreestructure[ids_src]) end - @debug("TreeTransformer for $Vsrc to $Vdst via $p", - nblocks = length(data), - sz_median = size(data[end ÷ 2][1], 1), - sz_max = size(data[1][1], 1)) - transformer = GenericTreeTransformer{T,N}(data) # sort by (approximate) weight to facilitate multi-threading strategies - # sort!(transformer) + sort!(transformer) + + Δt = Base.time() - t₀ + + @debug("TreeTransformer for $Vsrc to $Vdst via $p", + nblocks = length(data), + sz_median = size(data[end ÷ 2][1], 1), + sz_max = size(data[1][1], 1), + Δt) return transformer end From 0b13b7be32874699999ef21dc2b15a4636118475 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Mon, 7 Jul 2025 12:34:57 -0400 Subject: [PATCH 25/31] Disable threading by default --- src/TensorKit.jl | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/TensorKit.jl b/src/TensorKit.jl index 9fa6237fb..255d6ba3a 100644 --- a/src/TensorKit.jl +++ b/src/TensorKit.jl @@ -233,8 +233,6 @@ include("auxiliary/deprecate.jl") # ---------- function __init__() @require_extensions - set_num_transformer_threads(Threads.nthreads()) - return nothing end end From 722610df3f7cabd9ab89fa8af8763a7875a1e7a4 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Mon, 7 Jul 2025 12:58:28 -0400 Subject: [PATCH 26/31] Overhaul indexmanipulations to reuse multithreading criterion --- src/tensors/indexmanipulations.jl | 217 +++++++++++++----------------- 1 file changed, 96 insertions(+), 121 deletions(-) diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index ec2bab6a4..769e84a7d 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -453,68 +453,71 @@ end function add_transform!(tdst::AbstractTensorMap, tsrc::AbstractTensorMap, - (p₁, p₂)::Index2Tuple, + p::Index2Tuple, transformer, α::Number, β::Number, backend::AbstractBackend...) @boundscheck begin - permute(space(tsrc), (p₁, p₂)) == space(tdst) || + permute(space(tsrc), p) == space(tdst) || throw(SpaceMismatch("source = $(codomain(tsrc))←$(domain(tsrc)), - dest = $(codomain(tdst))←$(domain(tdst)), p₁ = $(p₁), p₂ = $(p₂)")) + dest = $(codomain(tdst))←$(domain(tdst)), p₁ = $(p[1]), p₂ = $(p[2])")) end - if p₁ === codomainind(tsrc) && p₂ === domainind(tsrc) + if p[1] === codomainind(tsrc) && p[2] === domainind(tsrc) add!(tdst, tsrc, α, β) else - add_transform_kernel!(tdst, tsrc, (p₁, p₂), transformer, α, β, backend...) + I = sectortype(tdst) + if I === Trivial + _add_trivial_kernel!(tdst, tsrc, p, transformer, α, β, backend...) + elseif FusionStyle(I) === UniqueFusion() + if use_threaded_transform(tdst, transformer) + _add_abelian_kernel_threaded!(tdst, tsrc, p, transformer, α, β, backend...) + else + _add_abelian_kernel_nonthreaded!(tdst, tsrc, p, transformer, α, β, + backend...) + end + else + if use_threaded_transform(tdst, transformer) + _add_general_kernel_threaded!(tdst, tsrc, p, transformer, α, β, backend...) + else + _add_general_kernel_nonthreaded!(tdst, tsrc, p, transformer, α, β, + backend...) + end + end end return tdst end -function use_threaded_transform(t::TensorMap, transformer::TreeTransformer) +function use_threaded_transform(t::TensorMap, transformer) return get_num_transformer_threads() > 1 && length(t.data) > Strided.MINTHREADLENGTH end - -function add_transform_kernel!(tdst::TensorMap, - tsrc::TensorMap, - p::Index2Tuple, - transformer::TreeTransformer, - α::Number, - β::Number, - backend::AbstractBackend...) - if use_threaded_transform(tsrc, transformer) - _add_transform_threaded!(tdst, tsrc, p, transformer, α, β, backend...) - else - _add_transform_nonthreaded!(tdst, tsrc, p, transformer, α, β, backend...) - end - - return nothing +function use_threaded_transform(t::AbstractTensorMap, transformer) + return get_num_transformer_threads() > 1 && dim(space(t)) > Strided.MINTHREADLENGTH end -# Trivial implementation -# ---------------------- -# Hijack before threading is used -function add_transform_kernel!(tdst::TensorMap, tsrc::TensorMap, (p₁, p₂)::Index2Tuple, - ::TrivialTreeTransformer, - α::Number, β::Number, backend::AbstractBackend...) - TO.tensoradd!(tdst[], tsrc[], (p₁, p₂), false, α, β, backend...) +# Trivial implementations +# ----------------------- +function _add_trivial_kernel!(tdst, tsrc, p, transformer, α, β, backend...) + TO.tensoradd!(tdst[], tsrc[], p, false, α, β, backend...) return nothing end # Abelian implementations # ----------------------- -function _add_transform_nonthreaded!(tdst, tsrc, p, transformer::AbelianTreeTransformer, - α, β, backend...) +function _add_abelian_kernel_nonthreaded!(tdst, tsrc, p, + transformer::AbelianTreeTransformer, + α, β, backend...) for subtransformer in transformer.data _add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend...) end return nothing end -function _add_transform_threaded!(tdst, tsrc, p, transformer::AbelianTreeTransformer, α, β, - backend...; ntasks::Int=get_num_transformer_threads()) +function _add_abelian_kernel_threaded!(tdst, tsrc, p, transformer::AbelianTreeTransformer, + α, β, backend...; + ntasks::Int=get_num_transformer_threads()) nblocks = length(transformer.data) counter = Threads.Atomic{Int}(1) Threads.@sync for _ in 1:min(ntasks, nblocks) @@ -530,14 +533,49 @@ function _add_transform_threaded!(tdst, tsrc, p, transformer::AbelianTreeTransfo return nothing end +function _add_transform_single!(tdst, tsrc, p, + (basistransform, structures_dst, structures_src), + α, β, backend...) + structure_dst = structures_dst isa Vector ? only(structures_dst) : structures_dst + structure_src = structures_src isa Vector ? only(structures_src) : structures_src + coeff = basistransform isa Number ? basistransform : only(basistransform) + + subblock_dst = StridedView(tdst.data, structure_dst...) + subblock_src = StridedView(tsrc.data, structure_src...) + TO.tensoradd!(subblock_dst, subblock_src, p, false, α * coeff, β, backend...) + return nothing +end + +function _add_abelian_kernel_nonthreaded!(tdst, tsrc, p, transformer, α, β, backend...) + for (f₁, f₂) in fusiontrees(tsrc) + _add_abelian_block!(tdst, tsrc, p, transformer, f₁, f₂, α, β, backend...) + end + return nothing +end + +function _add_abelian_kernel_threaded!(tdst, tsrc, p, transformer, α, β, backend...) + Threads.@sync for (f₁, f₂) in fusiontrees(tsrc) + Threads.@spawn _add_abelian_block!(tdst, tsrc, p, transformer, f₁, f₂, α, β, + backend...) + end + return nothing +end + +function _add_abelian_block!(tdst, tsrc, p, transformer, f₁, f₂, α, β, backend...) + (f₁′, f₂′), coeff = first(transformer(f₁, f₂)) + @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, + backend...) + return nothing +end + # Non-abelian implementations # --------------------------- -function _add_transform_nonthreaded!(tdst, tsrc, p, transformer::GenericTreeTransformer, - α, β, backend...) +function _add_general_kernel_nonthreaded!(tdst, tsrc, p, + transformer::GenericTreeTransformer, + α, β, backend...) # preallocate buffers buffers = allocate_buffers(tdst, tsrc, transformer) - # TODO: this could be multithreaded for subtransformer in transformer.data # Special case without intermediate buffers whenever there is only a single block if length(subtransformer[1]) == 1 @@ -549,9 +587,9 @@ function _add_transform_nonthreaded!(tdst, tsrc, p, transformer::GenericTreeTran return nothing end -function _add_transform_threaded!(tdst, tsrc, p, transformer::GenericTreeTransformer, - α, β, backend...; - ntasks::Int=get_num_transformer_threads()) +function _add_general_kernel_threaded!(tdst, tsrc, p, transformer::GenericTreeTransformer, + α, β, backend...; + ntasks::Int=get_num_transformer_threads()) nblocks = length(transformer.data) counter = Threads.Atomic{Int}(1) @@ -577,18 +615,18 @@ function _add_transform_threaded!(tdst, tsrc, p, transformer::GenericTreeTransfo return nothing end -# Kernels -# ------- -function _add_transform_single!(tdst, tsrc, p, - (basistransform, structures_dst, structures_src), - α, β, backend...) - structure_dst = structures_dst isa Vector ? only(structures_dst) : structures_dst - structure_src = structures_src isa Vector ? only(structures_src) : structures_src - coeff = basistransform isa Number ? basistransform : only(basistransform) - - subblock_dst = StridedView(tdst.data, structure_dst...) - subblock_src = StridedView(tsrc.data, structure_src...) - TO.tensoradd!(subblock_dst, subblock_src, p, false, α * coeff, β, backend...) +function _add_general_kernel_nonthreaded!(tdst, tsrc, p, transformer, α, β, backend...) + if iszero(β) + tdst = zerovector!(tdst) + elseif !isone(β) + tdst = scale!(tdst, β) + end + for (f₁, f₂) in fusiontrees(tsrc) + for ((f₁′, f₂′), coeff) in transformer(f₁, f₂) + @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, + One(), backend...) + end + end return nothing end @@ -620,88 +658,25 @@ function _add_transform_multi!(tdst, tsrc, p, return nothing end -# Other implementations -# --------------------- - -function add_transform_kernel!(tdst::AbstractTensorMap, - tsrc::AbstractTensorMap, - (p₁, p₂)::Index2Tuple, - fusiontreetransform::Function, - α::Number, - β::Number, - backend::AbstractBackend...) - I = sectortype(spacetype(tdst)) - - if I === Trivial - _add_trivial_kernel!(tdst, tsrc, (p₁, p₂), fusiontreetransform, α, β, backend...) - elseif FusionStyle(I) isa UniqueFusion - _add_abelian_kernel!(tdst, tsrc, (p₁, p₂), fusiontreetransform, α, β, backend...) - else - _add_general_kernel!(tdst, tsrc, (p₁, p₂), fusiontreetransform, α, β, backend...) - end - - return nothing -end - -# internal methods: no argument types -function _add_trivial_kernel!(tdst, tsrc, p, fusiontreetransform, α, β, backend...) - TO.tensoradd!(tdst[], tsrc[], p, false, α, β, backend...) - return nothing -end - -function _add_abelian_kernel!(tdst, tsrc, p, fusiontreetransform, α, β, backend...) - if Threads.nthreads() > 1 - Threads.@sync for (f₁, f₂) in fusiontrees(tsrc) - Threads.@spawn _add_abelian_block!(tdst, tsrc, p, fusiontreetransform, - f₁, f₂, α, β, backend...) - end - else - for (f₁, f₂) in fusiontrees(tsrc) - _add_abelian_block!(tdst, tsrc, p, fusiontreetransform, - f₁, f₂, α, β, backend...) - end - end - return nothing -end - -function _add_abelian_block!(tdst, tsrc, p, fusiontreetransform, f₁, f₂, α, β, backend...) - (f₁′, f₂′), coeff = first(fusiontreetransform(f₁, f₂)) - @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, - backend...) - return nothing -end - -function _add_general_kernel!(tdst, tsrc, p, fusiontreetransform, α, β, backend...) +function _add_general_kernel_threaded!(tdst, tsrc, p, transformer, α, β, backend...) if iszero(β) tdst = zerovector!(tdst) - elseif β != 1 + elseif !isone(β) tdst = scale!(tdst, β) end - β′ = One() - if Threads.nthreads() > 1 - Threads.@sync for s₁ in sectors(codomain(tsrc)), s₂ in sectors(domain(tsrc)) - Threads.@spawn _add_nonabelian_sector!(tdst, tsrc, p, fusiontreetransform, s₁, - s₂, α, β′, backend...) - end - else - for (f₁, f₂) in fusiontrees(tsrc) - for ((f₁′, f₂′), coeff) in fusiontreetransform(f₁, f₂) - @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, - β′, backend...) - end - end + Threads.@sync for s₁ in sectors(codomain(tsrc)), s₂ in sectors(domain(tsrc)) + Threads.@spawn _add_nonabelian_sector!(tdst, tsrc, p, transformer, s₁, s₂, α, + backend...) end return nothing end -# TODO: β argument is weird here because it has to be 1 -function _add_nonabelian_sector!(tdst, tsrc, p, fusiontreetransform, s₁, s₂, α, β, - backend...) +function _add_nonabelian_sector!(tdst, tsrc, p, fusiontreetransform, s₁, s₂, α, backend...) for (f₁, f₂) in fusiontrees(tsrc) (f₁.uncoupled == s₁ && f₂.uncoupled == s₂) || continue for ((f₁′, f₂′), coeff) in fusiontreetransform(f₁, f₂) - @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, β, - backend...) + @inbounds TO.tensoradd!(tdst[f₁′, f₂′], tsrc[f₁, f₂], p, false, α * coeff, + One(), backend...) end end return nothing From 99020108ed8def5f836701721567089304862569 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Mon, 7 Jul 2025 13:01:53 -0400 Subject: [PATCH 27/31] remove unnecessary `@view` --- src/tensors/indexmanipulations.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index 769e84a7d..8c94228dc 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -641,7 +641,7 @@ function _add_transform_multi!(tdst, tsrc, p, buffer_src = StridedView(buffer2, (blocksize, cols), (1, blocksize), 0) for (i, structure_src) in enumerate(structures_src) subblock_src = StridedView(tsrc.data, structure_src...) - copyto!(@view(buffer_src[:, i]), subblock_src) + copyto!(buffer_src[:, i], subblock_src) end # Resummation into a second buffer using BLAS @@ -651,7 +651,7 @@ function _add_transform_multi!(tdst, tsrc, p, # Filling up the output for (i, structure_dst) in enumerate(structures_dst) subblock_dst = StridedView(tdst.data, structure_dst...) - bufblock_dst = sreshape(@view(buffer_dst[:, i]), sz_src) + bufblock_dst = sreshape(buffer_dst[:, i], sz_src) TO.tensoradd!(subblock_dst, bufblock_dst, p, false, One(), β, backend...) end From 071a6b62bd10394c0461b2c6fbfe09f4c939e1a4 Mon Sep 17 00:00:00 2001 From: Lukas Devos Date: Mon, 7 Jul 2025 13:04:16 -0400 Subject: [PATCH 28/31] help strided implementation slightly --- src/tensors/indexmanipulations.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index 8c94228dc..d67d770a7 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -641,7 +641,7 @@ function _add_transform_multi!(tdst, tsrc, p, buffer_src = StridedView(buffer2, (blocksize, cols), (1, blocksize), 0) for (i, structure_src) in enumerate(structures_src) subblock_src = StridedView(tsrc.data, structure_src...) - copyto!(buffer_src[:, i], subblock_src) + copy!(sreshape(buffer_src[:, i], sz_src), subblock_src) end # Resummation into a second buffer using BLAS From 70caeb969a4df73f36fdfccf29066d01c004c137 Mon Sep 17 00:00:00 2001 From: lkdvos Date: Tue, 8 Jul 2025 12:39:06 -0400 Subject: [PATCH 29/31] Avoid duplicate storage of sizes --- src/tensors/indexmanipulations.jl | 37 +++++++++++++++++++------------ src/tensors/treetransformers.jl | 24 ++++++++++++++------ 2 files changed, 40 insertions(+), 21 deletions(-) diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index d67d770a7..60e92b830 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -534,14 +534,10 @@ function _add_abelian_kernel_threaded!(tdst, tsrc, p, transformer::AbelianTreeTr end function _add_transform_single!(tdst, tsrc, p, - (basistransform, structures_dst, structures_src), + (coeff, struct_dst, struct_src)::_AbelianTransformerData, α, β, backend...) - structure_dst = structures_dst isa Vector ? only(structures_dst) : structures_dst - structure_src = structures_src isa Vector ? only(structures_src) : structures_src - coeff = basistransform isa Number ? basistransform : only(basistransform) - - subblock_dst = StridedView(tdst.data, structure_dst...) - subblock_src = StridedView(tsrc.data, structure_src...) + subblock_dst = StridedView(tdst.data, struct_dst...) + subblock_src = StridedView(tsrc.data, struct_src...) TO.tensoradd!(subblock_dst, subblock_src, p, false, α * coeff, β, backend...) return nothing end @@ -630,18 +626,31 @@ function _add_general_kernel_nonthreaded!(tdst, tsrc, p, transformer, α, β, ba return nothing end +function _add_transform_single!(tdst, tsrc, p, + (basistransform, structs_dst, + structs_src)::_GenericTransformerData, + α, β, backend...) + struct_dst = (structs_dst[1], only(structs_dst[2])...) + struct_src = (structs_src[1], only(structs_src[2])...) + coeff = only(basistransform) + _add_transform_single!(tdst, tsrc, p, (coeff, struct_dst, struct_src), α, β, backend...) + return nothing +end + function _add_transform_multi!(tdst, tsrc, p, - (basistransform, structures_dst, structures_src), + (basistransform, (sz_dst, structs_dst), + (sz_src, structs_src)), (buffer1, buffer2), α, β, backend...) rows, cols = size(basistransform) - sz_src = first(first(structures_src)) blocksize = prod(sz_src) + matsize = (prod(TupleTools.getindices(sz_src, codomainind(tsrc))), + prod(TupleTools.getindices(sz_src, domainind(tsrc)))) # Filling up a buffer with contiguous data buffer_src = StridedView(buffer2, (blocksize, cols), (1, blocksize), 0) - for (i, structure_src) in enumerate(structures_src) - subblock_src = StridedView(tsrc.data, structure_src...) - copy!(sreshape(buffer_src[:, i], sz_src), subblock_src) + for (i, struct_src) in enumerate(structs_src) + subblock_src = sreshape(StridedView(tsrc.data, sz_src, struct_src...), matsize) + copyto!(buffer_src[:, i], subblock_src) end # Resummation into a second buffer using BLAS @@ -649,8 +658,8 @@ function _add_transform_multi!(tdst, tsrc, p, mul!(buffer_dst, buffer_src, basistransform, α, Zero()) # Filling up the output - for (i, structure_dst) in enumerate(structures_dst) - subblock_dst = StridedView(tdst.data, structure_dst...) + for (i, struct_dst) in enumerate(structs_dst) + subblock_dst = StridedView(tdst.data, sz_dst, struct_dst...) bufblock_dst = sreshape(buffer_dst[:, i], sz_src) TO.tensoradd!(subblock_dst, bufblock_dst, p, false, One(), β, backend...) end diff --git a/src/tensors/treetransformers.jl b/src/tensors/treetransformers.jl index 60df17e3f..7c349c885 100644 --- a/src/tensors/treetransformers.jl +++ b/src/tensors/treetransformers.jl @@ -45,8 +45,11 @@ function AbelianTreeTransformer(transform, p, Vdst, Vsrc) return transformer end -const _GenericTransformerData{T,N} = Tuple{Matrix{T},Vector{StridedStructure{N}}, - Vector{StridedStructure{N}}} +const _GenericTransformerData{T,N} = Tuple{Matrix{T}, + Tuple{NTuple{N,Int}, + Vector{Tuple{NTuple{N,Int},Int}}}, + Tuple{NTuple{N,Int}, + Vector{Tuple{NTuple{N,Int},Int}}}} struct GenericTreeTransformer{T,N} <: TreeTransformer data::Vector{_GenericTransformerData{T,N}} @@ -90,12 +93,19 @@ function GenericTreeTransformer(transform, p, Vdst, Vsrc) matrix[row, col] = coeff end end + + structs_src = structure_src.fusiontreestructure[ids_src] + sz_src = structs_src[1][1] + newstructs_src = map(x -> (x[2], x[3]), structs_src) + + structs_dst = structure_dst.fusiontreestructure[ids_dst] + sz_dst = structs_dst[1][1] + newstructs_dst = map(x -> (x[2], x[3]), structs_dst) + @debug("Created recoupling block for uncoupled: $uncoupled", sz = size(matrix), sparsity = count(!iszero, matrix) / length(matrix)) - data[i] = (matrix, - structure_dst.fusiontreestructure[ids_dst], - structure_src.fusiontreestructure[ids_src]) + data[i] = (matrix, (sz_dst, newstructs_dst), (sz_src, newstructs_src)) end transformer = GenericTreeTransformer{T,N}(data) @@ -116,7 +126,7 @@ end function buffersize(transformer::GenericTreeTransformer) return maximum(transformer.data; init=0) do (basistransform, structures_dst, _) - return prod(structures_dst[1][1]) * size(basistransform, 1) + return prod(structures_dst[1]) * size(basistransform, 1) end end @@ -199,5 +209,5 @@ end # Note that it might be the case that the permutations are dominant, in which case the # actual cost model would scale like L x length(subblock) function _transformer_weight((mat, structs_dst, structs_src)::_GenericTransformerData) - return length(mat) * prod(structs_dst[1][1]) + return length(mat) * prod(structs_dst[1]) end From 76b780514a541368312cca758a7c1e502de26de2 Mon Sep 17 00:00:00 2001 From: lkdvos Date: Tue, 8 Jul 2025 12:41:31 -0400 Subject: [PATCH 30/31] Hand-crafted `copyto!` specialization --- src/auxiliary/auxiliary.jl | 24 ++++++++++++++++++++++++ src/tensors/indexmanipulations.jl | 2 +- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/src/auxiliary/auxiliary.jl b/src/auxiliary/auxiliary.jl index bda29d554..cb90589e0 100644 --- a/src/auxiliary/auxiliary.jl +++ b/src/auxiliary/auxiliary.jl @@ -55,3 +55,27 @@ _interleave(::Tuple{}, ::Tuple{}) = () function _interleave(a::NTuple{N}, b::NTuple{N}) where {N} return (a[1], b[1], _interleave(tail(a), tail(b))...) end + +# Low-overhead implementation of `copyto!` for specific case of `stride(B, 1) < stride(B, 2)` +# used in indexmanipulations: avoids the overhead of Strided.jl +function _copyto!(A::StridedView{<:Any,1}, B::StridedView{<:Any,2}) + length(A) == length(B) || throw(DimensionMismatch()) + + Adata = parent(A) + Astr = stride(A, 1) + IA = A.offset + + Bdata = parent(B) + Bstr = strides(B) + + IB_1 = B.offset + @inbounds for _ in axes(B, 2) + IB = IB_1 + for _ in axes(B, 1) + Adata[IA += Astr] = Bdata[IB += Bstr[1]] + end + IB_1 += Bstr[2] + end + + return A +end diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl index 60e92b830..c4e2b9228 100644 --- a/src/tensors/indexmanipulations.jl +++ b/src/tensors/indexmanipulations.jl @@ -650,7 +650,7 @@ function _add_transform_multi!(tdst, tsrc, p, buffer_src = StridedView(buffer2, (blocksize, cols), (1, blocksize), 0) for (i, struct_src) in enumerate(structs_src) subblock_src = sreshape(StridedView(tsrc.data, sz_src, struct_src...), matsize) - copyto!(buffer_src[:, i], subblock_src) + _copyto!(buffer_src[:, i], subblock_src) end # Resummation into a second buffer using BLAS From 1016b5a31f3827950c90923f5270ee38f477f305 Mon Sep 17 00:00:00 2001 From: lkdvos Date: Tue, 8 Jul 2025 15:11:11 -0400 Subject: [PATCH 31/31] Bump v0.14.7 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index e7ff1722c..6bb50f310 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TensorKit" uuid = "07d1fe3e-3e46-537d-9eac-e9e13d0d4cec" authors = ["Jutho Haegeman"] -version = "0.14.6" +version = "0.14.7" [deps] LRUCache = "8ac3fa9e-de4c-5943-b1dc-09c6b5f20637"