From 5af9cfd8c6608ec76dbeeeb782a70abd9b891660 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 17 Dec 2025 20:51:56 -0600 Subject: [PATCH 01/14] debug code on 1.12 --- test/runtests.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 9ca171f..74d629a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -45,7 +45,7 @@ struct CustomStruct2{sym} x::Int end -@testset "Arrow" begin +# @testset "Arrow" begin @testset "table roundtrips" begin for case in testtables testtable(case...) @@ -388,6 +388,7 @@ end )) tt = Arrow.Table(Arrow.tobuffer(t)) @test length(tt.a) == 12 + # Evaluated: [1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 4] @test tt.a == [1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 5] t = Tables.partitioner(( @@ -1118,4 +1119,4 @@ end colmetadata!(df, :c, "cckey", "ccvalue") end end # @testset "DataAPI.metadata" -end +# end From 86c6005525d18a6610f73540b7454d2eef3cf31f Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 31 Dec 2025 00:31:48 -0600 Subject: [PATCH 02/14] early return --- src/arraytypes/dictencoding.jl | 91 +++++++++++++++++----------------- 1 file changed, 46 insertions(+), 45 deletions(-) diff --git a/src/arraytypes/dictencoding.jl b/src/arraytypes/dictencoding.jl index fe48304..aec59b1 100644 --- a/src/arraytypes/dictencoding.jl +++ b/src/arraytypes/dictencoding.jl @@ -144,53 +144,54 @@ function arrowvector( id = x.encoding.id if !haskey(de, id) de[id] = Lockable(x.encoding) - else - encodinglockable = de[id] - Base.@lock encodinglockable begin - encoding = encodinglockable.value - # in this case, we just need to check if any values in our local pool need to be delta dicationary serialized - deltas = setdiff(x.encoding, encoding) - if !isempty(deltas) - ET = indextype(encoding) - if length(deltas) + length(encoding) > typemax(ET) - error( - "fatal error serializing dict encoded column with ref index type of $ET; subsequent record batch unique values resulted in $(length(deltas) + length(encoding)) unique values, which exceeds possible index values in $ET", - ) - end - data = arrowvector( - deltas, - i, - nl, - fi, - de, - ded, - nothing; - dictencode=dictencodenested, - dictencodenested=dictencodenested, - dictencoding=true, - kw..., + return x + end + + encodinglockable = de[id] + Base.@lock encodinglockable begin + encoding = encodinglockable.value + # in this case, we just need to check if any values in our local pool need to be delta dicationary serialized + deltas = setdiff(x.encoding, encoding) + if !isempty(deltas) + ET = indextype(encoding) + if length(deltas) + length(encoding) > typemax(ET) + error( + "fatal error serializing dict encoded column with ref index type of $ET; subsequent record batch unique values resulted in $(length(deltas) + length(encoding)) unique values, which exceeds possible index values in $ET", ) - push!( - ded, - DictEncoding{eltype(data),ET,typeof(data)}( - id, - data, - false, - getmetadata(data), - ), + end + data = arrowvector( + deltas, + i, + nl, + fi, + de, + ded, + nothing; + dictencode=dictencodenested, + dictencodenested=dictencodenested, + dictencoding=true, + kw..., + ) + push!( + ded, + DictEncoding{eltype(data),ET,typeof(data)}( + id, + data, + false, + getmetadata(data), + ), + ) + if typeof(encoding.data) <: ChainedVector + append!(encoding.data, data) + else + data2 = ChainedVector([encoding.data, data]) + encoding = DictEncoding{eltype(data2),ET,typeof(data2)}( + id, + data2, + false, + getmetadata(encoding), ) - if typeof(encoding.data) <: ChainedVector - 
append!(encoding.data, data) - else - data2 = ChainedVector([encoding.data, data]) - encoding = DictEncoding{eltype(data2),ET,typeof(data2)}( - id, - data2, - false, - getmetadata(encoding), - ) - de[id] = Lockable(encoding) - end + de[id] = Lockable(encoding) end end end From 58c1a9eadbda1698a1651410b4e166b392a10c53 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 31 Dec 2025 10:25:55 -0600 Subject: [PATCH 03/14] dictencode --- src/arraytypes/dictencoding.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/arraytypes/dictencoding.jl b/src/arraytypes/dictencoding.jl index aec59b1..c84a3bb 100644 --- a/src/arraytypes/dictencoding.jl +++ b/src/arraytypes/dictencoding.jl @@ -167,7 +167,7 @@ function arrowvector( de, ded, nothing; - dictencode=dictencodenested, + dictencode=dictencode, dictencodenested=dictencodenested, dictencoding=true, kw..., @@ -252,7 +252,7 @@ function arrowvector( de, ded, nothing; - dictencode=dictencodenested, + dictencode=dictencode, dictencodenested=dictencodenested, dictencoding=true, kw..., @@ -303,7 +303,7 @@ function arrowvector( de, ded, nothing; - dictencode=dictencodenested, + dictencode=dictencode, dictencodenested=dictencodenested, dictencoding=true, kw..., From c3dffb6165719ef167e1e130398c1d941efc2e13 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 31 Dec 2025 11:06:26 -0600 Subject: [PATCH 04/14] ExtendedTestSet --- test/Project.toml | 2 ++ test/runtests.jl | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/test/Project.toml b/test/Project.toml index 93977a9..1079926 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -31,6 +31,7 @@ SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" TimeZones = "f269a46b-ccf7-5d73-abea-4c690281aa53" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +TestSetExtensions = "98d24dd4-01ad-11ea-1b02-c9a08f80db04" UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [compat] @@ -44,4 +45,5 @@ PooledArrays = "1" StructTypes = "1" SentinelArrays = "1" Tables = "1" +TestSetExtensions = "3" TimeZones = "1" diff --git a/test/runtests.jl b/test/runtests.jl index 74d629a..2571cce 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -28,6 +28,7 @@ using DataAPI using FilePathsBase using DataFrames import Random: randstring +using TestSetExtensions: ExtendedTestSet include(joinpath(dirname(pathof(ArrowTypes)), "../test/tests.jl")) include(joinpath(dirname(pathof(Arrow)), "../test/testtables.jl")) @@ -45,7 +46,7 @@ struct CustomStruct2{sym} x::Int end -# @testset "Arrow" begin +@testset ExtendedTestSet "Arrow" begin @testset "table roundtrips" begin for case in testtables testtable(case...) 
@@ -1119,4 +1120,4 @@ end colmetadata!(df, :c, "cckey", "ccvalue") end end # @testset "DataAPI.metadata" -# end +end From b387e5ca56bf732d47db0a559f06792709a81a88 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 31 Dec 2025 11:06:43 -0600 Subject: [PATCH 05/14] less indirection in loading additional test files --- test/runtests.jl | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 2571cce..981e946 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -30,11 +30,14 @@ using DataFrames import Random: randstring using TestSetExtensions: ExtendedTestSet +# this formulation tests the loaded ArrowTypes, even if it's not the dev version +# within the mono-repo include(joinpath(dirname(pathof(ArrowTypes)), "../test/tests.jl")) -include(joinpath(dirname(pathof(Arrow)), "../test/testtables.jl")) -include(joinpath(dirname(pathof(Arrow)), "../test/testappend.jl")) -include(joinpath(dirname(pathof(Arrow)), "../test/integrationtest.jl")) -include(joinpath(dirname(pathof(Arrow)), "../test/dates.jl")) + +include(joinpath(@__DIR__, "testtables.jl")) +include(joinpath(@__DIR__, "testappend.jl")) +include(joinpath(@__DIR__, "integrationtest.jl")) +include(joinpath(@__DIR__, "dates.jl")) struct CustomStruct x::Int From 62c6a99045075082e3245af2ac86a912cd5d5c7e Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 31 Dec 2025 11:41:44 -0600 Subject: [PATCH 06/14] Revert "dictencode" This reverts commit 58c1a9eadbda1698a1651410b4e166b392a10c53. --- src/arraytypes/dictencoding.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/arraytypes/dictencoding.jl b/src/arraytypes/dictencoding.jl index c84a3bb..aec59b1 100644 --- a/src/arraytypes/dictencoding.jl +++ b/src/arraytypes/dictencoding.jl @@ -167,7 +167,7 @@ function arrowvector( de, ded, nothing; - dictencode=dictencode, + dictencode=dictencodenested, dictencodenested=dictencodenested, dictencoding=true, kw..., @@ -252,7 +252,7 @@ function arrowvector( de, ded, nothing; - dictencode=dictencode, + dictencode=dictencodenested, dictencodenested=dictencodenested, dictencoding=true, kw..., @@ -303,7 +303,7 @@ function arrowvector( de, ded, nothing; - dictencode=dictencode, + dictencode=dictencodenested, dictencodenested=dictencodenested, dictencoding=true, kw..., From ac66cf7f525229adcfbf2664f87a5ed4666daa62 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 31 Dec 2025 12:47:37 -0600 Subject: [PATCH 07/14] reuse the same lock --- src/arraytypes/dictencoding.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/arraytypes/dictencoding.jl b/src/arraytypes/dictencoding.jl index aec59b1..69eccd0 100644 --- a/src/arraytypes/dictencoding.jl +++ b/src/arraytypes/dictencoding.jl @@ -191,7 +191,7 @@ function arrowvector( false, getmetadata(encoding), ) - de[id] = Lockable(encoding) + de[id] = Lockable(encoding, encodinglockable.lock) end end end @@ -327,7 +327,7 @@ function arrowvector( false, getmetadata(encoding), ) - de[id] = Lockable(encoding) + de[id] = Lockable(encoding, encodinglockable.lock) end end end From fb224349c9b76e64459960b801b97a4026d63eb6 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 31 Dec 2025 13:30:02 -0600 Subject: [PATCH 08/14] note on race condition --- src/arraytypes/dictencoding.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/arraytypes/dictencoding.jl b/src/arraytypes/dictencoding.jl index 69eccd0..6cf8079 100644 --- a/src/arraytypes/dictencoding.jl +++ b/src/arraytypes/dictencoding.jl @@ 
-142,6 +142,8 @@ function arrowvector( kw..., ) id = x.encoding.id + # XXX This is a race condition if two workers hit this block at the same time, then they'll create + # distinct locks if !haskey(de, id) de[id] = Lockable(x.encoding) return x @@ -216,6 +218,8 @@ function arrowvector( x = x.data len = length(x) validity = ValidityBitmap(x) + # XXX This is a race condition if two workers hit this block at the same time, then they'll create + # distinct locks if !haskey(de, id) # dict encoding doesn't exist yet, so create for 1st time if DataAPI.refarray(x) === x || DataAPI.refpool(x) === nothing From 75c06ec9125e785fc693df1d27bbd2cf077b76ac Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 6 Jan 2026 11:38:50 -0600 Subject: [PATCH 09/14] disable multithreaded writes --- src/write.jl | 46 ++++++++++++++++++++++++---------------------- test/runtests.jl | 3 ++- 2 files changed, 26 insertions(+), 23 deletions(-) diff --git a/src/write.jl b/src/write.jl index 1f1bfd1..325d0ab 100644 --- a/src/write.jl +++ b/src/write.jl @@ -295,27 +295,29 @@ function write(writer::Writer, source) recbatchmsg = makerecordbatchmsg(writer.schema[], cols, writer.alignment) put!(writer.msgs, recbatchmsg) else - if writer.threaded - @wkspawn process_partition( - tblcols, - writer.dictencodings, - writer.largelists, - writer.compress, - writer.denseunions, - writer.dictencode, - writer.dictencodenested, - writer.maxdepth, - writer.sync, - writer.msgs, - writer.alignment, - $(writer.partition_count), - writer.schema, - writer.errorref, - writer.anyerror, - writer.meta, - writer.colmeta, - ) - else + # XXX There is a race condition in the processing of dict encodings + # so we disable multithreaded writing until that can be addressed. See #582 + # if writer.threaded + # @wkspawn process_partition( + # tblcols, + # writer.dictencodings, + # writer.largelists, + # writer.compress, + # writer.denseunions, + # writer.dictencode, + # writer.dictencodenested, + # writer.maxdepth, + # writer.sync, + # writer.msgs, + # writer.alignment, + # $(writer.partition_count), + # writer.schema, + # writer.errorref, + # writer.anyerror, + # writer.meta, + # writer.colmeta, + # ) + # else @async process_partition( tblcols, writer.dictencodings, @@ -335,7 +337,7 @@ function write(writer::Writer, source) writer.meta, writer.colmeta, ) - end + # end end writer.partition_count += 1 end diff --git a/test/runtests.jl b/test/runtests.jl index 981e946..c715842 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -385,6 +385,8 @@ end end @testset "# 126" begin + # XXX This test also captures a race condition in multithreaded + # writes of dictionary encoded arrays t = Tables.partitioner(( (a=Arrow.toarrowvector(PooledArray([1, 2, 3])),), (a=Arrow.toarrowvector(PooledArray([1, 2, 3, 4])),), @@ -392,7 +394,6 @@ end )) tt = Arrow.Table(Arrow.tobuffer(t)) @test length(tt.a) == 12 - # Evaluated: [1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 4] @test tt.a == [1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 5] t = Tables.partitioner(( From 29a04a6a149b842b7b9fc330ed71557a4102f699 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 6 Jan 2026 12:07:00 -0600 Subject: [PATCH 10/14] format --- src/write.jl | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/src/write.jl b/src/write.jl index 325d0ab..4c3800f 100644 --- a/src/write.jl +++ b/src/write.jl @@ -318,25 +318,25 @@ function write(writer::Writer, source) # writer.colmeta, # ) # else - @async process_partition( - tblcols, - writer.dictencodings, - 
writer.largelists, - writer.compress, - writer.denseunions, - writer.dictencode, - writer.dictencodenested, - writer.maxdepth, - writer.sync, - writer.msgs, - writer.alignment, - $(writer.partition_count), - writer.schema, - writer.errorref, - writer.anyerror, - writer.meta, - writer.colmeta, - ) + @async process_partition( + tblcols, + writer.dictencodings, + writer.largelists, + writer.compress, + writer.denseunions, + writer.dictencode, + writer.dictencodenested, + writer.maxdepth, + writer.sync, + writer.msgs, + writer.alignment, + $(writer.partition_count), + writer.schema, + writer.errorref, + writer.anyerror, + writer.meta, + writer.colmeta, + ) # end end writer.partition_count += 1 From 7622b0989376b908b4e22733f929cf8054cae6b0 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 6 Jan 2026 15:19:25 -0600 Subject: [PATCH 11/14] revert unrelated changes --- src/arraytypes/dictencoding.jl | 93 +++++++++++++++++----------------- 1 file changed, 46 insertions(+), 47 deletions(-) diff --git a/src/arraytypes/dictencoding.jl b/src/arraytypes/dictencoding.jl index 6cf8079..84d5105 100644 --- a/src/arraytypes/dictencoding.jl +++ b/src/arraytypes/dictencoding.jl @@ -146,54 +146,53 @@ function arrowvector( # distinct locks if !haskey(de, id) de[id] = Lockable(x.encoding) - return x - end - - encodinglockable = de[id] - Base.@lock encodinglockable begin - encoding = encodinglockable.value - # in this case, we just need to check if any values in our local pool need to be delta dicationary serialized - deltas = setdiff(x.encoding, encoding) - if !isempty(deltas) - ET = indextype(encoding) - if length(deltas) + length(encoding) > typemax(ET) - error( - "fatal error serializing dict encoded column with ref index type of $ET; subsequent record batch unique values resulted in $(length(deltas) + length(encoding)) unique values, which exceeds possible index values in $ET", + else + encodinglockable = de[id] + Base.@lock encodinglockable begin + encoding = encodinglockable.value + # in this case, we just need to check if any values in our local pool need to be delta dicationary serialized + deltas = setdiff(x.encoding, encoding) + if !isempty(deltas) + ET = indextype(encoding) + if length(deltas) + length(encoding) > typemax(ET) + error( + "fatal error serializing dict encoded column with ref index type of $ET; subsequent record batch unique values resulted in $(length(deltas) + length(encoding)) unique values, which exceeds possible index values in $ET", + ) + end + data = arrowvector( + deltas, + i, + nl, + fi, + de, + ded, + nothing; + dictencode=dictencodenested, + dictencodenested=dictencodenested, + dictencoding=true, + kw..., ) - end - data = arrowvector( - deltas, - i, - nl, - fi, - de, - ded, - nothing; - dictencode=dictencodenested, - dictencodenested=dictencodenested, - dictencoding=true, - kw..., - ) - push!( - ded, - DictEncoding{eltype(data),ET,typeof(data)}( - id, - data, - false, - getmetadata(data), - ), - ) - if typeof(encoding.data) <: ChainedVector - append!(encoding.data, data) - else - data2 = ChainedVector([encoding.data, data]) - encoding = DictEncoding{eltype(data2),ET,typeof(data2)}( - id, - data2, - false, - getmetadata(encoding), + push!( + ded, + DictEncoding{eltype(data),ET,typeof(data)}( + id, + data, + false, + getmetadata(data), + ), ) - de[id] = Lockable(encoding, encodinglockable.lock) + if typeof(encoding.data) <: ChainedVector + append!(encoding.data, data) + else + data2 = ChainedVector([encoding.data, data]) + encoding = 
DictEncoding{eltype(data2),ET,typeof(data2)}( + id, + data2, + false, + getmetadata(encoding), + ) + de[id] = Lockable(encoding) + end end end end @@ -331,7 +330,7 @@ function arrowvector( false, getmetadata(encoding), ) - de[id] = Lockable(encoding, encodinglockable.lock) + de[id] = Lockable(encoding) end end end From 58470ae8b3f763bfb1643ee51733efe74e432e96 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 6 Jan 2026 16:16:45 -0600 Subject: [PATCH 12/14] reduce depth in nesting test --- test/runtests.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index c715842..aacbc69 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -607,15 +607,15 @@ end @test eltype(tbl.a) == Union{Int64,Missing} end - @testset "# 181" begin + # XXX this test hangs on Julia 1.12 when using a deeper nesting d = Dict{Int,Int}() - for i = 1:9 + for i in 1:1 d = Dict(i => d) end tbl = (x=[d],) - msg = "reached nested serialization level (20) deeper than provided max depth argument (19); to increase allowed nesting level, pass `maxdepth=X`" - @test_throws ErrorException(msg) Arrow.tobuffer(tbl; maxdepth=19) - @test Arrow.Table(Arrow.tobuffer(tbl; maxdepth=20)).x == tbl.x + msg = "reached nested serialization level (2) deeper than provided max depth argument (1); to increase allowed nesting level, pass `maxdepth=X`" + @test_throws ErrorException(msg) Arrow.tobuffer(tbl; maxdepth=1) + @test Arrow.Table(Arrow.tobuffer(tbl; maxdepth=5)).x == tbl.x end @testset "# 167" begin From f68a332118adc8eee52b0c0178fd7faf64fa9179 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 6 Jan 2026 16:18:22 -0600 Subject: [PATCH 13/14] oops --- test/runtests.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/runtests.jl b/test/runtests.jl index aacbc69..1b7eaf5 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -607,6 +607,7 @@ end @test eltype(tbl.a) == Union{Int64,Missing} end + @testset "# 181" begin # XXX this test hangs on Julia 1.12 when using a deeper nesting d = Dict{Int,Int}() for i in 1:1 From db35d621a28d83b89ee9974ca0fe181e0bff55fa Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 6 Jan 2026 16:28:25 -0600 Subject: [PATCH 14/14] format --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 1b7eaf5..b068be1 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -610,7 +610,7 @@ end @testset "# 181" begin # XXX this test hangs on Julia 1.12 when using a deeper nesting d = Dict{Int,Int}() - for i in 1:1 + for i = 1:1 d = Dict(i => d) end tbl = (x=[d],)
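
The failure mode these commits work around is the one exercised by the "# 126" testset: several partitions sharing a dict-encoded column (here backed by PooledArray) are serialized through the same id => Lockable(encoding) table, and the multithreaded writer can merge the delta dictionaries out of order. Below is a minimal sketch of that round trip, using only APIs that already appear in test/runtests.jl (Tables.partitioner, Arrow.toarrowvector, Arrow.tobuffer, Arrow.Table); the expected column is copied from the test, and the bad value in the final comment is the "Evaluated:" output recorded in PATCH 01, not a newly reproduced result.

using Arrow, PooledArrays, Tables

# Three partitions share one dict-encoded column; partitions 2 and 3 introduce
# new levels (4, then 5), so their dictionaries are written as delta batches.
t = Tables.partitioner((
    (a=Arrow.toarrowvector(PooledArray([1, 2, 3])),),
    (a=Arrow.toarrowvector(PooledArray([1, 2, 3, 4])),),
    (a=Arrow.toarrowvector(PooledArray([1, 2, 3, 4, 5])),),
))

tt = Arrow.Table(Arrow.tobuffer(t))

# With the serial (@async) writer this holds; the racy multithreaded path could
# instead yield [1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 4] as seen in PATCH 01.
@assert length(tt.a) == 12
@assert tt.a == [1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 5]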