Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cpp/deeplake_core/exceptions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ class invalid_video_compression : public exception
{
public:
explicit invalid_video_compression(const std::string& comp)
: exception(fmt::format("Provided image compression '{}' is not supported. Only mp4 are supported", comp))
: exception(fmt::format("Provided video compression '{}' is not supported. Only mp4 is supported", comp))
{
}
};
Expand Down
26 changes: 25 additions & 1 deletion cpp/deeplake_pg/extension_init.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -803,10 +803,34 @@ static void process_utility(PlannedStmt* pstmt,
case JSONBOID:
ds->add_column(column_name, deeplake_core::type::dict());
break;
case BYTEAOID:
case BYTEAOID: {
// Check for special domain types over BYTEA
if (pg::utils::is_file_domain_type(attr->atttypid)) {
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Domain type check is applied to wrong OID - should check base type, not the domain OID itself. Domain types inherit from BYTEA (base type), but attr->atttypid holds the domain OID. When checking is_file_domain_type(attr->atttypid), it will match correctly, but the parent case BYTEAOID: will never trigger for domain types because PostgreSQL doesn't set atttypid to BYTEAOID for domains.

Fix: Need to either check the base type of the domain first, or restructure to check domain types before the base type switch. Consider:

// Before the switch statement, check for domains over BYTEA
Oid baseType = getBaseType(attr->atttypid);
if (baseType == BYTEAOID) {
    if (pg::utils::is_file_domain_type(attr->atttypid)) { ... }
    // etc.
}

// FILE domain -> link of bytes
ds->add_column(
column_name,
deeplake_core::type::link(
deeplake_core::type::generic(nd::type::scalar(nd::dtype::byte))));
break;
}
if (pg::utils::is_image_domain_type(attr->atttypid)) {
// IMAGE domain -> image type
ds->add_column(
column_name,
deeplake_core::type::image(
nd::type::array(nd::dtype::byte, 3), codecs::compression::null));
break;
}
if (pg::utils::is_video_domain_type(attr->atttypid)) {
// VIDEO domain -> video type
ds->add_column(column_name,
deeplake_core::type::video(codecs::compression::mp4));
break;
}
ds->add_column(column_name,
deeplake_core::type::generic(nd::type::scalar(nd::dtype::byte)));
break;
}
case INT2ARRAYOID: {
int32_t ndims = (attr->attndims > 0) ? attr->attndims : 1;
if (ndims > 255) {
Expand Down
17 changes: 16 additions & 1 deletion cpp/deeplake_pg/nd_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,16 @@ inline std::string nd_to_pg_type(deeplake_core::type t)
case deeplake_core::type_kind::generic:
res = t.to_string();
break;
case deeplake_core::type_kind::link:
case deeplake_core::type_kind::link: {
// Check if link contains bytes (file type) vs other types
auto inner = t.as_link().get_type();
if (inner->is_generic() && inner->data_type().get_dtype() == nd::dtype::byte) {
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Potential null pointer dereference if get_type() returns nullptr. Should add null check before dereferencing.

Fix:

Suggested change
if (inner->is_generic() && inner->data_type().get_dtype() == nd::dtype::byte) {
auto inner = t.as_link().get_type();
if (inner && inner->is_generic() && inner->data_type().get_dtype() == nd::dtype::byte) {

res = "file";
} else {
res = "text";
}
break;
}
case deeplake_core::type_kind::text:
res = "text";
break;
Expand All @@ -336,6 +345,12 @@ inline std::string nd_to_pg_type(deeplake_core::type t)
res = "float4";
break;
case deeplake_core::type_kind::image:
res = "image";
break;
case deeplake_core::type_kind::video:
res = "video";
break;
case deeplake_core::type_kind::audio:
case deeplake_core::type_kind::bmask:
case deeplake_core::type_kind::smask:
case deeplake_core::type_kind::medical:
Expand Down
26 changes: 25 additions & 1 deletion cpp/deeplake_pg/table_storage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -762,10 +762,34 @@ void table_storage::create_table(const std::string& table_name, Oid table_id, Tu
case JSONBOID:
td.get_dataset()->add_column(column_name, deeplake_core::type::dict());
break;
case BYTEAOID:
case BYTEAOID: {
// Check for special domain types over BYTEA
if (pg::utils::is_file_domain_type(attr->atttypid)) {
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same issue as extension_init.cpp:808 - domain type check won't work correctly because the case BYTEAOID: won't match domain types. Domain types have their own OID distinct from BYTEAOID.

Fix: Check the base type before entering the switch, or restructure to handle domains properly.

// FILE domain -> link of bytes
td.get_dataset()->add_column(
column_name,
deeplake_core::type::link(
deeplake_core::type::generic(nd::type::scalar(nd::dtype::byte))));
break;
}
if (pg::utils::is_image_domain_type(attr->atttypid)) {
// IMAGE domain -> image type
td.get_dataset()->add_column(
column_name,
deeplake_core::type::image(
nd::type::array(nd::dtype::byte, 3), codecs::compression::null));
break;
}
if (pg::utils::is_video_domain_type(attr->atttypid)) {
// VIDEO domain -> video type
td.get_dataset()->add_column(column_name,
deeplake_core::type::video(codecs::compression::mp4));
break;
}
td.get_dataset()->add_column(column_name,
deeplake_core::type::generic(nd::type::scalar(nd::dtype::byte)));
break;
}
case INT2ARRAYOID: {
int32_t ndims = (attr->attndims > 0) ? attr->attndims : 1;
if (ndims > 255) {
Expand Down
52 changes: 52 additions & 0 deletions cpp/deeplake_pg/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,58 @@ inline Oid get_base_array_element_type(Oid typid)
return InvalidOid;
}

/**
* @brief Check if a type OID is a domain type with the given name
* @param typid The type OID to check
* @param domain_name The domain name to match
* @return true if this is a domain type with the given name, false otherwise
*/
inline bool is_domain_type(Oid typid, const char* domain_name)
{
if (typid == InvalidOid) {
return false;
}

HeapTuple tup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(typid));
if (!HeapTupleIsValid(tup)) {
return false;
}

Form_pg_type typTup = (Form_pg_type)GETSTRUCT(tup);

bool is_match = (typTup->typtype == TYPTYPE_DOMAIN && strcmp(NameStr(typTup->typname), domain_name) == 0);
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Case-sensitive string comparison may cause issues if domains are created with different casing (e.g., "FILE" vs "file"). PostgreSQL identifiers are case-insensitive by default but stored in lowercase unless quoted.

Recommended fix: Use case-insensitive comparison or clarify that domains must be lowercase:

Suggested change
bool is_match = (typTup->typtype == TYPTYPE_DOMAIN && strcmp(NameStr(typTup->typname), domain_name) == 0);
bool is_match = (typTup->typtype == TYPTYPE_DOMAIN && pg_strcasecmp(NameStr(typTup->typname), domain_name) == 0);


ReleaseSysCache(tup);
return is_match;
}

/**
* @brief Check if a type OID is the FILE domain type
* FILE is a domain over BYTEA that represents link-of-bytes semantics.
*/
inline bool is_file_domain_type(Oid typid)
{
return is_domain_type(typid, "file");
}

/**
* @brief Check if a type OID is the IMAGE domain type
* IMAGE is a domain over BYTEA that represents image data.
*/
inline bool is_image_domain_type(Oid typid)
{
return is_domain_type(typid, "image");
}

/**
* @brief Check if a type OID is the VIDEO domain type
* VIDEO is a domain over BYTEA that represents video data.
*/
inline bool is_video_domain_type(Oid typid)
{
return is_domain_type(typid, "video");
}

/// Error handling wrapper
template <typename Func>
inline auto pg_try(Func&& f) -> decltype(f())
Expand Down
8 changes: 8 additions & 0 deletions postgres/pg_deeplake--1.0.sql
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@
CREATE DOMAIN IMAGE AS BYTEA;
COMMENT ON DOMAIN IMAGE IS 'Binary image data stored as BYTEA';

-- VIDEO domain: behaves like BYTEA but semantically represents video data
CREATE DOMAIN VIDEO AS BYTEA;
COMMENT ON DOMAIN VIDEO IS 'Binary video data stored as BYTEA';

-- FILE domain: behaves like BYTEA but semantically represents file data
CREATE DOMAIN FILE AS BYTEA;
COMMENT ON DOMAIN FILE IS 'Binary file data stored as BYTEA';

CREATE FUNCTION handle_index_creation() RETURNS event_trigger AS 'pg_deeplake' LANGUAGE C VOLATILE;

-- Create the event trigger to listen for CREATE INDEX events
Expand Down
189 changes: 189 additions & 0 deletions postgres/tests/py_tests/test_file_domain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
"""
Test FILE domain type (domain over BYTEA, maps to link-of-bytes in deeplake).
"""
import pytest
import asyncpg
from test_utils.assertions import Assertions


@pytest.mark.asyncio
async def test_file_domain(db_conn: asyncpg.Connection):
"""
Test FILE domain type for link-of-bytes data.

Tests:
- Creating FILE columns (domain over BYTEA)
- Inserting binary data as FILE
- NULL handling with FILE
- Bulk insert with FILE domain
- Exact matches on FILE columns
- octet_length() function on FILE
- FILE/BYTEA compatibility and casting
"""
assertions = Assertions(db_conn)

try:
# Create table with FILE domain columns
await db_conn.execute("""
CREATE TABLE file_test (
id INT,
document FILE,
attachment FILE
) USING deeplake
""")

# Test basic inserts with FILE domain (explicit cast)
await db_conn.execute("""
INSERT INTO file_test (id, document, attachment)
VALUES
(1, '\\x48656c6c6f'::FILE, '\\x576f726c64'::FILE),
(2, '\\xDEADBEEF'::FILE, '\\xCAFEBABE'::FILE)
""")

# Test inserts without explicit cast (BYTEA is implicitly accepted)
await db_conn.execute("""
INSERT INTO file_test (id, document, attachment)
VALUES
(3, '\\x00010203', '\\x04050607')
""")

# Test NULL handling
await db_conn.execute("""
INSERT INTO file_test (id, document, attachment)
VALUES
(4, NULL, '\\x08090A0B'::FILE),
(5, '\\x0C0D0E0F'::FILE, NULL),
(6, NULL, NULL)
""")

# Test bulk insert with FILE domain
await db_conn.execute("""
INSERT INTO file_test (id, document, attachment)
SELECT
i + 10,
('\\x' || lpad(to_hex(i), 8, '0'))::FILE,
('\\x' || lpad(to_hex(i * 2), 8, '0'))::FILE
FROM generate_series(1, 5) AS i
""")

# Test exact matches for FILE columns
await assertions.assert_query_row_count(
1,
"SELECT * FROM file_test WHERE document = FROM_HEX('48656c6c6f')"
)

await assertions.assert_query_row_count(
1,
"SELECT * FROM file_test WHERE attachment = FROM_HEX('576f726c64')"
)

# Test NULL filtering
await assertions.assert_query_row_count(
2,
"SELECT * FROM file_test WHERE document IS NULL"
)

await assertions.assert_query_row_count(
2,
"SELECT * FROM file_test WHERE attachment IS NULL"
)

await assertions.assert_query_row_count(
9,
"SELECT * FROM file_test WHERE document IS NOT NULL"
)

await assertions.assert_query_row_count(
9,
"SELECT * FROM file_test WHERE attachment IS NOT NULL"
)

# Test that FILE behaves like BYTEA - octet_length function should work
await assertions.assert_query_row_count(
1,
"SELECT * FROM file_test WHERE octet_length(document) = 5 AND id = 1"
)

await assertions.assert_query_row_count(
1,
"SELECT * FROM file_test WHERE octet_length(document) = 4 AND id = 2"
)

# Test that FILE and BYTEA are compatible (can compare/cast)
await db_conn.execute("SET pg_deeplake.use_deeplake_executor = off")

await assertions.assert_query_row_count(
1,
"SELECT * FROM file_test WHERE document = FROM_HEX('48656c6c6f')::BYTEA AND id = 1"
)

await assertions.assert_query_row_count(
1,
"SELECT * FROM file_test WHERE document::BYTEA = FROM_HEX('48656c6c6f')::BYTEA AND id = 1"
)

await db_conn.execute("RESET pg_deeplake.use_deeplake_executor")

print("✓ Test passed: FILE domain type works correctly")

finally:
# Cleanup
await db_conn.execute("DROP TABLE IF EXISTS file_test CASCADE")
await db_conn.execute("RESET pg_deeplake.use_deeplake_executor")


@pytest.mark.asyncio
async def test_file_domain_alter_table(db_conn: asyncpg.Connection):
"""
Test ALTER TABLE ADD COLUMN with FILE domain type.
"""
assertions = Assertions(db_conn)

try:
# Create initial table
await db_conn.execute("""
CREATE TABLE file_alter_test (
id INT,
name TEXT
) USING deeplake
""")

# Insert initial data
await db_conn.execute("""
INSERT INTO file_alter_test (id, name)
VALUES (1, 'row1'), (2, 'row2'), (3, 'row3')
""")

# Add FILE column
await db_conn.execute("""
ALTER TABLE file_alter_test ADD COLUMN file_col FILE
""")

# Update with FILE data
await db_conn.execute("""
UPDATE file_alter_test SET file_col = '\\x48656c6c6f'::FILE WHERE id = 1
""")
await db_conn.execute("""
UPDATE file_alter_test SET file_col = '\\x576f726c64'::FILE WHERE id = 2
""")

# Verify
await assertions.assert_query_row_count(
2,
"SELECT * FROM file_alter_test WHERE file_col IS NOT NULL"
)

await assertions.assert_query_row_count(
1,
"SELECT * FROM file_alter_test WHERE file_col IS NULL"
)

await assertions.assert_query_row_count(
1,
"SELECT * FROM file_alter_test WHERE file_col = FROM_HEX('48656c6c6f')"
)

print("✓ Test passed: ALTER TABLE ADD COLUMN with FILE domain works correctly")

finally:
await db_conn.execute("DROP TABLE IF EXISTS file_alter_test CASCADE")
Loading
Loading