-
Notifications
You must be signed in to change notification settings - Fork 705
Added image, video, file domain types. #3129
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
|
|
@@ -324,7 +324,16 @@ inline std::string nd_to_pg_type(deeplake_core::type t) | |||||||
| case deeplake_core::type_kind::generic: | ||||||||
| res = t.to_string(); | ||||||||
| break; | ||||||||
| case deeplake_core::type_kind::link: | ||||||||
| case deeplake_core::type_kind::link: { | ||||||||
| // Check if link contains bytes (file type) vs other types | ||||||||
| auto inner = t.as_link().get_type(); | ||||||||
| if (inner->is_generic() && inner->data_type().get_dtype() == nd::dtype::byte) { | ||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Potential null pointer dereference if Fix:
Suggested change
|
||||||||
| res = "file"; | ||||||||
| } else { | ||||||||
| res = "text"; | ||||||||
| } | ||||||||
| break; | ||||||||
| } | ||||||||
| case deeplake_core::type_kind::text: | ||||||||
| res = "text"; | ||||||||
| break; | ||||||||
|
|
@@ -336,6 +345,12 @@ inline std::string nd_to_pg_type(deeplake_core::type t) | |||||||
| res = "float4"; | ||||||||
| break; | ||||||||
| case deeplake_core::type_kind::image: | ||||||||
| res = "image"; | ||||||||
| break; | ||||||||
| case deeplake_core::type_kind::video: | ||||||||
| res = "video"; | ||||||||
| break; | ||||||||
| case deeplake_core::type_kind::audio: | ||||||||
| case deeplake_core::type_kind::bmask: | ||||||||
| case deeplake_core::type_kind::smask: | ||||||||
| case deeplake_core::type_kind::medical: | ||||||||
|
|
||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -762,10 +762,34 @@ void table_storage::create_table(const std::string& table_name, Oid table_id, Tu | |
| case JSONBOID: | ||
| td.get_dataset()->add_column(column_name, deeplake_core::type::dict()); | ||
| break; | ||
| case BYTEAOID: | ||
| case BYTEAOID: { | ||
| // Check for special domain types over BYTEA | ||
| if (pg::utils::is_file_domain_type(attr->atttypid)) { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same issue as extension_init.cpp:808 - domain type check won't work correctly because the Fix: Check the base type before entering the switch, or restructure to handle domains properly. |
||
| // FILE domain -> link of bytes | ||
| td.get_dataset()->add_column( | ||
| column_name, | ||
| deeplake_core::type::link( | ||
| deeplake_core::type::generic(nd::type::scalar(nd::dtype::byte)))); | ||
| break; | ||
| } | ||
| if (pg::utils::is_image_domain_type(attr->atttypid)) { | ||
| // IMAGE domain -> image type | ||
| td.get_dataset()->add_column( | ||
| column_name, | ||
| deeplake_core::type::image( | ||
| nd::type::array(nd::dtype::byte, 3), codecs::compression::null)); | ||
| break; | ||
| } | ||
| if (pg::utils::is_video_domain_type(attr->atttypid)) { | ||
| // VIDEO domain -> video type | ||
| td.get_dataset()->add_column(column_name, | ||
| deeplake_core::type::video(codecs::compression::mp4)); | ||
| break; | ||
| } | ||
| td.get_dataset()->add_column(column_name, | ||
| deeplake_core::type::generic(nd::type::scalar(nd::dtype::byte))); | ||
| break; | ||
| } | ||
| case INT2ARRAYOID: { | ||
| int32_t ndims = (attr->attndims > 0) ? attr->attndims : 1; | ||
| if (ndims > 255) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -110,6 +110,58 @@ inline Oid get_base_array_element_type(Oid typid) | |||||
| return InvalidOid; | ||||||
| } | ||||||
|
|
||||||
| /** | ||||||
| * @brief Check if a type OID is a domain type with the given name | ||||||
| * @param typid The type OID to check | ||||||
| * @param domain_name The domain name to match | ||||||
| * @return true if this is a domain type with the given name, false otherwise | ||||||
| */ | ||||||
| inline bool is_domain_type(Oid typid, const char* domain_name) | ||||||
| { | ||||||
| if (typid == InvalidOid) { | ||||||
| return false; | ||||||
| } | ||||||
|
|
||||||
| HeapTuple tup = SearchSysCache1(TYPEOID, ObjectIdGetDatum(typid)); | ||||||
| if (!HeapTupleIsValid(tup)) { | ||||||
| return false; | ||||||
| } | ||||||
|
|
||||||
| Form_pg_type typTup = (Form_pg_type)GETSTRUCT(tup); | ||||||
|
|
||||||
| bool is_match = (typTup->typtype == TYPTYPE_DOMAIN && strcmp(NameStr(typTup->typname), domain_name) == 0); | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Case-sensitive string comparison may cause issues if domains are created with different casing (e.g., "FILE" vs "file"). PostgreSQL identifiers are case-insensitive by default but stored in lowercase unless quoted. Recommended fix: Use case-insensitive comparison or clarify that domains must be lowercase:
Suggested change
|
||||||
|
|
||||||
| ReleaseSysCache(tup); | ||||||
| return is_match; | ||||||
| } | ||||||
|
|
||||||
| /** | ||||||
| * @brief Check if a type OID is the FILE domain type | ||||||
| * FILE is a domain over BYTEA that represents link-of-bytes semantics. | ||||||
| */ | ||||||
| inline bool is_file_domain_type(Oid typid) | ||||||
| { | ||||||
| return is_domain_type(typid, "file"); | ||||||
| } | ||||||
|
|
||||||
| /** | ||||||
| * @brief Check if a type OID is the IMAGE domain type | ||||||
| * IMAGE is a domain over BYTEA that represents image data. | ||||||
| */ | ||||||
| inline bool is_image_domain_type(Oid typid) | ||||||
| { | ||||||
| return is_domain_type(typid, "image"); | ||||||
| } | ||||||
|
|
||||||
| /** | ||||||
| * @brief Check if a type OID is the VIDEO domain type | ||||||
| * VIDEO is a domain over BYTEA that represents video data. | ||||||
| */ | ||||||
| inline bool is_video_domain_type(Oid typid) | ||||||
| { | ||||||
| return is_domain_type(typid, "video"); | ||||||
| } | ||||||
|
|
||||||
| /// Error handling wrapper | ||||||
| template <typename Func> | ||||||
| inline auto pg_try(Func&& f) -> decltype(f()) | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,189 @@ | ||
| """ | ||
| Test FILE domain type (domain over BYTEA, maps to link-of-bytes in deeplake). | ||
| """ | ||
| import pytest | ||
| import asyncpg | ||
| from test_utils.assertions import Assertions | ||
|
|
||
|
|
||
| @pytest.mark.asyncio | ||
| async def test_file_domain(db_conn: asyncpg.Connection): | ||
| """ | ||
| Test FILE domain type for link-of-bytes data. | ||
|
|
||
| Tests: | ||
| - Creating FILE columns (domain over BYTEA) | ||
| - Inserting binary data as FILE | ||
| - NULL handling with FILE | ||
| - Bulk insert with FILE domain | ||
| - Exact matches on FILE columns | ||
| - octet_length() function on FILE | ||
| - FILE/BYTEA compatibility and casting | ||
| """ | ||
| assertions = Assertions(db_conn) | ||
|
|
||
| try: | ||
| # Create table with FILE domain columns | ||
| await db_conn.execute(""" | ||
| CREATE TABLE file_test ( | ||
| id INT, | ||
| document FILE, | ||
| attachment FILE | ||
| ) USING deeplake | ||
| """) | ||
|
|
||
| # Test basic inserts with FILE domain (explicit cast) | ||
| await db_conn.execute(""" | ||
| INSERT INTO file_test (id, document, attachment) | ||
| VALUES | ||
| (1, '\\x48656c6c6f'::FILE, '\\x576f726c64'::FILE), | ||
| (2, '\\xDEADBEEF'::FILE, '\\xCAFEBABE'::FILE) | ||
| """) | ||
|
|
||
| # Test inserts without explicit cast (BYTEA is implicitly accepted) | ||
| await db_conn.execute(""" | ||
| INSERT INTO file_test (id, document, attachment) | ||
| VALUES | ||
| (3, '\\x00010203', '\\x04050607') | ||
| """) | ||
|
|
||
| # Test NULL handling | ||
| await db_conn.execute(""" | ||
| INSERT INTO file_test (id, document, attachment) | ||
| VALUES | ||
| (4, NULL, '\\x08090A0B'::FILE), | ||
| (5, '\\x0C0D0E0F'::FILE, NULL), | ||
| (6, NULL, NULL) | ||
| """) | ||
|
|
||
| # Test bulk insert with FILE domain | ||
| await db_conn.execute(""" | ||
| INSERT INTO file_test (id, document, attachment) | ||
| SELECT | ||
| i + 10, | ||
| ('\\x' || lpad(to_hex(i), 8, '0'))::FILE, | ||
| ('\\x' || lpad(to_hex(i * 2), 8, '0'))::FILE | ||
| FROM generate_series(1, 5) AS i | ||
| """) | ||
|
|
||
| # Test exact matches for FILE columns | ||
| await assertions.assert_query_row_count( | ||
| 1, | ||
| "SELECT * FROM file_test WHERE document = FROM_HEX('48656c6c6f')" | ||
| ) | ||
|
|
||
| await assertions.assert_query_row_count( | ||
| 1, | ||
| "SELECT * FROM file_test WHERE attachment = FROM_HEX('576f726c64')" | ||
| ) | ||
|
|
||
| # Test NULL filtering | ||
| await assertions.assert_query_row_count( | ||
| 2, | ||
| "SELECT * FROM file_test WHERE document IS NULL" | ||
| ) | ||
|
|
||
| await assertions.assert_query_row_count( | ||
| 2, | ||
| "SELECT * FROM file_test WHERE attachment IS NULL" | ||
| ) | ||
|
|
||
| await assertions.assert_query_row_count( | ||
| 9, | ||
| "SELECT * FROM file_test WHERE document IS NOT NULL" | ||
| ) | ||
|
|
||
| await assertions.assert_query_row_count( | ||
| 9, | ||
| "SELECT * FROM file_test WHERE attachment IS NOT NULL" | ||
| ) | ||
|
|
||
| # Test that FILE behaves like BYTEA - octet_length function should work | ||
| await assertions.assert_query_row_count( | ||
| 1, | ||
| "SELECT * FROM file_test WHERE octet_length(document) = 5 AND id = 1" | ||
| ) | ||
|
|
||
| await assertions.assert_query_row_count( | ||
| 1, | ||
| "SELECT * FROM file_test WHERE octet_length(document) = 4 AND id = 2" | ||
| ) | ||
|
|
||
| # Test that FILE and BYTEA are compatible (can compare/cast) | ||
| await db_conn.execute("SET pg_deeplake.use_deeplake_executor = off") | ||
|
|
||
| await assertions.assert_query_row_count( | ||
| 1, | ||
| "SELECT * FROM file_test WHERE document = FROM_HEX('48656c6c6f')::BYTEA AND id = 1" | ||
| ) | ||
|
|
||
| await assertions.assert_query_row_count( | ||
| 1, | ||
| "SELECT * FROM file_test WHERE document::BYTEA = FROM_HEX('48656c6c6f')::BYTEA AND id = 1" | ||
| ) | ||
|
|
||
| await db_conn.execute("RESET pg_deeplake.use_deeplake_executor") | ||
|
|
||
| print("✓ Test passed: FILE domain type works correctly") | ||
|
|
||
| finally: | ||
| # Cleanup | ||
| await db_conn.execute("DROP TABLE IF EXISTS file_test CASCADE") | ||
| await db_conn.execute("RESET pg_deeplake.use_deeplake_executor") | ||
|
|
||
|
|
||
| @pytest.mark.asyncio | ||
| async def test_file_domain_alter_table(db_conn: asyncpg.Connection): | ||
| """ | ||
| Test ALTER TABLE ADD COLUMN with FILE domain type. | ||
| """ | ||
| assertions = Assertions(db_conn) | ||
|
|
||
| try: | ||
| # Create initial table | ||
| await db_conn.execute(""" | ||
| CREATE TABLE file_alter_test ( | ||
| id INT, | ||
| name TEXT | ||
| ) USING deeplake | ||
| """) | ||
|
|
||
| # Insert initial data | ||
| await db_conn.execute(""" | ||
| INSERT INTO file_alter_test (id, name) | ||
| VALUES (1, 'row1'), (2, 'row2'), (3, 'row3') | ||
| """) | ||
|
|
||
| # Add FILE column | ||
| await db_conn.execute(""" | ||
| ALTER TABLE file_alter_test ADD COLUMN file_col FILE | ||
| """) | ||
|
|
||
| # Update with FILE data | ||
| await db_conn.execute(""" | ||
| UPDATE file_alter_test SET file_col = '\\x48656c6c6f'::FILE WHERE id = 1 | ||
| """) | ||
| await db_conn.execute(""" | ||
| UPDATE file_alter_test SET file_col = '\\x576f726c64'::FILE WHERE id = 2 | ||
| """) | ||
|
|
||
| # Verify | ||
| await assertions.assert_query_row_count( | ||
| 2, | ||
| "SELECT * FROM file_alter_test WHERE file_col IS NOT NULL" | ||
| ) | ||
|
|
||
| await assertions.assert_query_row_count( | ||
| 1, | ||
| "SELECT * FROM file_alter_test WHERE file_col IS NULL" | ||
| ) | ||
|
|
||
| await assertions.assert_query_row_count( | ||
| 1, | ||
| "SELECT * FROM file_alter_test WHERE file_col = FROM_HEX('48656c6c6f')" | ||
| ) | ||
|
|
||
| print("✓ Test passed: ALTER TABLE ADD COLUMN with FILE domain works correctly") | ||
|
|
||
| finally: | ||
| await db_conn.execute("DROP TABLE IF EXISTS file_alter_test CASCADE") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Domain type check is applied to wrong OID - should check base type, not the domain OID itself. Domain types inherit from BYTEA (base type), but
attr->atttypidholds the domain OID. When checkingis_file_domain_type(attr->atttypid), it will match correctly, but the parentcase BYTEAOID:will never trigger for domain types because PostgreSQL doesn't setatttypidto BYTEAOID for domains.Fix: Need to either check the base type of the domain first, or restructure to check domain types before the base type switch. Consider: