Skip to content

Commit 70fb567

Browse files
Move built-in AttributeTypes to separate builtin_types.py module
- Create builtin_types.py with DJBlobType, ContentType, XBlobType
- Types serve as examples for users creating custom types
- Module docstring includes example of defining a custom GraphType
- Add get_adapter() function to attribute_type.py for compatibility
- Auto-register built-in types via import at module load

Co-authored-by: dimitri-yatsenko <dimitri@datajoint.com>
1 parent 3c4608f commit 70fb567

File tree

2 files changed

+260
-276
lines changed

2 files changed

+260
-276
lines changed

src/datajoint/attribute_type.py

Lines changed: 21 additions & 276 deletions
Original file line number | Diff line number | Diff line change
@@ -463,290 +463,35 @@ def resolve_dtype(
463463
return dtype, chain, store_name
464464

465465

466-
# =============================================================================
467-
# Built-in Attribute Types
468-
# =============================================================================
469-
470-
471-
class DJBlobType(AttributeType):
472-
"""
473-
Built-in type for DataJoint's native serialization format.
474-
475-
This type handles serialization of arbitrary Python objects (including NumPy arrays,
476-
dictionaries, lists, etc.) using DataJoint's binary blob format. The format includes:
477-
478-
- Protocol headers (``mYm`` for MATLAB-compatible, ``dj0`` for Python-native)
479-
- Optional compression (zlib)
480-
- Support for NumPy arrays, datetime objects, UUIDs, and nested structures
481-
482-
The ``<djblob>`` type is the explicit way to specify DataJoint's serialization.
483-
It stores data in a MySQL ``LONGBLOB`` column.
484-
485-
Example:
486-
@schema
487-
class ProcessedData(dj.Manual):
488-
definition = '''
489-
data_id : int
490-
---
491-
results : <djblob> # Serialized Python objects
492-
raw_bytes : longblob # Raw bytes (no serialization)
493-
'''
494-
495-
Note:
496-
Plain ``longblob`` columns store and return raw bytes without serialization.
497-
Use ``<djblob>`` when you need automatic serialization of Python objects.
498-
Existing schemas using implicit blob serialization should migrate to ``<djblob>``
499-
using ``dj.migrate.migrate_blob_columns()``.
500-
"""
501-
502-
type_name = "djblob"
503-
dtype = "longblob"
504-
505-
def encode(self, value: Any, *, key: dict | None = None) -> bytes:
506-
"""
507-
Serialize a Python object to DataJoint's blob format.
508-
509-
Args:
510-
value: Any serializable Python object (dict, list, numpy array, etc.)
511-
key: Primary key values (unused for blob serialization).
512-
513-
Returns:
514-
Serialized bytes with protocol header and optional compression.
515-
"""
516-
from . import blob
517-
518-
return blob.pack(value, compress=True)
519-
520-
def decode(self, stored: bytes, *, key: dict | None = None) -> Any:
521-
"""
522-
Deserialize DataJoint blob format back to a Python object.
523-
524-
Args:
525-
stored: Serialized blob bytes.
526-
key: Primary key values (unused for blob serialization).
527-
528-
Returns:
529-
The deserialized Python object.
530-
"""
531-
from . import blob
532-
533-
return blob.unpack(stored, squeeze=False)
534-
535-
536-
class DJBlobExternalType(AttributeType):
537-
"""
538-
Built-in type for externally-stored DataJoint blobs.
539-
540-
Similar to ``<djblob>`` but stores data in external blob storage instead
541-
of inline in the database. Useful for large objects.
542-
543-
The store name is specified when defining the column type.
544-
545-
Example:
546-
@schema
547-
class LargeData(dj.Manual):
548-
definition = '''
549-
data_id : int
550-
---
551-
large_array : blob@mystore # External storage with auto-serialization
552-
'''
466+
def get_adapter(context: dict | None, adapter_name: str) -> tuple[AttributeType, str | None]:
553467
"""
468+
Get an attribute type by name.
554469
555-
# Note: This type isn't directly usable via <djblob_external> syntax
556-
# It's used internally when blob@store syntax is detected
557-
type_name = "djblob_external"
558-
dtype = "blob@store" # Placeholder - actual store is determined at declaration time
559-
560-
def encode(self, value: Any, *, key: dict | None = None) -> bytes:
561-
"""Serialize a Python object to DataJoint's blob format."""
562-
from . import blob
563-
564-
return blob.pack(value, compress=True)
565-
566-
def decode(self, stored: bytes, *, key: dict | None = None) -> Any:
567-
"""Deserialize DataJoint blob format back to a Python object."""
568-
from . import blob
569-
570-
return blob.unpack(stored, squeeze=False)
571-
572-
573-
class ContentType(AttributeType):
574-
"""
575-
Built-in type for content-addressed storage with deduplication.
576-
577-
The ``<content>`` type stores data using content-addressed storage. Data is
578-
identified by its SHA256 hash and stored in a hierarchical directory structure.
579-
Duplicate content is automatically deduplicated - storing the same bytes twice
580-
will only create one copy in storage.
581-
582-
The database column stores JSON metadata including the content hash, store name,
583-
and size. The actual content is stored in external storage.
584-
585-
This type is primarily used as a building block for other types like ``<xblob>``
586-
and ``<xattach>``, but can also be used directly for raw binary content.
587-
588-
Example:
589-
@schema
590-
class RawContent(dj.Manual):
591-
definition = '''
592-
content_id : int
593-
---
594-
data : <content@mystore> # Content-addressed storage
595-
'''
596-
597-
# Insert raw bytes
598-
table.insert1({'content_id': 1, 'data': b'raw binary content'})
599-
600-
# Fetch returns the original bytes
601-
data = (table & 'content_id=1').fetch1('data')
602-
assert data == b'raw binary content'
603-
604-
Storage Structure:
605-
Content is stored at: ``_content/{hash[:2]}/{hash[2:4]}/{hash}``
606-
This hierarchical structure prevents too many files in a single directory.
607-
608-
Note:
609-
The store parameter is required for ``<content>`` unless a default store
610-
is configured. Use ``<content@store_name>`` syntax to specify the store.
611-
"""
470+
This is a compatibility function used by heading and declare modules.
612471
613-
type_name = "content"
614-
dtype = "json"
615-
616-
def encode(self, value: bytes, *, key: dict | None = None, store_name: str | None = None) -> dict:
617-
"""
618-
Store content and return metadata.
619-
620-
Computes the SHA256 hash of the content and stores it using content-addressed
621-
storage. If content with the same hash already exists, it is not re-uploaded
622-
(deduplication).
623-
624-
Args:
625-
value: Raw bytes to store.
626-
key: Primary key values (unused for content storage).
627-
store_name: Store to use. If None, uses default store from config.
628-
629-
Returns:
630-
Metadata dict with keys: hash, store, size
631-
632-
Raises:
633-
TypeError: If value is not bytes.
634-
"""
635-
if not isinstance(value, bytes):
636-
raise TypeError(f"<content> type expects bytes, got {type(value).__name__}")
637-
638-
from .content_registry import put_content
639-
640-
return put_content(value, store_name=store_name)
641-
642-
def decode(self, stored: dict, *, key: dict | None = None) -> bytes:
643-
"""
644-
Retrieve content by its hash.
645-
646-
Args:
647-
stored: Metadata dict with 'hash' and optionally 'store' keys.
648-
key: Primary key values (unused for content retrieval).
649-
650-
Returns:
651-
The original bytes.
652-
653-
Raises:
654-
MissingExternalFile: If content is not found.
655-
DataJointError: If hash verification fails.
656-
"""
657-
from .content_registry import get_content
658-
659-
content_hash = stored["hash"]
660-
store_name = stored.get("store")
661-
return get_content(content_hash, store_name=store_name)
662-
663-
def validate(self, value: Any) -> None:
664-
"""Validate that value is bytes."""
665-
if not isinstance(value, bytes):
666-
raise TypeError(f"<content> type expects bytes, got {type(value).__name__}")
667-
668-
669-
class XBlobType(AttributeType):
670-
"""
671-
Built-in type for externally-stored serialized blobs with deduplication.
672-
673-
The ``<xblob>`` type combines DataJoint's blob serialization with content-addressed
674-
storage. Objects are serialized using the djblob format, then stored externally
675-
using content-addressed storage for automatic deduplication.
676-
677-
This type is ideal for large objects (NumPy arrays, pandas DataFrames, etc.)
678-
that may be duplicated across multiple rows.
679-
680-
Example:
681-
@schema
682-
class LargeArrays(dj.Manual):
683-
definition = '''
684-
array_id : int
685-
---
686-
data : <xblob@mystore> # External serialized blob with deduplication
687-
'''
688-
689-
# Insert NumPy array
690-
import numpy as np
691-
table.insert1({'array_id': 1, 'data': np.random.rand(1000, 1000)})
472+
Args:
473+
context: Ignored (legacy parameter, kept for API compatibility).
474+
adapter_name: The type name, with or without angle brackets.
475+
May include store parameter (e.g., "<xblob@cold>").
692476
693-
# Fetch returns the original array
694-
data = (table & 'array_id=1').fetch1('data')
477+
Returns:
478+
Tuple of (AttributeType instance, store_name or None).
695479
696-
Note:
697-
- For internal storage (in database), use ``<djblob>``
698-
- For external storage without serialization, use ``<content>``
699-
- The store parameter is required unless a default store is configured
480+
Raises:
481+
DataJointError: If the type is not found.
700482
"""
483+
type_name, store_name = parse_type_spec(adapter_name)
701484

702-
type_name = "xblob"
703-
dtype = "<content>" # Composition: uses ContentType for storage
704-
705-
def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes:
706-
"""
707-
Serialize a Python object to bytes.
708-
709-
The object is serialized using DataJoint's blob format. The resulting
710-
bytes are then passed to the underlying ``<content>`` type for storage.
711-
712-
Args:
713-
value: Any serializable Python object.
714-
key: Primary key values (unused).
715-
store_name: Store parameter (passed through to content storage).
716-
717-
Returns:
718-
Serialized bytes (will be stored by ContentType).
719-
"""
720-
from . import blob
721-
722-
return blob.pack(value, compress=True)
723-
724-
def decode(self, stored: bytes, *, key: dict | None = None) -> Any:
725-
"""
726-
Deserialize bytes back to a Python object.
727-
728-
Args:
729-
stored: Serialized bytes retrieved from content storage.
730-
key: Primary key values (unused).
485+
if is_type_registered(type_name):
486+
return get_type(type_name), store_name
731487

732-
Returns:
733-
The deserialized Python object.
734-
"""
735-
from . import blob
488+
raise DataJointError(f"Attribute type <{type_name}> is not registered. " "Use @dj.register_type to register custom types.")
736489

737-
return blob.unpack(stored, squeeze=False)
738-
739-
740-
def _register_builtin_types() -> None:
741-
"""
742-
Register DataJoint's built-in attribute types.
743-
744-
Called automatically during module initialization.
745-
"""
746-
register_type(DJBlobType)
747-
register_type(ContentType)
748-
register_type(XBlobType)
749490

491+
# =============================================================================
492+
# Auto-register built-in types
493+
# =============================================================================
750494

751-
# Register built-in types when module is loaded
752-
_register_builtin_types()
495+
# Import builtin_types module to register built-in types (DJBlobType, ContentType, etc.)
496+
# This import has a side effect: it registers the types via @register_type decorators
497+
from . import builtin_types as _builtin_types # noqa: F401, E402

0 commit comments

Comments
 (0)