
Commit 86f5936

refactor: split builtin_codecs.py into package
Split the 1,286-line builtin_codecs.py into a builtin_codecs/ package with separate modules for each codec:

- blob.py: BlobCodec (Python object serialization)
- hash.py: HashCodec (hash-addressed storage)
- schema.py: SchemaCodec (base class for schema-addressed codecs)
- object.py: ObjectCodec (files/folders)
- attach.py: AttachCodec (file attachments)
- filepath.py: FilepathCodec (references to existing files)
- npy.py: NpyCodec + NpyRef (numpy arrays)

Benefits:

- Each codec is now self-contained and easier to understand
- Easier to maintain and test individual codecs
- Clearer organization for contributors
- No change to public API (same imports work)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 2e24b9e commit 86f5936

File tree

9 files changed: +1329 −1286 lines

src/datajoint/builtin_codecs.py

Lines changed: 0 additions & 1286 deletions
This file was deleted.
src/datajoint/builtin_codecs/__init__.py

Lines changed: 77 additions & 0 deletions

@@ -0,0 +1,77 @@
"""
Built-in DataJoint codecs.

This package defines the standard codecs that ship with DataJoint.
These serve as both useful built-in codecs and as examples for users who
want to create their own custom codecs.

Built-in Codecs:
- ``<blob>``: Serialize Python objects (in-table storage)
- ``<blob@>``: Serialize Python objects (external with hash-addressed dedup)
- ``<attach>``: File attachment (in-table storage)
- ``<attach@>``: File attachment (external with hash-addressed dedup)
- ``<hash@>``: Hash-addressed storage with MD5 deduplication (external only)
- ``<object@>``: Schema-addressed storage for files/folders (external only)
- ``<npy@>``: Store numpy arrays as portable .npy files (external only)
- ``<filepath@store>``: Reference to existing file in store (external only)

Example - Creating a Custom Codec:
    Here's how to define your own codec, modeled after the built-in codecs::

        import datajoint as dj
        import networkx as nx

        class GraphCodec(dj.Codec):
            '''Store NetworkX graphs as edge lists.'''

            name = "graph"  # Use as <graph> in definitions

            def get_dtype(self, is_store: bool) -> str:
                return "<blob>"  # Compose with blob for serialization

            def encode(self, graph, *, key=None, store_name=None):
                # Convert graph to a serializable format
                return {
                    'nodes': list(graph.nodes(data=True)),
                    'edges': list(graph.edges(data=True)),
                }

            def decode(self, stored, *, key=None):
                # Reconstruct graph from stored format
                G = nx.Graph()
                G.add_nodes_from(stored['nodes'])
                G.add_edges_from(stored['edges'])
                return G

            def validate(self, value):
                if not isinstance(value, nx.Graph):
                    raise TypeError(f"Expected nx.Graph, got {type(value).__name__}")

        # Now use in table definitions:
        @schema
        class Networks(dj.Manual):
            definition = '''
            network_id : int
            ---
            topology : <graph>
            '''
"""

from .attach import AttachCodec
from .blob import BlobCodec
from .filepath import FilepathCodec
from .hash import HashCodec
from .npy import NpyCodec, NpyRef
from .object import ObjectCodec
from .schema import SchemaCodec

__all__ = [
    "BlobCodec",
    "HashCodec",
    "SchemaCodec",
    "ObjectCodec",
    "AttachCodec",
    "FilepathCodec",
    "NpyCodec",
    "NpyRef",
]
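
Since the package __init__ above re-exports every codec class, the commit message's claim of an unchanged public API can be sanity-checked with the old import path; a minimal sketch (hypothetical snippet, not part of the diff; assumes datajoint installed from this commit):

    # The same names that were importable from the old flat module
    # resolve through the new package __init__.
    from datajoint.builtin_codecs import AttachCodec, BlobCodec

    print(BlobCodec.name)    # "blob"   -> used as <blob> in definitions
    print(AttachCodec.name)  # "attach" -> used as <attach> in definitions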
src/datajoint/builtin_codecs/attach.py

Lines changed: 136 additions & 0 deletions

@@ -0,0 +1,136 @@
"""
File attachment codec with filename preservation.
"""

from __future__ import annotations

from typing import Any

from ..codecs import Codec


class AttachCodec(Codec):
    """
    File attachment with filename preserved.

    Supports both in-table and in-store storage:
    - ``<attach>``: Stored in database (bytes → LONGBLOB)
    - ``<attach@>``: Stored in object store via ``<hash@>`` with deduplication
    - ``<attach@store>``: Stored in specific named store

    The filename is preserved and the file is extracted to the configured
    download path on fetch.

    Example::

        @schema
        class Documents(dj.Manual):
            definition = '''
            doc_id : int
            ---
            config : <attach>        # in-table (small file in DB)
            dataset : <attach@>      # in-store (default store)
            archive : <attach@cold>  # in-store (specific store)
            '''

        # Insert a file
        table.insert1({'doc_id': 1, 'config': '/path/to/config.json'})

        # Fetch extracts to download_path and returns local path
        local_path = (table & 'doc_id=1').fetch1('config')

    Storage Format (internal):
        The blob contains: ``filename\\0contents``
        - Filename (UTF-8 encoded) + null byte + raw file contents
    """

    name = "attach"

    def get_dtype(self, is_store: bool) -> str:
        """Return bytes for in-table, <hash> for in-store storage."""
        return "<hash>" if is_store else "bytes"

    def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes:
        """
        Read file and encode as filename + contents.

        Parameters
        ----------
        value : str or Path
            Path to file.
        key : dict, optional
            Primary key values (unused).
        store_name : str, optional
            Unused for internal storage.

        Returns
        -------
        bytes
            Filename (UTF-8) + null byte + file contents.
        """
        from pathlib import Path

        path = Path(value)
        if not path.exists():
            raise FileNotFoundError(f"Attachment file not found: {path}")
        if path.is_dir():
            raise IsADirectoryError(f"<attach> does not support directories: {path}")

        filename = path.name
        contents = path.read_bytes()
        return filename.encode("utf-8") + b"\x00" + contents

    def decode(self, stored: bytes, *, key: dict | None = None) -> str:
        """
        Extract file to download path and return local path.

        Parameters
        ----------
        stored : bytes
            Blob containing filename + null + contents.
        key : dict, optional
            Primary key values (unused).

        Returns
        -------
        str
            Path to extracted file.
        """
        from pathlib import Path

        from ..settings import config

        # Split on first null byte
        null_pos = stored.index(b"\x00")
        filename = stored[:null_pos].decode("utf-8")
        contents = stored[null_pos + 1 :]

        # Write to download path
        download_path = Path(config.get("download_path", "."))
        download_path.mkdir(parents=True, exist_ok=True)
        local_path = download_path / filename

        # Handle filename collision - if file exists with different content, add suffix
        if local_path.exists():
            existing_contents = local_path.read_bytes()
            if existing_contents != contents:
                # Find unique filename
                stem = local_path.stem
                suffix = local_path.suffix
                counter = 1
                while local_path.exists() and local_path.read_bytes() != contents:
                    local_path = download_path / f"{stem}_{counter}{suffix}"
                    counter += 1

        # Only write if file doesn't exist or has different content
        if not local_path.exists():
            local_path.write_bytes(contents)

        return str(local_path)

    def validate(self, value: Any) -> None:
        """Validate that value is a valid file path."""
        from pathlib import Path

        if not isinstance(value, (str, Path)):
            raise TypeError(f"<attach> expects a file path, got {type(value).__name__}")
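The ``filename\0contents`` framing used by encode/decode above is plain stdlib byte handling; a minimal round-trip sketch of just that framing (helper names are hypothetical, not part of the diff):

    from pathlib import Path

    def pack_attachment(path: str) -> bytes:
        # Mirrors AttachCodec.encode: UTF-8 filename, null byte, raw contents.
        p = Path(path)
        return p.name.encode("utf-8") + b"\x00" + p.read_bytes()

    def unpack_attachment(blob: bytes) -> tuple[str, bytes]:
        # Mirrors AttachCodec.decode's split on the first null byte.
        sep = blob.index(b"\x00")
        return blob[:sep].decode("utf-8"), blob[sep + 1:]

    # Round trip:
    # Path("config.json").write_text('{"a": 1}')
    # name, data = unpack_attachment(pack_attachment("config.json"))
    # assert name == "config.json" and data == b'{"a": 1}'

Splitting on the first null byte is unambiguous because null bytes cannot appear in filenames on common filesystems.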
src/datajoint/builtin_codecs/blob.py

Lines changed: 61 additions & 0 deletions

@@ -0,0 +1,61 @@
"""
Blob codec for Python object serialization.
"""

from __future__ import annotations

from typing import Any

from ..codecs import Codec


class BlobCodec(Codec):
    """
    Serialize Python objects using DataJoint's blob format.

    The ``<blob>`` codec handles serialization of arbitrary Python objects
    including NumPy arrays, dictionaries, lists, datetime objects, and UUIDs.

    Supports both in-table and in-store storage:
    - ``<blob>``: Stored in database (bytes → LONGBLOB)
    - ``<blob@>``: Stored in object store via ``<hash@>`` with deduplication
    - ``<blob@store>``: Stored in specific named store

    Format Features:
    - Protocol headers (``mYm`` for MATLAB-compatible, ``dj0`` for Python-native)
    - Optional zlib compression for data > 1KB
    - Support for nested structures

    Example::

        @schema
        class ProcessedData(dj.Manual):
            definition = '''
            data_id : int
            ---
            small_result : <blob>   # in-table (in database)
            large_result : <blob@>  # in-store (default store)
            archive : <blob@cold>   # in-store (specific store)
            '''

        # Insert any serializable object
        table.insert1({'data_id': 1, 'small_result': {'scores': [0.9, 0.8]}})
    """

    name = "blob"

    def get_dtype(self, is_store: bool) -> str:
        """Return bytes for in-table, <hash> for in-store storage."""
        return "<hash>" if is_store else "bytes"

    def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes:
        """Serialize a Python object to DataJoint's blob format."""
        from .. import blob

        return blob.pack(value, compress=True)

    def decode(self, stored: bytes, *, key: dict | None = None) -> Any:
        """Deserialize blob bytes back to a Python object."""
        from .. import blob

        return blob.unpack(stored, squeeze=False)
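
BlobCodec delegates to the existing blob module with exactly the calls shown above; a round-trip sketch using those same calls (assumes a datajoint installation; the payload is illustrative):

    from datajoint import blob

    payload = {'scores': [0.9, 0.8], 'label': 'run-1'}

    packed = blob.pack(payload, compress=True)     # bytes with protocol header,
                                                   # zlib-compressed if > 1KB
    restored = blob.unpack(packed, squeeze=False)  # back to a Python object

    print(type(packed), len(packed))
    print(restored)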
